Cleaned up.
parent
7f69ac8414
commit
9136610047
|
|
@ -1,14 +1,16 @@
|
|||
from flask import Flask, request, jsonify
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from ocr_pdf_service.ocr_runner import run_ocr_and_extract
|
||||
from exxeta_service.exxetaGPT_api import extract_with_exxeta
|
||||
from spacy_service.spacy import extract_with_spacy
|
||||
from merge_validate_service.merge_validate import merge_and_validate_entities
|
||||
from exxeta_service.exxeta_client import extract_with_exxeta
|
||||
from spacy_service.spacy_extractor import extract_with_spacy
|
||||
from merge_validate_service.validator import merge_and_validate_entities
|
||||
|
||||
app = Flask(__name__)
|
||||
UPLOAD_FOLDER = "pitchbooks"
|
||||
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
||||
UPLOAD_FOLDER = Path("pitchbooks")
|
||||
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@app.route("/")
|
||||
def home():
|
||||
|
|
@ -16,24 +18,25 @@ def home():
|
|||
|
||||
@app.route("/upload", methods=["POST"])
|
||||
def upload():
|
||||
if "file" not in request.files:
|
||||
return jsonify({"error": "No file part"}), 400
|
||||
file = request.files.get("file")
|
||||
|
||||
file = request.files["file"]
|
||||
if not file or file.filename == "":
|
||||
return jsonify({"error": "No file provided"}), 400
|
||||
|
||||
if file.filename == "":
|
||||
return jsonify({"error": "No selected file"}), 400
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, file.filename)
|
||||
filepath = UPLOAD_FOLDER / file.filename
|
||||
file.save(filepath)
|
||||
|
||||
try:
|
||||
result_ocr = run_ocr_and_extract(filepath)
|
||||
with open(result_ocr["json_path"], encoding="utf-8") as f:
|
||||
# Step 1: Run OCR
|
||||
ocr_result = run_ocr_and_extract(filepath)
|
||||
with open(ocr_result["json_path"], encoding="utf-8") as f:
|
||||
pitchbook_pages = json.load(f)
|
||||
|
||||
# Step 2: Extract with both engines
|
||||
extract_with_exxeta(pitchbook_pages)
|
||||
extract_with_spacy(pitchbook_pages)
|
||||
|
||||
# Step 3: Merge and validate results
|
||||
merge_and_validate_entities()
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -52,7 +52,12 @@
|
|||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15
|
||||
"page": 14
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
|
|
@ -76,7 +81,12 @@
|
|||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core/Core+",
|
||||
"entity": "Core",
|
||||
"page": 15
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 19
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def normalize_entity(entity_str):
    """Return a whitespace- and case-insensitive comparison key for an entity.

    The text is lower-cased and every whitespace character (spaces, tabs,
    newlines) is removed, so ``"Core /\\nCore+"`` and ``"core/core+"`` map
    to the same key.

    Args:
        entity_str: Raw entity text; may be ``None`` or empty.

    Returns:
        The normalised string, or ``""`` when ``entity_str`` is falsy.
    """
    if not entity_str:
        return ""
    # str.split() with no separator already splits on newlines as well as
    # spaces, so a separate replace('\n', ' ') pass is unnecessary.
    return "".join(entity_str.lower().split())
|
||||
|
||||
def merge_and_validate_entities():
    """Merge the spaCy and Exxeta extraction results into one list.

    Reads ``../spacy_service/output/spacy-results.json`` and
    ``../exxeta_service/output/exxeta-results.json`` (relative to this
    file's directory). Two entries are considered the same finding when
    their label, page and normalised entity text all agree.

    * A spaCy entry with an Exxeta counterpart is emitted with
      ``"status": "validated"``.
    * Any other spaCy entry is emitted as ``"single-source"`` / ``"spacy"``.
    * Exxeta entries that were not validated are appended afterwards as
      ``"single-source"`` / ``"exxeta"``.

    The merged list is written to ``output/merged-results.json`` next to
    this file (the directory is created when missing).

    Returns:
        The merged list of entity dicts.

    Raises:
        FileNotFoundError: If either input file does not exist.
        json.JSONDecodeError: If either input file is not valid JSON.
        KeyError: If an entry lacks ``label``, ``entity`` or ``page``.
    """
    base_dir = Path(__file__).resolve().parent
    spacy_results_path = base_dir / ".." / "spacy_service" / "output" / "spacy-results.json"
    exxeta_results_path = base_dir / ".." / "exxeta_service" / "output" / "exxeta-results.json"
    output_path = base_dir / "output" / "merged-results.json"

    with open(spacy_results_path, "r", encoding="utf-8") as f:
        spacy_data = json.load(f)
    with open(exxeta_results_path, "r", encoding="utf-8") as f:
        exxeta_data = json.load(f)

    def match_key(item):
        # The same triple decides both "is this validated?" and "was this
        # Exxeta row already covered?", so the two checks cannot disagree.
        return (item["label"], normalize_entity(item["entity"]), item["page"])

    # Build the lookup once instead of rescanning exxeta_data per spaCy row.
    exxeta_keys = {match_key(e) for e in exxeta_data}

    merged = []
    validated_keys = set()

    for s in spacy_data:
        key = match_key(s)
        entry = {
            "label": s["label"],
            "entity": s["entity"],
            "page": s["page"],
        }
        if key in exxeta_keys:
            entry["status"] = "validated"
            validated_keys.add(key)
        else:
            entry["status"] = "single-source"
            entry["source"] = "spacy"
        merged.append(entry)

    for e in exxeta_data:
        # Keyed on label + normalised entity + page: a row with a different
        # label is no longer swallowed, and a spelling variant of a
        # validated entity is no longer reported as single-source.
        if match_key(e) not in validated_keys:
            merged.append({
                "label": e["label"],
                "entity": e["entity"],
                "page": e["page"],
                "status": "single-source",
                "source": "exxeta",
            })

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    return merged
|
||||
|
|
@ -11,6 +11,13 @@
|
|||
"page": 7,
|
||||
"status": "validated"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 9,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core/Core+",
|
||||
|
|
@ -124,6 +131,55 @@
|
|||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Country /",
|
||||
|
|
@ -166,6 +222,41 @@
|
|||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "countries, giving",
|
||||
|
|
@ -173,6 +264,13 @@
|
|||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 19,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "core/core+",
|
||||
|
|
@ -199,6 +297,13 @@
|
|||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 26,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "core or",
|
||||
|
|
@ -215,134 +320,8 @@
|
|||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "kgCO,e",
|
||||
"page": 30,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "C,\n",
|
||||
"page": 32,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "KfW, Dwp",
|
||||
"page": 35,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Bank,",
|
||||
"page": 35,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "HSBC, RTE",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core WALB (",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core WALB (",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 9,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 14,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 15,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core/Core+",
|
||||
"page": 19,
|
||||
"entity": "Core Offices, Core + assets",
|
||||
"page": 27,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
|
|
@ -355,17 +334,17 @@
|
|||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
"page": 26,
|
||||
"entity": "kgCO,e",
|
||||
"page": 30,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core Offices, Core + assets",
|
||||
"page": 27,
|
||||
"entity": "C,\n",
|
||||
"page": 32,
|
||||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
|
|
@ -388,6 +367,20 @@
|
|||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "KfW, Dwp",
|
||||
"page": 35,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Bank,",
|
||||
"page": 35,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
|
|
@ -416,6 +409,27 @@
|
|||
"status": "single-source",
|
||||
"source": "exxeta"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "HSBC, RTE",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core WALB (",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core WALB (",
|
||||
"page": 37,
|
||||
"status": "single-source",
|
||||
"source": "spacy"
|
||||
},
|
||||
{
|
||||
"label": "RISIKOPROFIL",
|
||||
"entity": "Core",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
from pathlib import Path
|
||||
import json
|
||||
|
||||
def normalize_entity(entity_str):
    """Build the key used to compare entity strings across extractors.

    Lower-cases the text and strips out all whitespace, so differences in
    casing, spacing or line breaks do not prevent a match.

    Args:
        entity_str: Raw entity text; may be ``None`` or empty.

    Returns:
        The normalised string, or ``""`` for falsy input.
    """
    if not entity_str:
        return ""
    # split() without arguments treats '\n' like any other whitespace, so
    # no prior replace('\n', ' ') is required.
    return "".join(entity_str.lower().split())
|
||||
|
||||
def load_json(path: "Path | str"):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        path: Location of the file. A ``pathlib.Path``, a plain string or
            any ``os.PathLike`` is accepted.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Path() is a no-op for Path instances and lets callers pass strings.
    with Path(path).open("r", encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
def merge_and_validate_entities():
    """Merge the spaCy and Exxeta extraction results and flag agreement.

    Reads ``spacy_service/output/spacy-results.json`` and
    ``exxeta_service/output/exxeta-results.json`` below the directory that
    contains this file's package. Two entries are the same finding when
    label, page and normalised entity text all agree.

    * spaCy entries confirmed by Exxeta get ``"status": "validated"``.
    * Remaining spaCy entries get ``"single-source"`` / ``"spacy"``.
    * Exxeta entries that were not validated get ``"single-source"`` /
      ``"exxeta"``.

    The result is sorted by page, then label, and written to
    ``merge_validate_service/output/merged-results.json``.

    Returns:
        The merged, sorted list of entity dicts.

    Raises:
        OSError: If an input file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If an entry lacks ``label``, ``entity`` or ``page``.
    """
    base = Path(__file__).resolve().parent.parent
    spacy_path = base / "spacy_service/output/spacy-results.json"
    exxeta_path = base / "exxeta_service/output/exxeta-results.json"
    output_path = base / "merge_validate_service/output/merged-results.json"

    spacy_data = load_json(spacy_path)
    exxeta_data = load_json(exxeta_path)

    def match_key(item):
        # One key for both the match test and the "already covered" test,
        # so the two can never disagree about what counts as the same entry.
        return (item["label"], normalize_entity(item["entity"]), item["page"])

    # Built once; avoids rescanning exxeta_data for every spaCy entry.
    exxeta_keys = {match_key(e) for e in exxeta_data}

    merged = []
    validated_keys = set()

    for s in spacy_data:
        key = match_key(s)
        if key in exxeta_keys:
            merged.append({**s, "status": "validated"})
            validated_keys.add(key)
        else:
            merged.append({**s, "status": "single-source", "source": "spacy"})

    for e in exxeta_data:
        # Includes the label and uses the normalised entity: an Exxeta row
        # with another label is kept, and a spelling variant of a validated
        # entity is not reported again as single-source.
        if match_key(e) not in validated_keys:
            merged.append({**e, "status": "single-source", "source": "exxeta"})

    merged.sort(key=lambda x: (x.get("page", 0), x.get("label", "")))

    # The output directory is not guaranteed to exist on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps umlauts readable in the result file.
        json.dump(merged, f, indent=2, ensure_ascii=False)

    return merged
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue