Cleaned up.

pull/34/head
s8613 2025-04-29 19:44:56 +02:00
parent 7f69ac8414
commit 9136610047
14 changed files with 226 additions and 221 deletions

View File

@@ -1,14 +1,16 @@
from flask import Flask, request, jsonify
import os
import json
from pathlib import Path
from ocr_pdf_service.ocr_runner import run_ocr_and_extract
from exxeta_service.exxetaGPT_api import extract_with_exxeta
from spacy_service.spacy import extract_with_spacy
from merge_validate_service.merge_validate import merge_and_validate_entities
from exxeta_service.exxeta_client import extract_with_exxeta
from spacy_service.spacy_extractor import extract_with_spacy
from merge_validate_service.validator import merge_and_validate_entities
app = Flask(__name__)
UPLOAD_FOLDER = "pitchbooks"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@app.route("/")
def home():
@@ -16,24 +18,25 @@ def home():
@app.route("/upload", methods=["POST"])
def upload():
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files.get("file")
    file = request.files["file"]
    if not file or file.filename == "":
        return jsonify({"error": "No file provided"}), 400
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
    filepath = os.path.join(UPLOAD_FOLDER, file.filename)
    filepath = UPLOAD_FOLDER / file.filename
    file.save(filepath)
    try:
        result_ocr = run_ocr_and_extract(filepath)
        with open(result_ocr["json_path"], encoding="utf-8") as f:
        # Step 1: Run OCR
        ocr_result = run_ocr_and_extract(filepath)
        with open(ocr_result["json_path"], encoding="utf-8") as f:
            pitchbook_pages = json.load(f)
        # Step 2: Extract with both engines
        extract_with_exxeta(pitchbook_pages)
        extract_with_spacy(pitchbook_pages)
        # Step 3: Merge and validate results
        merge_and_validate_entities()
    except Exception as e:
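
For readability, a minimal sketch of the upload endpoint as it plausibly stands after this change. The project imports are the ones shown in the diff above; the success and error responses after the try block are assumptions, since the hunk is cut off at the except line.

from flask import Flask, request, jsonify
from pathlib import Path
import json

from ocr_pdf_service.ocr_runner import run_ocr_and_extract
from exxeta_service.exxeta_client import extract_with_exxeta
from spacy_service.spacy_extractor import extract_with_spacy
from merge_validate_service.validator import merge_and_validate_entities

app = Flask(__name__)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

@app.route("/upload", methods=["POST"])
def upload():
    file = request.files.get("file")
    if not file or file.filename == "":
        return jsonify({"error": "No file provided"}), 400

    filepath = UPLOAD_FOLDER / file.filename
    file.save(filepath)

    try:
        # Step 1: OCR the uploaded pitchbook and load the per-page JSON
        ocr_result = run_ocr_and_extract(filepath)
        with open(ocr_result["json_path"], encoding="utf-8") as f:
            pitchbook_pages = json.load(f)
        # Step 2: run both extraction engines over the OCR output
        extract_with_exxeta(pitchbook_pages)
        extract_with_spacy(pitchbook_pages)
        # Step 3: merge and cross-validate the two result sets
        merged = merge_and_validate_entities()
    except Exception as e:
        return jsonify({"error": str(e)}), 500  # assumed error response; the hunk is truncated here
    return jsonify(merged), 200                 # assumed success response

In practice file.filename would typically be passed through werkzeug.utils.secure_filename before being joined onto UPLOAD_FOLDER; the handler in this diff saves the file under its raw client-supplied name.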

View File

@@ -52,7 +52,12 @@
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
@@ -76,7 +81,12 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19
},
{

View File

@@ -1,71 +0,0 @@
import json
from pathlib import Path


def normalize_entity(entity_str):
    if not entity_str:
        return ""
    normalized = entity_str.replace('\n', ' ')
    normalized = ''.join(normalized.lower().split())
    return normalized


def merge_and_validate_entities():
    base_dir = Path(__file__).resolve().parent
    spacy_results_path = base_dir / ".." / "spacy_service" / "output" / "spacy-results.json"
    exxeta_results_path = base_dir / ".." / "exxeta_service" / "output" / "exxeta-results.json"
    output_path = base_dir / "output" / "merged-results.json"

    with open(spacy_results_path, "r", encoding="utf-8") as f:
        spacy_data = json.load(f)
    with open(exxeta_results_path, "r", encoding="utf-8") as f:
        exxeta_data = json.load(f)

    merged = []
    seen = set()

    for s in spacy_data:
        s_entity_norm = normalize_entity(s["entity"])
        s_page = s["page"]
        found = False
        for e in exxeta_data:
            e_entity_norm = normalize_entity(e["entity"])
            e_page = e["page"]
            if (s["label"] == e["label"] and
                    s_entity_norm == e_entity_norm and
                    s_page == e_page):
                merged.append({
                    "label": s["label"],
                    "entity": s["entity"],
                    "page": s_page,
                    "status": "validated"
                })
                seen.add((e["entity"], e_page))
                found = True
                break
        if not found:
            merged.append({
                "label": s["label"],
                "entity": s["entity"],
                "page": s_page,
                "status": "single-source",
                "source": "spacy"
            })

    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append({
                "label": e["label"],
                "entity": e["entity"],
                "page": e["page"],
                "status": "single-source",
                "source": "exxeta"
            })

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)
    return merged
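
Both this removed implementation and its rewrite later in the diff scan the full Exxeta list once per spaCy hit. For larger result sets, the same match can be found in a single pass with a dictionary keyed on the fields being compared; a minimal sketch, assuming it sits in the same module as normalize_entity (build_index is a hypothetical name, not part of this commit):

def build_index(exxeta_data):
    # Hypothetical helper: index the Exxeta hits by the same key the merge compares on,
    # so every spaCy hit needs only one dictionary lookup instead of a full scan.
    index = {}
    for e in exxeta_data:
        key = (e["label"], normalize_entity(e["entity"]), e["page"])
        index.setdefault(key, e)
    return index

# Lookup then becomes:
# match = build_index(exxeta_data).get((s["label"], normalize_entity(s["entity"]), s["page"]))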

View File

@@ -11,6 +11,13 @@
"page": 7,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 9,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
@@ -124,6 +131,55 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
@@ -166,6 +222,41 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "countries, giving",
@@ -173,6 +264,13 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
@@ -199,6 +297,13 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core or",
@@ -215,134 +320,8 @@
},
{
"label": "RISIKOPROFIL",
"entity": "kgCO,e",
"page": 30,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "C,\n",
"page": 32,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "HSBC, RTE",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 9,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 19,
"entity": "Core Offices, Core + assets",
"page": 27,
"status": "single-source",
"source": "exxeta"
},
@@ -355,17 +334,17 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26,
"entity": "kgCO,e",
"page": 30,
"status": "single-source",
"source": "exxeta"
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core Offices, Core + assets",
"page": 27,
"entity": "C,\n",
"page": 32,
"status": "single-source",
"source": "exxeta"
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
@@ -388,6 +367,20 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
@@ -416,6 +409,27 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "HSBC, RTE",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",

View File

@@ -0,0 +1,49 @@
from pathlib import Path
import json


def normalize_entity(entity_str):
    # Normalize for comparison: drop newlines, lowercase, strip all whitespace
    return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""


def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def merge_and_validate_entities():
    base = Path(__file__).resolve().parent.parent
    spacy_path = base / "spacy_service/output/spacy-results.json"
    exxeta_path = base / "exxeta_service/output/exxeta-results.json"
    output_path = base / "merge_validate_service/output/merged-results.json"

    spacy_data = load_json(spacy_path)
    exxeta_data = load_json(exxeta_path)

    merged = []
    seen = set()

    # spaCy hits: "validated" if Exxeta found the same label/entity/page, else spaCy-only
    for s in spacy_data:
        s_norm = normalize_entity(s["entity"])
        s_page = s["page"]
        match = next(
            (e for e in exxeta_data
             if e["label"] == s["label"] and
             normalize_entity(e["entity"]) == s_norm and
             e["page"] == s_page),
            None
        )
        if match:
            merged.append({**s, "status": "validated"})
            seen.add((match["entity"], match["page"]))
        else:
            merged.append({**s, "status": "single-source", "source": "spacy"})

    # Exxeta hits that were never matched come in as Exxeta-only entries
    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append({**e, "status": "single-source", "source": "exxeta"})

    merged.sort(key=lambda x: (x.get("page", 0), x.get("label", "")))

    with output_path.open("w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2)
    return merged
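
As a quick illustration of what the matcher treats as the same entity, a small example, assuming the module is importable as merge_validate_service.validator (the path used by the application imports earlier in this diff):

from merge_validate_service.validator import normalize_entity

# Label and page must match exactly; the entity strings only need to agree after
# normalization (newlines removed, lowercased, all whitespace stripped).
assert normalize_entity("Core /\nCore+") == "core/core+"
assert normalize_entity("core / core+") == "core/core+"

# A spaCy hit ("Core /\nCore+", page 19) and an Exxeta hit ("core / core+", page 19)
# with label "RISIKOPROFIL" would therefore be merged into one "validated" entry.

Note that unlike the removed implementation, this version writes the output with json.dump's default ensure_ascii=True and does not create the output directory first, so merge_validate_service/output/ must already exist when it runs.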