Cleaned up.

pull/34/head
s8613 2025-04-29 19:44:56 +02:00
parent 7f69ac8414
commit 9136610047
14 changed files with 226 additions and 221 deletions

View File

@@ -1,14 +1,16 @@
from flask import Flask, request, jsonify
import os
import json
from pathlib import Path
from ocr_pdf_service.ocr_runner import run_ocr_and_extract
from exxeta_service.exxetaGPT_api import extract_with_exxeta
from spacy_service.spacy import extract_with_spacy
from merge_validate_service.merge_validate import merge_and_validate_entities
from exxeta_service.exxeta_client import extract_with_exxeta
from spacy_service.spacy_extractor import extract_with_spacy
from merge_validate_service.validator import merge_and_validate_entities
app = Flask(__name__)
UPLOAD_FOLDER = "pitchbooks"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@app.route("/")
def home():
@@ -16,24 +18,25 @@ def home():
@app.route("/upload", methods=["POST"])
def upload():
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files.get("file")
    file = request.files["file"]
    if not file or file.filename == "":
        return jsonify({"error": "No file provided"}), 400
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
    filepath = os.path.join(UPLOAD_FOLDER, file.filename)
    filepath = UPLOAD_FOLDER / file.filename
    file.save(filepath)
    try:
        result_ocr = run_ocr_and_extract(filepath)
        with open(result_ocr["json_path"], encoding="utf-8") as f:
        # Step 1: Run OCR
        ocr_result = run_ocr_and_extract(filepath)
        with open(ocr_result["json_path"], encoding="utf-8") as f:
            pitchbook_pages = json.load(f)
        # Step 2: Extract with both engines
        extract_with_exxeta(pitchbook_pages)
        extract_with_spacy(pitchbook_pages)
        # Step 3: Merge and validate results
        merge_and_validate_entities()
    except Exception as e:
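
For readability, a minimal sketch of the upload endpoint as it plausibly stands after this change. The project imports are the ones shown in the diff above; the success and error responses after the try block are assumptions, since the hunk is cut off at the except line.

from flask import Flask, request, jsonify
from pathlib import Path
import json

from ocr_pdf_service.ocr_runner import run_ocr_and_extract
from exxeta_service.exxeta_client import extract_with_exxeta
from spacy_service.spacy_extractor import extract_with_spacy
from merge_validate_service.validator import merge_and_validate_entities

app = Flask(__name__)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

@app.route("/upload", methods=["POST"])
def upload():
    file = request.files.get("file")
    if not file or file.filename == "":
        return jsonify({"error": "No file provided"}), 400

    filepath = UPLOAD_FOLDER / file.filename
    file.save(filepath)

    try:
        # Step 1: OCR the uploaded pitchbook and load the per-page JSON
        ocr_result = run_ocr_and_extract(filepath)
        with open(ocr_result["json_path"], encoding="utf-8") as f:
            pitchbook_pages = json.load(f)
        # Step 2: run both extraction engines over the OCR output
        extract_with_exxeta(pitchbook_pages)
        extract_with_spacy(pitchbook_pages)
        # Step 3: merge and cross-validate the two result sets
        merged = merge_and_validate_entities()
    except Exception as e:
        return jsonify({"error": str(e)}), 500  # assumed error response; the hunk is truncated here
    return jsonify(merged), 200                 # assumed success response

In practice file.filename would typically be passed through werkzeug.utils.secure_filename before being joined onto UPLOAD_FOLDER; the handler in this diff saves the file under its raw client-supplied name.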

View File

@@ -52,7 +52,12 @@
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
@@ -76,7 +81,12 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19
},
{

View File

@@ -1,71 +0,0 @@
import json
from pathlib import Path


def normalize_entity(entity_str):
    if not entity_str:
        return ""
    normalized = entity_str.replace('\n', ' ')
    normalized = ''.join(normalized.lower().split())
    return normalized


def merge_and_validate_entities():
    base_dir = Path(__file__).resolve().parent
    spacy_results_path = base_dir / ".." / "spacy_service" / "output" / "spacy-results.json"
    exxeta_results_path = base_dir / ".." / "exxeta_service" / "output" / "exxeta-results.json"
    output_path = base_dir / "output" / "merged-results.json"

    with open(spacy_results_path, "r", encoding="utf-8") as f:
        spacy_data = json.load(f)
    with open(exxeta_results_path, "r", encoding="utf-8") as f:
        exxeta_data = json.load(f)

    merged = []
    seen = set()

    for s in spacy_data:
        s_entity_norm = normalize_entity(s["entity"])
        s_page = s["page"]
        found = False
        for e in exxeta_data:
            e_entity_norm = normalize_entity(e["entity"])
            e_page = e["page"]
            if (s["label"] == e["label"] and
                    s_entity_norm == e_entity_norm and
                    s_page == e_page):
                merged.append({
                    "label": s["label"],
                    "entity": s["entity"],
                    "page": s_page,
                    "status": "validated"
                })
                seen.add((e["entity"], e_page))
                found = True
                break
        if not found:
            merged.append({
                "label": s["label"],
                "entity": s["entity"],
                "page": s_page,
                "status": "single-source",
                "source": "spacy"
            })

    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append({
                "label": e["label"],
                "entity": e["entity"],
                "page": e["page"],
                "status": "single-source",
                "source": "exxeta"
            })

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)
    return merged
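
Both this removed implementation and its rewrite later in the diff scan the full Exxeta list once per spaCy hit. For larger result sets, the same match can be found in a single pass with a dictionary keyed on the fields being compared; a minimal sketch, assuming it sits in the same module as normalize_entity (build_index is a hypothetical name, not part of this commit):

def build_index(exxeta_data):
    # Hypothetical helper: index the Exxeta hits by the same key the merge compares on,
    # so every spaCy hit needs only one dictionary lookup instead of a full scan.
    index = {}
    for e in exxeta_data:
        key = (e["label"], normalize_entity(e["entity"]), e["page"])
        index.setdefault(key, e)
    return index

# Lookup then becomes:
# match = build_index(exxeta_data).get((s["label"], normalize_entity(s["entity"]), s["page"]))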

View File

@@ -11,6 +11,13 @@
"page": 7,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 9,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
@@ -124,6 +131,55 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
@@ -166,6 +222,41 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "countries, giving",
@@ -173,6 +264,13 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
@@ -199,6 +297,13 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core or",
@@ -215,134 +320,8 @@
},
{
"label": "RISIKOPROFIL",
"entity": "kgCO,e",
"page": 30,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "C,\n",
"page": 32,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "HSBC, RTE",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 9,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 19,
"entity": "Core Offices, Core + assets",
"page": 27,
"status": "single-source",
"source": "exxeta"
},
@@ -355,17 +334,17 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26,
"entity": "kgCO,e",
"page": 30,
"status": "single-source",
"source": "exxeta"
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core Offices, Core + assets",
"page": 27,
"entity": "C,\n",
"page": 32,
"status": "single-source",
"source": "exxeta"
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
@@ -388,6 +367,20 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
@@ -416,6 +409,27 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "HSBC, RTE",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core WALB (",
"page": 37,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",

View File

@@ -0,0 +1,49 @@
from pathlib import Path
import json


def normalize_entity(entity_str):
    # Normalize for comparison: drop newlines, lowercase, strip all whitespace
    return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""


def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def merge_and_validate_entities():
    base = Path(__file__).resolve().parent.parent
    spacy_path = base / "spacy_service/output/spacy-results.json"
    exxeta_path = base / "exxeta_service/output/exxeta-results.json"
    output_path = base / "merge_validate_service/output/merged-results.json"

    spacy_data = load_json(spacy_path)
    exxeta_data = load_json(exxeta_path)

    merged = []
    seen = set()

    # spaCy hits: "validated" if Exxeta found the same label/entity/page, else spaCy-only
    for s in spacy_data:
        s_norm = normalize_entity(s["entity"])
        s_page = s["page"]
        match = next(
            (e for e in exxeta_data
             if e["label"] == s["label"] and
             normalize_entity(e["entity"]) == s_norm and
             e["page"] == s_page),
            None
        )
        if match:
            merged.append({**s, "status": "validated"})
            seen.add((match["entity"], match["page"]))
        else:
            merged.append({**s, "status": "single-source", "source": "spacy"})

    # Exxeta hits that were never matched come in as Exxeta-only entries
    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append({**e, "status": "single-source", "source": "exxeta"})

    merged.sort(key=lambda x: (x.get("page", 0), x.get("label", "")))

    with output_path.open("w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2)
    return merged
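
As a quick illustration of what the matcher treats as the same entity, a small example, assuming the module is importable as merge_validate_service.validator (the path used by the application imports earlier in this diff):

from merge_validate_service.validator import normalize_entity

# Label and page must match exactly; the entity strings only need to agree after
# normalization (newlines removed, lowercased, all whitespace stripped).
assert normalize_entity("Core /\nCore+") == "core/core+"
assert normalize_entity("core / core+") == "core/core+"

# A spaCy hit ("Core /\nCore+", page 19) and an Exxeta hit ("core / core+", page 19)
# with label "RISIKOPROFIL" would therefore be merged into one "validated" entry.

Note that unlike the removed implementation, this version writes the output with json.dump's default ensure_ascii=True and does not create the output directory first, so merge_validate_service/output/ must already exist when it runs.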