diff --git a/prototypes/arc2_prototype/app.py b/prototypes/arc2_prototype/app.py index 243e841..97466d8 100644 --- a/prototypes/arc2_prototype/app.py +++ b/prototypes/arc2_prototype/app.py @@ -1,14 +1,16 @@ from flask import Flask, request, jsonify import os import json +from pathlib import Path + from ocr_pdf_service.ocr_runner import run_ocr_and_extract -from exxeta_service.exxetaGPT_api import extract_with_exxeta -from spacy_service.spacy import extract_with_spacy -from merge_validate_service.merge_validate import merge_and_validate_entities +from exxeta_service.exxeta_client import extract_with_exxeta +from spacy_service.spacy_extractor import extract_with_spacy +from merge_validate_service.validator import merge_and_validate_entities app = Flask(__name__) -UPLOAD_FOLDER = "pitchbooks" -os.makedirs(UPLOAD_FOLDER, exist_ok=True) +UPLOAD_FOLDER = Path("pitchbooks") +UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) @app.route("/") def home(): @@ -16,24 +18,25 @@ def home(): @app.route("/upload", methods=["POST"]) def upload(): - if "file" not in request.files: - return jsonify({"error": "No file part"}), 400 + file = request.files.get("file") - file = request.files["file"] + if not file or file.filename == "": + return jsonify({"error": "No file provided"}), 400 - if file.filename == "": - return jsonify({"error": "No selected file"}), 400 - - filepath = os.path.join(UPLOAD_FOLDER, file.filename) + filepath = UPLOAD_FOLDER / file.filename file.save(filepath) try: - result_ocr = run_ocr_and_extract(filepath) - with open(result_ocr["json_path"], encoding="utf-8") as f: + # Step 1: Run OCR + ocr_result = run_ocr_and_extract(filepath) + with open(ocr_result["json_path"], encoding="utf-8") as f: pitchbook_pages = json.load(f) + # Step 2: Extract with both engines extract_with_exxeta(pitchbook_pages) extract_with_spacy(pitchbook_pages) + + # Step 3: Merge and validate results merge_and_validate_entities() except Exception as e: diff --git 
a/prototypes/arc2_prototype/exxeta_service/exxetaGPT_api.py b/prototypes/arc2_prototype/exxeta_service/exxeta_client.py similarity index 100% rename from prototypes/arc2_prototype/exxeta_service/exxetaGPT_api.py rename to prototypes/arc2_prototype/exxeta_service/exxeta_client.py diff --git a/prototypes/arc2_prototype/exxeta_service/output/exxeta-results.json b/prototypes/arc2_prototype/exxeta_service/output/exxeta-results.json index c93c677..53a163e 100644 --- a/prototypes/arc2_prototype/exxeta_service/output/exxeta-results.json +++ b/prototypes/arc2_prototype/exxeta_service/output/exxeta-results.json @@ -52,7 +52,12 @@ { "label": "RISIKOPROFIL", "entity": "Core", - "page": 15 + "page": 14 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14 }, { "label": "RISIKOPROFIL", @@ -76,7 +81,12 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core/Core+", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", "page": 19 }, { diff --git a/prototypes/arc2_prototype/merge_validate_service/merge_validate.py b/prototypes/arc2_prototype/merge_validate_service/merge_validate.py deleted file mode 100644 index 9dd75f1..0000000 --- a/prototypes/arc2_prototype/merge_validate_service/merge_validate.py +++ /dev/null @@ -1,71 +0,0 @@ -import json -from pathlib import Path - -def normalize_entity(entity_str): - if not entity_str: - return "" - normalized = entity_str.replace('\n', ' ') - normalized = ''.join(normalized.lower().split()) - return normalized - -def merge_and_validate_entities(): - base_dir = Path(__file__).resolve().parent - spacy_results_path = base_dir / ".." / "spacy_service" / "output" / "spacy-results.json" - exxeta_results_path = base_dir / ".." 
/ "exxeta_service" / "output" / "exxeta-results.json" - output_path = base_dir / "output" / "merged-results.json" - - with open(spacy_results_path, "r", encoding="utf-8") as f: - spacy_data = json.load(f) - with open(exxeta_results_path, "r", encoding="utf-8") as f: - exxeta_data = json.load(f) - - merged = [] - seen = set() - - for s in spacy_data: - s_entity_norm = normalize_entity(s["entity"]) - s_page = s["page"] - - found = False - for e in exxeta_data: - e_entity_norm = normalize_entity(e["entity"]) - e_page = e["page"] - - if (s["label"] == e["label"] and - s_entity_norm == e_entity_norm and - s_page == e_page): - - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "validated" - }) - seen.add((e["entity"], e_page)) - found = True - break - - if not found: - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "single-source", - "source": "spacy" - }) - - for e in exxeta_data: - if (e["entity"], e["page"]) not in seen: - merged.append({ - "label": e["label"], - "entity": e["entity"], - "page": e["page"], - "status": "single-source", - "source": "exxeta" - }) - - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w", encoding="utf-8") as f: - json.dump(merged, f, indent=2, ensure_ascii=False) - - return merged \ No newline at end of file diff --git a/prototypes/arc2_prototype/merge_validate_service/output/merged-results.json b/prototypes/arc2_prototype/merge_validate_service/output/merged-results.json index 060fe46..eb4f254 100644 --- a/prototypes/arc2_prototype/merge_validate_service/output/merged-results.json +++ b/prototypes/arc2_prototype/merge_validate_service/output/merged-results.json @@ -11,6 +11,13 @@ "page": 7, "status": "validated" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 9, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "Core/Core+", @@ -124,6 +131,55 @@ "status": 
"single-source", "source": "spacy" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "Country /", @@ -166,6 +222,41 @@ "status": "single-source", "source": "spacy" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "countries, giving", @@ -173,6 +264,13 @@ "status": "single-source", "source": "spacy" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 19, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "core/core+", @@ -199,6 +297,13 @@ "status": "single-source", "source": "spacy" }, + { + "label": "RISIKOPROFIL", + 
"entity": "Core", + "page": 26, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "core or", @@ -215,134 +320,8 @@ }, { "label": "RISIKOPROFIL", - "entity": "kgCO,e", - "page": 30, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "C,\n", - "page": 32, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "KfW, Dwp", - "page": 35, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "Bank,", - "page": 35, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "HSBC, RTE", - "page": 37, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core WALB (", - "page": 37, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core WALB (", - "page": 37, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 9, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 14, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 14, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 14, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 14, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 14, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 15, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 15, - "status": "single-source", - "source": "exxeta" - }, - { - "label": 
"RISIKOPROFIL", - "entity": "Core", - "page": 15, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 15, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 15, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core/Core+", - "page": 19, + "entity": "Core Offices, Core + assets", + "page": 27, "status": "single-source", "source": "exxeta" }, @@ -355,17 +334,17 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core", - "page": 26, + "entity": "kgCO,e", + "page": 30, "status": "single-source", - "source": "exxeta" + "source": "spacy" }, { "label": "RISIKOPROFIL", - "entity": "Core Offices, Core + assets", - "page": 27, + "entity": "C,\n", + "page": 32, "status": "single-source", - "source": "exxeta" + "source": "spacy" }, { "label": "RISIKOPROFIL", @@ -388,6 +367,20 @@ "status": "single-source", "source": "exxeta" }, + { + "label": "RISIKOPROFIL", + "entity": "KfW, Dwp", + "page": 35, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Bank,", + "page": 35, + "status": "single-source", + "source": "spacy" + }, { "label": "RISIKOPROFIL", "entity": "Core", @@ -416,6 +409,27 @@ "status": "single-source", "source": "exxeta" }, + { + "label": "RISIKOPROFIL", + "entity": "HSBC, RTE", + "page": 37, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core WALB (", + "page": 37, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core WALB (", + "page": 37, + "status": "single-source", + "source": "spacy" + }, { "label": "RISIKOPROFIL", "entity": "Core", diff --git a/prototypes/arc2_prototype/merge_validate_service/validator.py b/prototypes/arc2_prototype/merge_validate_service/validator.py new file mode 100644 index 0000000..e194021 --- /dev/null +++ 
"""Merge the spaCy and Exxeta extraction results and mark cross-validated entities."""
from pathlib import Path
import json


def normalize_entity(entity_str):
    """Return *entity_str* lower-cased with every whitespace character removed.

    Args:
        entity_str: Raw entity text; may be ``None`` or empty.

    Returns:
        The normalised string, or ``""`` for falsy input.
    """
    if not entity_str:
        return ""
    # str.split() without arguments splits on any whitespace run (newlines
    # included), so no separate newline replacement is needed.
    return "".join(entity_str.lower().split())


def load_json(path: Path):
    """Parse and return the UTF-8 encoded JSON document stored at *path*."""
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def _match_key(record):
    """Return the (label, normalised entity, page) identity used to compare records."""
    return (record["label"], normalize_entity(record["entity"]), record["page"])


def merge_and_validate_entities(spacy_path=None, exxeta_path=None, output_path=None):
    """Merge both extractors' results, write them to disk and return them.

    A spaCy record is marked ``"validated"`` when an Exxeta record with the
    same label, page and normalised entity exists. Every other record is kept
    as ``"single-source"`` together with the name of the extractor that
    produced it. The merged list is sorted by page, then label.

    Args:
        spacy_path: JSON file with the spaCy records. Defaults to
            ``spacy_service/output/spacy-results.json`` next to this package.
        exxeta_path: JSON file with the Exxeta records. Defaults to
            ``exxeta_service/output/exxeta-results.json`` next to this package.
        output_path: Destination of the merged JSON. Defaults to
            ``merge_validate_service/output/merged-results.json``.

    Returns:
        The merged list of record dicts, identical to what was written.

    Raises:
        OSError: If an input file cannot be read or the output cannot be written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If a record lacks ``label``, ``entity`` or ``page``.
    """
    base = Path(__file__).resolve().parent.parent
    spacy_path = (
        Path(spacy_path) if spacy_path is not None
        else base / "spacy_service/output/spacy-results.json"
    )
    exxeta_path = (
        Path(exxeta_path) if exxeta_path is not None
        else base / "exxeta_service/output/exxeta-results.json"
    )
    output_path = (
        Path(output_path) if output_path is not None
        else base / "merge_validate_service/output/merged-results.json"
    )

    spacy_data = load_json(spacy_path)
    exxeta_data = load_json(exxeta_path)

    # Index the Exxeta records once so each spaCy record is an O(1) lookup
    # instead of a scan that re-normalises every Exxeta entity.
    exxeta_keys = {_match_key(e) for e in exxeta_data}

    merged = []
    validated = set()
    for s in spacy_data:
        key = _match_key(s)
        if key in exxeta_keys:
            merged.append({**s, "status": "validated"})
            validated.add(key)
        else:
            merged.append({**s, "status": "single-source", "source": "spacy"})

    # Suppress an Exxeta record only when that exact (label, entity, page)
    # identity was validated; a record that merely shares entity text and page
    # under a different label must not be dropped.
    merged.extend(
        {**e, "status": "single-source", "source": "exxeta"}
        for e in exxeta_data
        if _match_key(e) not in validated
    )

    merged.sort(key=lambda x: (x.get("page", 0), x.get("label", "")))

    # The output directory may not exist yet (e.g. on a fresh checkout).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(merged, f, indent=2, ensure_ascii=False)

    return merged
0000000..4d627a5 Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Pitchbook 2.pdf differ diff --git a/prototypes/arc2_prototype/pitchbooks/Teaser 1 FINAL.pdf b/prototypes/arc2_prototype/pitchbooks/Teaser 1 FINAL.pdf new file mode 100644 index 0000000..9c5c643 Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Teaser 1 FINAL.pdf differ diff --git a/prototypes/arc2_prototype/pitchbooks/Teaser 2 FINAL.pdf b/prototypes/arc2_prototype/pitchbooks/Teaser 2 FINAL.pdf new file mode 100644 index 0000000..f67bfcf Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Teaser 2 FINAL.pdf differ diff --git a/prototypes/arc2_prototype/pitchbooks/Teaser 3 FINAL.pdf b/prototypes/arc2_prototype/pitchbooks/Teaser 3 FINAL.pdf new file mode 100644 index 0000000..7a0e342 Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Teaser 3 FINAL.pdf differ diff --git a/prototypes/arc2_prototype/pitchbooks/Teaser 4 FINAL.pdf b/prototypes/arc2_prototype/pitchbooks/Teaser 4 FINAL.pdf new file mode 100644 index 0000000..cfb8e18 Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Teaser 4 FINAL.pdf differ diff --git a/prototypes/arc2_prototype/pitchbooks/Teaser 5 FINAL.pdf b/prototypes/arc2_prototype/pitchbooks/Teaser 5 FINAL.pdf new file mode 100644 index 0000000..3898d24 Binary files /dev/null and b/prototypes/arc2_prototype/pitchbooks/Teaser 5 FINAL.pdf differ diff --git a/prototypes/arc2_prototype/spacy_service/spacy.py b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py similarity index 100% rename from prototypes/arc2_prototype/spacy_service/spacy.py rename to prototypes/arc2_prototype/spacy_service/spacy_extractor.py