def _build_prompt(page_num, text):
    """Return the German extraction prompt for one page of text."""
    return (
        f"Bitte extrahiere alle Vorkommen von Risikoprofilen wie \"Core\", \"Core+\", "
        f"\"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" "
        f"auf **Seite {page_num}** im folgenden Text.\n\n"
        f"Liefere das Ergebnis NUR als valides JSON-Array:\n"
        f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core\", \"page\": {page_num}}}]\n\n"
        f"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array zurück: []\n\n"
        f"Keine ESG-Profile oder Carbon-Ziele. Nur Risikoprofilierungen. Keine Kommentare oder Text außerhalb des JSON.\n\n"
        f"TEXT:\n{text}"
    )


def _strip_code_fence(content):
    """Return *content* without a surrounding Markdown code fence, if present."""
    stripped = content.strip()
    if stripped.startswith("```"):
        stripped = stripped[3:]
        # The opening fence may carry a language tag (```json); drop it too.
        if stripped[:4].lower() == "json":
            stripped = stripped[4:]
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()


def _extract_page(page_num, text, timeout, max_attempts):
    """Ask the chat-completions endpoint for the risk profiles on one page.

    Returns the parsed JSON array, or an empty list when every attempt fails.
    """
    # NOTE(review): EXXETA_API_KEY is imported from config.py, where it is
    # committed as a literal; it should be rotated and read from the
    # environment instead of living in the repository.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {EXXETA_API_KEY}"
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
            {"role": "user", "content": _build_prompt(page_num, text)}
        ],
        "temperature": 0.0
    }
    url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

    for attempt in range(max_attempts):
        try:
            # requests applies no timeout by default; without one a stalled
            # connection would block the whole run.
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()

            content = response.json()["choices"][0]["message"]["content"]
            # content can be None (no message text); treat it as empty so the
            # failure surfaces as a parse error and is retried.
            page_results = json.loads(_strip_code_fence(content or ""))

            if not isinstance(page_results, list):
                # list.extend() on a dict would silently add its keys.
                raise ValueError(
                    f"expected a JSON array, got {type(page_results).__name__}"
                )
            return page_results
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
            print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}")

    return []


def extract_risikoprofil_from_exxeta(pages_json, *, timeout=60, max_attempts=2):
    """Extract risk-profile entities from every page via the Exxeta chat API.

    Args:
        pages_json: Iterable of dicts; each is read with ``.get("page")`` and
            ``.get("text")``.
        timeout: Seconds passed to ``requests.post`` for each request.
        max_attempts: Number of tries per page before the page is given up.

    Returns:
        A flat list with the items of every JSON array the model returned,
        in page order. Pages without text are skipped without an API call;
        pages whose attempts all fail contribute nothing (the failure is
        printed).
    """
    results = []

    for page_data in pages_json:
        page_num = page_data.get("page")
        # "text" may be missing or None; both count as an empty page.
        text = (page_data.get("text") or "").strip()
        if not text:
            continue
        results.extend(_extract_page(page_num, text, timeout, max_attempts))

    return results
"RISIKOPROFIL", + "entity": "Core", + "page": 14 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/core+", + "page": 20 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/core+", + "page": 20 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 24 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 24 + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 24 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 26 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 26 + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 26 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 27 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 27 + }, + { + "label": "RISIKOPROFIL", + "entity": "Opportunistisch", + "page": 27 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 34 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 34 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 35 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 35 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 36 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 37 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 37 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 38 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 38 + } +] \ No newline at end of file diff --git a/prototypes/merge_validate-arc2/merge_logic.py 
def normalize_entity(entity_str):
    """Return *entity_str* lower-cased with all whitespace removed.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not entity_str:
        return ""
    # split() without arguments splits on any whitespace run, newlines included.
    return "".join(entity_str.lower().split())


def _match_key(item):
    """Return the (label, normalized entity, page) tuple used for matching."""
    return (item["label"], normalize_entity(item["entity"]), item["page"])


def merge_and_validate_entities(spacy_data, exxeta_data):
    """Merge spaCy and Exxeta entities, marking those found by both sources.

    Two entities match when label, normalized entity text and page are equal.

    Args:
        spacy_data: List of dicts with "label", "entity" and "page" keys.
        exxeta_data: List of dicts with the same keys.

    Returns:
        A list holding every spaCy entity first (status "validated" when a
        matching Exxeta entity exists, otherwise "single-source" with source
        "spacy"), followed by every Exxeta entity that matched no spaCy
        entity (status "single-source", source "exxeta").

    Raises:
        KeyError: If an item lacks "label", "entity" or "page".
    """
    # Build the lookup once instead of rescanning exxeta_data per spaCy item.
    exxeta_keys = {_match_key(e) for e in exxeta_data}

    merged = []
    validated_keys = set()

    for s in spacy_data:
        key = _match_key(s)
        entry = {
            "label": s["label"],
            "entity": s["entity"],
            "page": s["page"],
        }
        if key in exxeta_keys:
            entry["status"] = "validated"
            validated_keys.add(key)
        else:
            entry["status"] = "single-source"
            entry["source"] = "spacy"
        merged.append(entry)

    # Use the same key as the matching step. Otherwise case/whitespace
    # variants of a validated entity are reported again as single-source,
    # and entities with a different label are dropped.
    for e in exxeta_data:
        if _match_key(e) not in validated_keys:
            merged.append({
                "label": e["label"],
                "entity": e["entity"],
                "page": e["page"],
                "status": "single-source",
                "source": "exxeta",
            })

    return merged
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"


def load_pitchbook_pages():
    """Parse the UTF-8 JSON file at PITCHBOOK_PATH and return its content."""
    with Path(PITCHBOOK_PATH).open("r", encoding="utf-8") as handle:
        pages = json.load(handle)
    return pages


def save_json(data, filename):
    """Write *data* to *filename* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(filename, mode="w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=2)


def sort_by_page_number(entities):
    """Return a new list of *entities* ordered by "page" (missing page -> 0)."""
    def page_of(entity):
        return entity.get("page", 0)

    ordered = list(entities)
    ordered.sort(key=page_of)
    return ordered


def run():
    """Run extraction and merging, then write both result files."""
    spacy_entities = load_spacy_entities(SPACY_PATH)
    exxeta_entities = extract_risikoprofil_from_exxeta(load_pitchbook_pages())
    save_json(exxeta_entities, "exxeta_result.json")

    merged_sorted = sort_by_page_number(
        merge_and_validate_entities(spacy_entities, exxeta_entities)
    )
    save_json(merged_sorted, "merged_result.json")

    summary = (
        "\n✅ All done! Files written:",
        "- exxeta_result.json",
        "- merged_result.json",
        f"- Total entities in merged result: {len(merged_sorted)}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    run()
"RISIKOPROFIL", + "entity": "Core", + "page": 7, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 7, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 7, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/Core+", + "page": 10, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 10, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/Core+", + "page": 10, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "UK, DE, BE, NL, LU,", + "page": 10, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 11, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 11, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 11, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core / Core +", + "page": 12, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "core\n/ core+", + "page": 12, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "core", + "page": 12, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "Term / core+", + "page": 12, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 12, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "6,4 6,4", + "page": 13, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Country /", + "page": 14, + "status": "single-source", + "source": "spacy" + }, 
+ { + "label": "RISIKOPROFIL", + "entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore", + "page": 14, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 14, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Country /", + "page": 15, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core\nCore\nCore\nCore\nCore\nCore", + "page": 15, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "countries, giving", + "page": 18, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 20, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 20, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "D, and", + "page": 21, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + 
"entity": "Core", + "page": 24, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 24, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 24, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "UK, DE, BE, NL, LU,", + "page": 26, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 26, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core+", + "page": 26, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Value-added", + "page": 26, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "core or", + "page": 27, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core +", + "page": 27, + "status": "validated" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 27, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Opportunistisch", + "page": 27, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "kgCO,e", + "page": 30, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "C,", + "page": 32, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 34, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 34, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "KfW, Dwp", + "page": 35, + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Bank,", + "page": 35, + "status": 
def load_spacy_entities(path):
    """Parse the UTF-8 JSON file at *path* and return its content."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)