First working attempt at merging and validating arc2 with spaCy and Exxeta.

pull/34/head
s8613 2025-04-25 21:15:33 +02:00
parent 3f71189f66
commit 5945122fb0
7 changed files with 804 additions and 0 deletions

View File

@ -0,0 +1,3 @@
import os

# SECURITY: the API key is a credential and must not live in version control.
# It is read from the EXXETA_API_KEY environment variable instead; when the
# variable is unset the key is an empty string and API calls will be rejected.
# NOTE(review): the token that was previously committed here should be treated
# as leaked and be revoked/rotated.
EXXETA_API_KEY = os.environ.get("EXXETA_API_KEY", "")

# Base URL of the Exxeta OpenAI-compatible endpoint.
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"

# Identifier of the chat model / deployment to use.
MODEL_ID = "gpt-35-turbo"

View File

@ -0,0 +1,66 @@
from config import EXXETA_API_KEY, EXXETA_BASE_URL
import requests
import json
# Deployment name used both in the request URL and in the request payload.
MODEL = "gpt-35-turbo"


def _build_prompt(page_num, text):
    """Return the extraction prompt for the text of one page."""
    return (
        f"Bitte extrahiere alle Vorkommen von Risikoprofilen wie \"Core\", \"Core+\", "
        f"\"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" "
        f"auf **Seite {page_num}** im folgenden Text.\n\n"
        f"Liefere das Ergebnis NUR als valides JSON-Array:\n"
        f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core\", \"page\": {page_num}}}]\n\n"
        f"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array zurück: []\n\n"
        f"Keine ESG-Profile oder Carbon-Ziele. Nur Risikoprofilierungen. Keine Kommentare oder Text außerhalb des JSON.\n\n"
        f"TEXT:\n{text}"
    )


def _strip_code_fence(content):
    """Remove a surrounding Markdown code fence (with or without "json" tag)."""
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        # An optional language tag directly follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _parse_entities(content):
    """Parse the model answer into a list of entity dicts.

    Raises ValueError when the answer is not text, is not valid JSON, or is
    not a JSON array, so the caller can treat all of these as a failed attempt.
    """
    if not isinstance(content, str):
        raise ValueError("model returned no text content")
    parsed = json.loads(_strip_code_fence(content))
    if not isinstance(parsed, list):
        raise ValueError("model answer is not a JSON array")
    # Only dict items can be consumed as entities; drop anything else.
    return [item for item in parsed if isinstance(item, dict)]


def _request_page_entities(page_num, text, timeout):
    """Send one page to the chat-completions endpoint and return its entities."""
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {EXXETA_API_KEY}"
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
            {"role": "user", "content": _build_prompt(page_num, text)}
        ],
        "temperature": 0.0
    }
    url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
    # Without a timeout a stalled connection would block the whole run.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return _parse_entities(response.json()["choices"][0]["message"]["content"])


def extract_risikoprofil_from_exxeta(pages_json, *, timeout=60, max_attempts=2):
    """Extract risk-profile entities from every page via the Exxeta chat API.

    Args:
        pages_json: Iterable of dicts; the "page" and "text" keys of each dict
            are read. Pages whose text is missing or blank are skipped without
            an API call.
        timeout: Seconds passed to ``requests.post`` as its timeout.
        max_attempts: How many times one page is tried before it is given up.

    Returns:
        A flat list with the entity dicts returned by the model for all pages,
        in page order. Pages that fail on every attempt contribute nothing;
        each failed attempt is reported on stdout.
    """
    results = []
    for page_data in pages_json:
        page_num = page_data.get("page")
        # `or ""` also covers an explicit null text value.
        text = (page_data.get("text") or "").strip()
        if not text:
            continue
        for attempt in range(max_attempts):
            try:
                page_results = _request_page_entities(page_num, text, timeout)
            except (requests.RequestException, ValueError, KeyError,
                    IndexError, TypeError) as e:
                # Transport errors, HTTP errors, malformed JSON and an
                # unexpected response shape all count as a failed attempt.
                print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}")
            else:
                results.extend(page_results)
                break
    return results

View File

@ -0,0 +1,202 @@
[
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 1
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 2
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 4
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 10
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 12
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core/core+",
"page": 20
},
{
"label": "RISIKOPROFIL",
"entity": "Core/core+",
"page": 20
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 26
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 26
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Opportunistisch",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 35
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 35
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38
}
]

View File

@ -0,0 +1,59 @@
def normalize_entity(entity_str):
    """Return *entity_str* lower-cased with all whitespace removed.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if entity_str:
        # split() without a separator breaks on every whitespace run,
        # newlines included, so joining the pieces drops all whitespace.
        return "".join(entity_str.lower().split())
    return ""
def merge_and_validate_entities(spacy_data, exxeta_data):
    """Merge spaCy and Exxeta entities and mark which ones both sources agree on.

    Two entities match when their label, their page and their normalized
    entity text (see ``normalize_entity``) are equal.

    Args:
        spacy_data: List of dicts with "label", "entity" and "page" keys.
        exxeta_data: List of dicts with the same keys.

    Returns:
        A new list of dicts. Every spaCy entity appears once, with status
        "validated" when a matching Exxeta entity exists, otherwise with
        status "single-source" and source "spacy". Exxeta entities that were
        not matched by any spaCy entity follow with status "single-source"
        and source "exxeta". The spaCy spelling is kept for validated entries.
    """
    def match_key(item):
        return (item["label"], normalize_entity(item["entity"]), item["page"])

    # Computed once so the spaCy loop is a set lookup instead of a nested scan.
    exxeta_keys = [match_key(e) for e in exxeta_data]
    available = set(exxeta_keys)

    # Keys are tracked in normalized form (and include the label) so that
    # spelling variants of a validated entity are not emitted again as
    # Exxeta-only, and entities with a different label are not dropped.
    confirmed = set()
    merged = []

    for s in spacy_data:
        key = match_key(s)
        record = {
            "label": s["label"],
            "entity": s["entity"],
            "page": s["page"],
        }
        if key in available:
            record["status"] = "validated"
            confirmed.add(key)
        else:
            record["status"] = "single-source"
            record["source"] = "spacy"
        merged.append(record)

    for e, key in zip(exxeta_data, exxeta_keys):
        if key not in confirmed:
            merged.append({
                "label": e["label"],
                "entity": e["entity"],
                "page": e["page"],
                "status": "single-source",
                "source": "exxeta"
            })
    return merged

View File

@ -0,0 +1,37 @@
from pathlib import Path
import json
from spacy_extract import load_spacy_entities
from exxeta_api import extract_risikoprofil_from_exxeta
from merge_logic import merge_and_validate_entities
# JSON file with the spaCy entities; passed to load_spacy_entities() in run().
# Both paths are relative, so they are resolved against the current working
# directory of the process, not against this file's location.
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
# JSON file read by load_pitchbook_pages() and handed to the Exxeta extraction.
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
def load_pitchbook_pages():
    """Read the JSON file at PITCHBOOK_PATH (UTF-8) and return its parsed content."""
    raw = Path(PITCHBOOK_PATH).read_text(encoding="utf-8")
    return json.loads(raw)
def save_json(data, filename):
    """Write *data* to *filename* as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than as \\u escapes.
    An existing file is overwritten.
    """
    with Path(filename).open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
def sort_by_page_number(entities):
    """Return a new list of *entities* ordered by ascending "page" value.

    Entities without a "page" key sort as page 0. The sort is stable, so
    entities on the same page keep their relative order; the input is not
    modified.
    """
    def page_of(entity):
        return entity.get("page", 0)

    ordered = list(entities)
    ordered.sort(key=page_of)
    return ordered
def run():
    """Run the whole pipeline: load inputs, query Exxeta, merge, and save.

    Writes the raw Exxeta entities to "exxeta_result.json" and the merged,
    page-sorted entities to "merged_result.json", then prints a short summary.
    """
    spacy_found = load_spacy_entities(SPACY_PATH)
    pages = load_pitchbook_pages()

    exxeta_found = extract_risikoprofil_from_exxeta(pages)
    # Saved before merging, so the API results are on disk at this point.
    save_json(exxeta_found, "exxeta_result.json")

    combined = sort_by_page_number(
        merge_and_validate_entities(spacy_found, exxeta_found)
    )
    save_json(combined, "merged_result.json")

    summary = (
        "\n✅ All done! Files written:",
        "- exxeta_result.json",
        "- merged_result.json",
        f"- Total entities in merged result: {len(combined)}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    run()

View File

@ -0,0 +1,430 @@
[
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 1,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 2,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core and Core+",
"page": 4,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 4,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core, core+, value-added",
"page": 7,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 7,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 7,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 7,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 10,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 10,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 10,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "UK, DE, BE, NL, LU,",
"page": 10,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core / Core +",
"page": 12,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "core\n/ core+",
"page": 12,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "core",
"page": 12,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Term / core+",
"page": 12,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 12,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "6,4 6,4",
"page": 13,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
"page": 14,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore",
"page": 14,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 14,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
"page": 15,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core\nCore\nCore\nCore\nCore\nCore",
"page": 15,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "countries, giving",
"page": 18,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 20,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 20,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "D, and",
"page": 21,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "UK, DE, BE, NL, LU,",
"page": 26,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core or",
"page": 27,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core +",
"page": 27,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 27,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Opportunistisch",
"page": 27,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "kgCO,e",
"page": 30,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "C,",
"page": 32,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 35,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 35,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38,
"status": "validated"
}
]

View File

@ -0,0 +1,7 @@
import json
from pathlib import Path
def load_spacy_entities(path):
    """Read the UTF-8 JSON file at *path* and return its parsed content."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)