Added the arc1 validation process for spaCy results and Exxeta checks, and also the option for both to check each other.

pull/34/head
s8613 2025-04-26 15:36:02 +02:00
parent fc9302cbbb
commit 14b66a31b8
5 changed files with 714 additions and 0 deletions

View File

@ -0,0 +1,3 @@
import os

# SECURITY: credentials must never be committed to version control. The key is
# read from the EXXETA_API_KEY environment variable instead. The token that was
# previously hard-coded on this line is exposed in the repository history and
# should be revoked and re-issued.
# An empty string is used when the variable is unset so that importing this
# module never fails; callers will presumably get an authentication error from
# the API in that case.
EXXETA_API_KEY = os.environ.get("EXXETA_API_KEY", "")
# Base URL of the Exxeta-hosted OpenAI endpoint; request paths are appended to it.
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
# Identifier of the chat model.
MODEL_ID = "gpt-35-turbo"

View File

@ -0,0 +1,97 @@
from config import EXXETA_API_KEY, EXXETA_BASE_URL
import requests
import json
from pathlib import Path
# Model identifier: sent as "model" in the request payload and embedded in the
# endpoint URL built by validate_entity_with_exxeta().
MODEL = "gpt-35-turbo"
# JSON file read by load_spacy_entities(); run() reads the "label", "entity"
# and "page" keys of every element.
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
# JSON file read by load_pitchbook_pages(); get_page_text() looks up the
# "page" and "text" keys of every element.
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
# File that run() writes the validation results to.
# NOTE: all three paths are relative, so they resolve against the current
# working directory of the process, not against this file's location.
OUTPUT_PATH = "mcp_spacy_validated_result.json"
def load_spacy_entities():
    """Read the file at SPACY_ENTITIES_PATH as UTF-8 and return its parsed JSON."""
    raw_json = Path(SPACY_ENTITIES_PATH).read_text(encoding="utf-8")
    return json.loads(raw_json)
def load_pitchbook_pages():
    """Read the file at OCR_PITCHBOOK_PATH as UTF-8 and return its parsed JSON."""
    ocr_file = Path(OCR_PITCHBOOK_PATH)
    with ocr_file.open("r", encoding="utf-8") as handle:
        pages = json.load(handle)
    return pages
def get_page_text(pages, page_number):
    """Return the text of the first entry in *pages* whose "page" equals *page_number*.

    An empty string is returned when no entry matches, or when the matching
    entry has no "text" key.
    """
    matching_texts = (
        entry.get("text", "")
        for entry in pages
        if entry.get("page") == page_number
    )
    # next() stops at the first match, exactly like an early return in a loop.
    return next(matching_texts, "")
def normalize_entity(entity):
    """Collapse every run of whitespace in *entity* into a single space.

    Leading and trailing whitespace is removed. Line breaks count as
    whitespace, so multi-line entities become a single line.

    Args:
        entity: The raw entity string, or None.

    Returns:
        The normalised string; "" when *entity* is None or empty.
    """
    # Entities may be missing in the input (callers read them with .get()),
    # so treat None like an empty string instead of raising AttributeError.
    if not entity:
        return ""
    # str.split() without arguments already splits on any whitespace,
    # including "\n", so no separate newline replacement is needed.
    return " ".join(entity.split())
def validate_entity_with_exxeta(entity, page_num, text, timeout=30):
    """Ask the chat model whether *entity* occurs in the OCR text of a page.

    A chat-completion request is POSTed to
    ``{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions`` and the reply is
    interpreted as a boolean.

    Args:
        entity: The (normalised) phrase to look for.
        page_num: Page number; only used in the prompt and in log output.
        text: OCR text of that page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        True if the model answered "true" unambiguously. False if it answered
        anything else, if *entity* or *text* is empty, or if the request or
        the parsing of its response failed (a warning is printed in that case).
    """
    # Nothing can be found in an empty page (e.g. the page number is missing
    # from the OCR output) and an empty phrase is meaningless, so skip the
    # API round trip instead of letting the model guess.
    if not entity or not text:
        return False

    prompt = (
        f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
        f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
        f"Ziel-Formulierung:\n"
        f"\"{entity}\"\n\n"
        f"Validierungsregeln:\n"
        f"- Groß- und Kleinschreibung ignorieren.\n"
        f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
        f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
        f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
        f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
        f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
        f"OCR-Text auf Seite {page_num}:\n{text}"
    )
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {EXXETA_API_KEY}",
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.0,
    }
    url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"].strip().lower()
    except (
        requests.RequestException,  # connection problems, timeouts, HTTP error status
        ValueError,                 # response body is not valid JSON
        KeyError, IndexError, TypeError,  # response JSON has an unexpected shape
        AttributeError,             # "content" is null
    ) as e:
        # Best effort: a failed validation is reported and counted as "not found".
        print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
        return False
    # A reply that mentions both words (e.g. "false, not true") is ambiguous
    # and is treated conservatively as no hit.
    return "true" in content and "false" not in content
def run():
    """Validate every spaCy entity against its OCR page and save the verdicts.

    Loads the spaCy entities and the OCR pages, asks
    validate_entity_with_exxeta() whether each entity occurs on its page, and
    writes one result object per input entity (keys "label", "entity", "page",
    "validated") to OUTPUT_PATH as JSON.
    """
    spacy_entities = load_spacy_entities()
    pitchbook_pages = load_pitchbook_pages()

    # Verdicts keyed by (normalised entity, page). The input contains the same
    # entity on the same page several times; asking once per distinct pair
    # avoids redundant API calls and guarantees that identical inputs cannot
    # end up with contradictory verdicts in the output.
    # NOTE: a verdict of False caused by a failed request is cached as well.
    verdicts = {}
    validated_results = []
    for entity_data in spacy_entities:
        raw_entity = entity_data.get("entity")
        page = entity_data.get("page")
        # `or ""` keeps a missing/None entity from reaching str methods.
        entity = normalize_entity(raw_entity or "")
        cache_key = (entity, page)
        if cache_key not in verdicts:
            page_text = get_page_text(pitchbook_pages, page)
            verdicts[cache_key] = validate_entity_with_exxeta(entity, page, page_text)
        validated_results.append({
            "label": entity_data.get("label"),
            "entity": raw_entity,  # keep the original spelling in the output
            "page": page,
            "validated": verdicts[cache_key],
        })

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(validated_results, f, indent=2, ensure_ascii=False)
    print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    run()

View File

@ -0,0 +1,200 @@
[
{
"label": "RISIKOPROFIL",
"entity": "Core and Core+",
"page": 4,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core, core+, value-added",
"page": 7,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 10,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 10,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core/Core+",
"page": 10,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "UK, DE, BE, NL, LU,",
"page": 10,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Core / Core +",
"page": 12,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core\n/ core+",
"page": 12,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core",
"page": 12,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Term / core+",
"page": 12,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 12,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "6,4 6,4",
"page": 13,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
"page": 14,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore",
"page": 14,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Country /",
"page": 15,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Core\nCore\nCore\nCore\nCore\nCore",
"page": 15,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "countries, giving",
"page": 18,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 20,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 20,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "D, and",
"page": 21,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "UK, DE, BE, NL, LU,",
"page": 26,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "core or",
"page": 27,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Core +",
"page": 27,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "kgCO,e",
"page": 30,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "C,",
"page": 32,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "KfW, Dwp",
"page": 35,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Bank,",
"page": 35,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 37,
"validated": true
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38,
"validated": false
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 38,
"validated": true
}
]

View File

@ -0,0 +1,100 @@
import json
from pathlib import Path
# Maps each KPI to the list of services associated with it. Keys must be
# lowercase because validate_kpi() compares them with the lowercased "label"
# of every entity.
# NOTE(review): run() iterates this mapping but only uses the keys; the
# service lists are not consulted anywhere in this file.
KPI_SERVICE_MAP = {
"risikoprofil": ["spacy", "exxeta"],
# Entries below are currently disabled:
# "fondsname": ["exxeta"],
# "fundvolume": ["spacy", "exxeta"],
}
# JSON inputs read by run(). Both paths are relative, so they resolve against
# the current working directory of the process.
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
def load_spacy_entities(path):
    """Return the parsed JSON content of the UTF-8 encoded file at *path*."""
    return json.loads(Path(path).read_text(encoding="utf-8"))
def load_exxeta_entities(path):
    """Return the parsed JSON content of the UTF-8 encoded file at *path*."""
    with Path(path).open("r", encoding="utf-8") as source:
        entities = json.load(source)
    return entities
def normalize(text):
    """Return *text* in a canonical form for comparing entities.

    The text is lowercased and ALL whitespace is removed, so that spellings
    which differ only in case, spacing or line breaks compare equal
    ("Core / Core +", "core\\n/ core+" and "core/core+" all become
    "core/core+").

    Args:
        text: The entity string, or None.

    Returns:
        The canonical string; "" when *text* is None or empty.
    """
    if not text:
        return ""
    # split() without arguments splits on every kind of whitespace (spaces,
    # tabs, line breaks), not just on the ASCII space. Removing only " " left
    # line breaks inside entities, so multi-line spellings never matched
    # their single-line equivalents.
    return "".join(text.lower().split())
def validate_kpi(kpi, spacy_entities, exxeta_entities):
    """Cross-check the spaCy and Exxeta entities of one KPI, page by page.

    Only entities whose lowercased "label" equals *kpi* are considered. Two
    entities match when they are on the same page and their normalize()d
    "entity" values are equal.

    Returns:
        A list of dicts with the keys "kpi", "entity", "page" and
        "validation_status". Pages are processed in sorted order; for each
        page, every spaCy entity comes first ("validated" if an Exxeta entity
        matches it, otherwise "spacy-only"), followed by every Exxeta entity
        that no spaCy entity matches ("exxeta-only").
    """
    def select_kpi(entities):
        # Keep only the entities labelled with the requested KPI.
        return [item for item in entities if item.get("label", "").lower() == kpi]

    def group_by_page(items):
        # page -> entities on that page, in input order.
        grouped = {}
        for item in items:
            grouped.setdefault(item["page"], []).append(item)
        return grouped

    def record(item, page, status):
        return {
            "kpi": kpi,
            "entity": item["entity"],
            "page": page,
            "validation_status": status,
        }

    spacy_selected = select_kpi(spacy_entities)
    exxeta_selected = select_kpi(exxeta_entities)
    spacy_pages = group_by_page(spacy_selected)
    exxeta_pages = group_by_page(exxeta_selected)

    outcome = []
    for page in sorted(spacy_pages.keys() | exxeta_pages.keys()):
        from_spacy = spacy_pages.get(page, [])
        from_exxeta = exxeta_pages.get(page, [])
        # Normalised spellings present on this page, per source.
        spacy_keys = {normalize(item["entity"]) for item in from_spacy}
        exxeta_keys = {normalize(item["entity"]) for item in from_exxeta}

        for item in from_spacy:
            confirmed = normalize(item["entity"]) in exxeta_keys
            outcome.append(record(item, page, "validated" if confirmed else "spacy-only"))

        outcome.extend(
            record(item, page, "exxeta-only")
            for item in from_exxeta
            if normalize(item["entity"]) not in spacy_keys
        )
    return outcome
def save_results(results, filename):
    """Write *results* to *filename* as UTF-8 JSON (2-space indent, non-ASCII kept as-is)."""
    destination = Path(filename)
    with destination.open("w", encoding="utf-8") as sink:
        sink.write(json.dumps(results, indent=2, ensure_ascii=False))
def run():
    """Cross-validate all configured KPIs and write the combined result file.

    Loads both entity files, runs validate_kpi() for every key of
    KPI_SERVICE_MAP, concatenates the results in that order and saves them to
    "mcp_validated_result.json".
    """
    from_spacy = load_spacy_entities(SPACY_ENTITIES_PATH)
    from_exxeta = load_exxeta_entities(EXXETA_ENTITIES_PATH)
    # Only the KPI names are needed here; the service lists are not used.
    combined = [
        row
        for kpi in KPI_SERVICE_MAP
        for row in validate_kpi(kpi, from_spacy, from_exxeta)
    ]
    save_results(combined, "mcp_validated_result.json")
    print("✅ Validation complete! Output: mcp_validated_result.json")


if __name__ == "__main__":
    run()

View File

@ -0,0 +1,314 @@
[
{
"kpi": "risikoprofil",
"entity": "Core and Core+",
"page": 4,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "core, core+, value-added",
"page": 7,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 9,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core/Core+",
"page": 10,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "core/core+",
"page": 10,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "Core/Core+",
"page": 10,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "UK, DE, BE, NL, LU,",
"page": 10,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core / Core +",
"page": 12,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "core\n/ core+",
"page": 12,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "core",
"page": 12,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Term / core+",
"page": 12,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "core/core+",
"page": 12,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "6,4 6,4",
"page": 13,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Country /",
"page": 14,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore",
"page": 14,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 14,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 14,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 14,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 14,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 14,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Country /",
"page": 15,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core\nCore\nCore\nCore\nCore\nCore",
"page": 15,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 15,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 15,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 15,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 15,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 15,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "countries, giving",
"page": 18,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 19,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "core/core+",
"page": 20,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "core/core+",
"page": 20,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "D, and",
"page": 21,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "UK, DE, BE, NL, LU,",
"page": 26,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 26,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "core or",
"page": 27,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core +",
"page": 27,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core Offices, Core + assets",
"page": 27,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "kgCO,e",
"page": 30,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "C,",
"page": 32,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core, Core+",
"page": 33,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "KfW, Dwp",
"page": 35,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Bank,",
"page": 35,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 35,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 35,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 36,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 36,
"validation_status": "spacy-only"
},
{
"kpi": "risikoprofil",
"entity": "Core Parking",
"page": 36,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core Parking",
"page": 36,
"validation_status": "exxeta-only"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 37,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 37,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 38,
"validation_status": "validated"
},
{
"kpi": "risikoprofil",
"entity": "Core",
"page": 38,
"validation_status": "validated"
}
]