From fc9302cbbb5d1e463c6806704d5308e9660bd8c7 Mon Sep 17 00:00:00 2001 From: s8613 Date: Sat, 26 Apr 2025 13:23:49 +0200 Subject: [PATCH] =?UTF-8?q?Fine=20Tuning=20prompt=20f=C3=BCr=20ExxetaGPT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- prototypes/merge_validate-arc2/exxeta_api.py | 26 ++- .../merge_validate-arc2/exxeta_result.json | 113 +++--------- .../merge_validate-arc2/merged_result.json | 166 +++++------------- 3 files changed, 90 insertions(+), 215 deletions(-) diff --git a/prototypes/merge_validate-arc2/exxeta_api.py b/prototypes/merge_validate-arc2/exxeta_api.py index f3fb069..3d8f8b2 100644 --- a/prototypes/merge_validate-arc2/exxeta_api.py +++ b/prototypes/merge_validate-arc2/exxeta_api.py @@ -6,23 +6,31 @@ MODEL = "gpt-35-turbo" def extract_risikoprofil_from_exxeta(pages_json): results = [] - skipped_pages = [] for page_data in pages_json: page_num = page_data.get("page") text = page_data.get("text", "").strip() + if not text: + continue + prompt = ( - f"Bitte extrahiere alle Vorkommen von Risikoprofilen wie \"Core\", \"Core+\", " - f"\"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" " - f"auf **Seite {page_num}** im folgenden Text.\n\n" - f"Liefere das Ergebnis NUR als valides JSON-Array:\n" - f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core\", \"page\": {page_num}}}]\n\n" - f"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array zurück: []\n\n" - f"Keine ESG-Profile oder Carbon-Ziele. Nur Risikoprofilierungen. Keine Kommentare oder Text außerhalb des JSON.\n\n" - f"TEXT:\n{text}" + "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" + "Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n" + "Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, " + "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" + "Beispiele:\n" + "- \"Core, Core+\" → entity: \"Core, Core+\"\n" + "- \"Core/Core+\" → entity: \"Core/Core+\"\n" + "- \"Core and Core+\" → entity: \"Core and Core+\"\n\n" + "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" + f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n" + "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" + "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" + "TEXT:\n" + text ) + headers = { "Content-Type": "application/json", "Authorization": f"Bearer {EXXETA_API_KEY}" diff --git a/prototypes/merge_validate-arc2/exxeta_result.json b/prototypes/merge_validate-arc2/exxeta_result.json index ba19bfd..438b005 100644 --- a/prototypes/merge_validate-arc2/exxeta_result.json +++ b/prototypes/merge_validate-arc2/exxeta_result.json @@ -1,33 +1,18 @@ [ { "label": "RISIKOPROFIL", - "entity": "Core", - "page": 1 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 2 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", + "entity": "Core and Core+", "page": 4 }, + { + "label": "RISIKOPROFIL", + "entity": "Core, core+, value-added", + "page": 7 + }, { "label": "RISIKOPROFIL", "entity": "Core", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 7 + "page": 9 }, { "label": "RISIKOPROFIL", @@ -36,22 +21,7 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", + "entity": "Core / Core+", "page": 12 }, { @@ -101,29 +71,24 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core/core+", + "entity": "Core", + "page": 15 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 19 + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", "page": 20 }, { "label": "RISIKOPROFIL", - "entity": "Core/core+", + "entity": "core/core+", "page": 20 }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 24 - }, { "label": "RISIKOPROFIL", "entity": "Core", @@ -131,38 +96,13 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", + "entity": "Core Offices, Core + assets", "page": 27 }, { "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Opportunistisch", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 34 + "entity": "Core, Core+", + "page": 33 }, { "label": "RISIKOPROFIL", @@ -176,7 +116,12 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core", + "entity": "Core Parking", + "page": 36 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core Parking", "page": 36 }, { diff --git a/prototypes/merge_validate-arc2/merged_result.json b/prototypes/merge_validate-arc2/merged_result.json index 8ee2784..0342297 100644 --- a/prototypes/merge_validate-arc2/merged_result.json +++ b/prototypes/merge_validate-arc2/merged_result.json @@ -1,57 +1,20 @@ [ - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 1, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 2, - "status": "single-source", - "source": "exxeta" - }, { "label": "RISIKOPROFIL", "entity": "Core and Core+", "page": 4, - "status": "single-source", - "source": "spacy" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 4, - "status": "single-source", - "source": "exxeta" + "status": "validated" }, { "label": "RISIKOPROFIL", "entity": "core, core+, value-added", "page": 7, - "status": "single-source", - "source": "spacy" + "status": "validated" }, { "label": "RISIKOPROFIL", "entity": "Core", - "page": 7, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 7, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 7, + "page": 9, "status": "single-source", "source": "exxeta" }, @@ -80,46 +43,24 @@ "status": "single-source", "source": "spacy" }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 11, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 11, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 11, - "status": "single-source", - "source": "exxeta" - }, { "label": "RISIKOPROFIL", "entity": "Core / Core +", "page": 12, - "status": "single-source", - "source": "spacy" + "status": "validated" }, { "label": "RISIKOPROFIL", "entity": "core\n/ core+", "page": 12, - "status": "single-source", - "source": "spacy" + "status": "validated" }, { "label": "RISIKOPROFIL", "entity": "core", "page": 12, - "status": "validated" + "status": "single-source", + "source": "spacy" }, { "label": "RISIKOPROFIL", @@ -132,8 +73,7 @@ "label": "RISIKOPROFIL", "entity": "core/core+", "page": 12, - "status": "single-source", - "source": "spacy" + "status": "validated" }, { "label": "RISIKOPROFIL", @@ -233,6 +173,13 @@ "status": "single-source", "source": "exxeta" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 15, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "countries, giving", @@ -240,6 +187,13 @@ "status": "single-source", "source": "spacy" }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 19, + "status": "single-source", + "source": "exxeta" + }, { "label": "RISIKOPROFIL", "entity": "core/core+", @@ -259,27 +213,6 @@ "status": "single-source", "source": "spacy" }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 24, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 24, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 24, - "status": "single-source", - "source": "exxeta" - }, { "label": "RISIKOPROFIL", "entity": "UK, DE, BE, NL, LU,", @@ -294,20 +227,6 @@ "status": "single-source", "source": "exxeta" }, - { - "label": "RISIKOPROFIL", - "entity": "Core+", - "page": 26, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Value-added", - "page": 26, - "status": "single-source", - "source": "exxeta" - }, { "label": "RISIKOPROFIL", "entity": "core or", @@ -319,18 +238,12 @@ "label": "RISIKOPROFIL", "entity": "Core +", "page": 27, - "status": "validated" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 27, "status": "single-source", - "source": "exxeta" + "source": "spacy" }, { "label": "RISIKOPROFIL", - "entity": "Opportunistisch", + "entity": "Core Offices, Core + assets", "page": 27, "status": "single-source", "source": "exxeta" @@ -351,15 +264,8 @@ }, { "label": "RISIKOPROFIL", - "entity": "Core", - "page": 34, - "status": "single-source", - "source": "exxeta" - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 34, + "entity": "Core, Core+", + "page": 33, "status": "single-source", "source": "exxeta" }, @@ -395,13 +301,29 @@ "label": "RISIKOPROFIL", "entity": "Core", "page": 36, - "status": "validated" + "status": "single-source", + "source": "spacy" }, { "label": "RISIKOPROFIL", "entity": "Core", "page": 36, - "status": "validated" + "status": "single-source", + "source": "spacy" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core Parking", + "page": 36, + "status": "single-source", + "source": "exxeta" + }, + { + "label": "RISIKOPROFIL", + "entity": "Core Parking", + "page": 36, + "status": "single-source", + "source": "exxeta" }, { "label": "RISIKOPROFIL",