Fine Tuning prompt für ExxetaGPT

pull/34/head
s8613 2025-04-26 13:23:49 +02:00
parent 7b6a19bbc3
commit fc9302cbbb
3 changed files with 90 additions and 215 deletions

View File

@ -6,23 +6,31 @@ MODEL = "gpt-35-turbo"
def extract_risikoprofil_from_exxeta(pages_json):
results = []
skipped_pages = []
for page_data in pages_json:
page_num = page_data.get("page")
text = page_data.get("text", "").strip()
if not text:
continue
prompt = (
f"Bitte extrahiere alle Vorkommen von Risikoprofilen wie \"Core\", \"Core+\", "
f"\"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" "
f"auf **Seite {page_num}** im folgenden Text.\n\n"
f"Liefere das Ergebnis NUR als valides JSON-Array:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core\", \"page\": {page_num}}}]\n\n"
f"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array zurück: []\n\n"
f"Keine ESG-Profile oder Carbon-Ziele. Nur Risikoprofilierungen. Keine Kommentare oder Text außerhalb des JSON.\n\n"
f"TEXT:\n{text}"
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"

View File

@ -1,33 +1,18 @@
[
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 1
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 2
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"entity": "Core and Core+",
"page": 4
},
{
"label": "RISIKOPROFIL",
"entity": "Core, core+, value-added",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 7
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 7
"page": 9
},
{
"label": "RISIKOPROFIL",
@ -36,22 +21,7 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 11
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"entity": "Core / Core+",
"page": 12
},
{
@ -101,29 +71,24 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core/core+",
"entity": "Core",
"page": 15
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 20
},
{
"label": "RISIKOPROFIL",
"entity": "Core/core+",
"entity": "core/core+",
"page": 20
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 24
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
@ -131,38 +96,13 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 26
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 26
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"entity": "Core Offices, Core + assets",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Opportunistisch",
"page": 27
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34
"entity": "Core, Core+",
"page": 33
},
{
"label": "RISIKOPROFIL",
@ -176,7 +116,12 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"entity": "Core Parking",
"page": 36
},
{
"label": "RISIKOPROFIL",
"entity": "Core Parking",
"page": 36
},
{

View File

@ -1,57 +1,20 @@
[
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 1,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 2,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core and Core+",
"page": 4,
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 4,
"status": "single-source",
"source": "exxeta"
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "core, core+, value-added",
"page": 7,
"status": "single-source",
"source": "spacy"
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 7,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 7,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 7,
"page": 9,
"status": "single-source",
"source": "exxeta"
},
@ -80,46 +43,24 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 11,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core / Core +",
"page": 12,
"status": "single-source",
"source": "spacy"
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "core\n/ core+",
"page": 12,
"status": "single-source",
"source": "spacy"
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "core",
"page": 12,
"status": "validated"
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
@ -132,8 +73,7 @@
"label": "RISIKOPROFIL",
"entity": "core/core+",
"page": 12,
"status": "single-source",
"source": "spacy"
"status": "validated"
},
{
"label": "RISIKOPROFIL",
@ -233,6 +173,13 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 15,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "countries, giving",
@ -240,6 +187,13 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 19,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core/core+",
@ -259,27 +213,6 @@
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 24,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "UK, DE, BE, NL, LU,",
@ -294,20 +227,6 @@
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core+",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Value-added",
"page": 26,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "core or",
@ -319,18 +238,12 @@
"label": "RISIKOPROFIL",
"entity": "Core +",
"page": 27,
"status": "validated"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 27,
"status": "single-source",
"source": "exxeta"
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Opportunistisch",
"entity": "Core Offices, Core + assets",
"page": 27,
"status": "single-source",
"source": "exxeta"
@ -351,15 +264,8 @@
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 34,
"entity": "Core, Core+",
"page": 33,
"status": "single-source",
"source": "exxeta"
},
@ -395,13 +301,29 @@
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"status": "validated"
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core",
"page": 36,
"status": "validated"
"status": "single-source",
"source": "spacy"
},
{
"label": "RISIKOPROFIL",
"entity": "Core Parking",
"page": 36,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",
"entity": "Core Parking",
"page": 36,
"status": "single-source",
"source": "exxeta"
},
{
"label": "RISIKOPROFIL",