Merge remote-tracking branch 'origin/main' into neue-Kennzahl-spacy

pull/94/head
Abdulrahman Dabbagh 2025-06-27 10:16:43 +02:00
commit 77d169633e
7 changed files with 166 additions and 63 deletions

View File

@ -1,7 +1,6 @@
import requests
import json
import os
import time
import logging
from dotenv import load_dotenv
@ -17,6 +16,18 @@ TIMEOUT = 180
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_dynamic_labels():
    """Fetch the upper-cased names of all active KPI settings.

    Sends a GET request (10 second timeout) to
    ``{COORDINATOR_URL}/api/kpi_setting/`` and keeps every entry whose
    ``active`` flag is truthy.

    Entries that are not dicts, are inactive, or carry no usable ``name`` are
    skipped individually, so a single malformed entry no longer discards the
    labels of all other entries.

    Returns:
        list[str]: The upper-cased ``name`` of each active entry. An empty
        list is returned (and a warning is logged) when the request fails,
        the server answers with an error status, or the response body cannot
        be processed.
    """
    url = f"{COORDINATOR_URL}/api/kpi_setting/"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        kpi_list = response.json()
        if not isinstance(kpi_list, list):
            # Routed through the handler below so the caller still gets [].
            raise ValueError(
                f"unerwartetes Antwortformat: {type(kpi_list).__name__}"
            )

        labels = []
        for kpi in kpi_list:
            if not isinstance(kpi, dict) or not kpi.get("active", False):
                continue
            name = kpi.get("name")
            if isinstance(name, str) and name.strip():
                labels.append(name.upper())
            else:
                logger.warning(
                    "KPI-Einstellung ohne gültigen Namen übersprungen: %r", kpi
                )
        return labels
    except Exception as e:
        # Deliberate best-effort boundary: label loading must never abort the
        # extraction, so every failure degrades to "no dynamic labels".
        logger.warning("Konnte dynamische Labels nicht laden: %s", e)
        return []
def extract_with_exxeta(pages_json, pitchbook_id):
results = []
@ -30,9 +41,7 @@ def extract_with_exxeta(pages_json, pitchbook_id):
if i % 8 == 0:
requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 35 + 60/len(pages_json)*i})
page_num = page_data.get("page")
page_data.get("page")
text = page_data.get("text", "")
if not text:
@ -51,9 +60,9 @@ def extract_with_exxeta(pages_json, pitchbook_id):
"- Gib die Antwort als **JSON-Array** im folgenden Format zurück:\n\n"
"[\n"
" {\n"
" \"label\": \"FONDSNAME\",\n"
" \"entity\": \"...\",\n"
f" \"page\": {page_num},\n"
' "label": "FONDSNAME",\n'
' "entity": "...",\n'
f' "page": {page_num},\n'
" },\n"
" ...\n"
"]\n\n"
@ -61,45 +70,29 @@ def extract_with_exxeta(pages_json, pitchbook_id):
f"TEXT:\n{text}"
)
else:
labels = get_dynamic_labels()
prompt_kennzahlen = "".join([f"- {label}\n" for label in labels])
prompt = (
"Bitte extrahiere relevante Fondskennzahlen aus dem folgenden Pitchbook-Text. "
"Analysiere den Text sorgfältig, um **nur exakt benannte und relevante Werte** zu extrahieren.\n\n"
"ZU EXTRAHIERENDE KENNZAHLEN (immer exakt wie unten angegeben):\n"
"- FONDSNAME\n"
"- FONDSMANAGER\n"
"- AIFM (z. B. Name Kapitalverwaltungsgesellschaft)\n"
"- DATUM\n"
"- RISIKOPROFIL (z. B. CORE, CORE+, VALUE-ADDED, OPPORTUNISTISCH)\n"
"- ARTIKEL (z. B. ARTIKEL 6, 8, 9)\n"
"- ZIELRENDITE\n"
"- RENDITE\n"
"- ZIELAUSSCHÜTTUNG\n"
"- AUSSCHÜTTUNG\n"
"- LAUFZEIT\n"
"- LTV\n"
"- MANAGEMENTGEBÜHREN (ggf. mit Staffelung und Bezug auf NAV/GAV)\n"
"- SEKTORENALLOKATION (z. B. BÜRO, LOGISTIK, WOHNEN... inkl. %-Angaben)\n"
"- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n"
f"{prompt_kennzahlen}\n"
"WICHTIG:\n"
"- Gib **nur eine Entität pro Kennzahl** an - keine Listen oder Interpretationen.\n"
"- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n"
'- Wenn mehrere Varianten genannt werden (z. B. "Core und Core+"), gib sie im Originalformat als **eine entity** an.\n'
"- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n"
"- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n"
"- Jeder gefundene Wert muss einem der obigen Label **eindeutig zuordenbar** sein.\n\n"
"FORMAT:\n"
"Antworte als **reines JSON-Array** mit folgendem Format:\n"
"[\n"
" {\n"
" \"label\": \"Kennzahlname (exakt wie oben)\",\n"
" \"entity\": \"Wert aus dem Text (exakt im Original)\",\n"
f" \"page\": {page_num},\n"
' "label": "Kennzahlname (exakt wie oben)",\n'
' "entity": "Wert aus dem Text (exakt im Original)",\n'
f' "page": {page_num},\n'
" },\n"
" ...\n"
"]\n\n"
f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n"
f"Nur JSON-Antwort - keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
f"TEXT:\n{text}"
@ -125,10 +118,7 @@ def extract_with_exxeta(pages_json, pitchbook_id):
try:
response = requests.post(url, headers=headers, json=payload, timeout=TIMEOUT)
response.raise_for_status()
content = response.json()["choices"][0]["message"]["content"]
content = content.strip()
content = response.json()["choices"][0]["message"]["content"].strip()
if content.startswith("```json"):
content = content.split("```json")[1]
if content.endswith("```"):
@ -143,14 +133,16 @@ def extract_with_exxeta(pages_json, pitchbook_id):
if isinstance(page_results, list):
results.extend(page_results)
break
except requests.exceptions.RequestException as e:
except requests.exceptions.RequestException:
if attempt == MAX_RETRIES:
results.extend([])
except Exception as e:
except Exception:
if attempt == MAX_RETRIES:
results.extend([])
requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 95})
return json.dumps(results, indent=2, ensure_ascii=False)
if __name__ == "__main__":
    # Manual smoke test: print whatever labels the coordinator currently
    # reports as active.
    print("📡 Test-Aufruf get_dynamic_labels:")
    dynamic_labels = get_dynamic_labels()
    print(dynamic_labels)

View File

@ -1,15 +1,36 @@
from typing import Dict, List
import re
import requests
import os
# SETTINGS = [{"id": "Rendite", "type": "number"}]
COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5000")
def validate_entities(entities):
try:
response = requests.get(COORDINATOR_URL + "/api/kpi_setting/")
if response.status_code == 200:
settings = response.json()
else:
settings = []
except requests.exceptions.RequestException as e:
print(f"Error fetching settings: {e}")
settings = []
# settings = SETTINGS
result = []
reduced_kpi: Dict[str, List[Dict[str, str | int]]] = {}
reduced_kpi: Dict[str, List[Dict[str, str]]] = {}
# reduce entities by label. Example: {"PERSON": [{"label": "PERSON", "entity": "John Doe", "status": "validated"}]}
for item in entities:
label = item["label"]
if label not in reduced_kpi:
reduced_kpi[label] = []
reduced_kpi[label].append(item)
reduced_kpi = delete_exxeta_unknown(reduced_kpi)
reduced_kpi = validate_number(reduced_kpi, settings)
reduced_kpi = delete_duplicate_entities(reduced_kpi)
for item in reduced_kpi.items():
if item[0] == "FONDSNAME":
result.extend(item[1])
@ -21,6 +42,8 @@ def validate_entities(entities):
result.extend(item[1])
continue
# Filter not validated, if there are valid values
validated = False
for entity in item[1]:
if entity["status"] == "validated":
@ -34,13 +57,82 @@ def validate_entities(entities):
return result
def validate_number(entity_list, settings):
    """Drop non-numeric entities from every KPI configured as type "number".

    Args:
        entity_list: Mapping of label -> list of entity dicts. Despite the
            parameter name this is a dict; each entity dict must provide an
            "entity" key.
        settings: Iterable of KPI setting dicts. The "name" (compared
            upper-cased against the label) and "type" keys are read. Entries
            without a string "name" are ignored; ``None`` is treated as
            "no settings".

    Returns:
        dict: A new mapping. Labels whose setting is not of type "number"
        (or that have no setting) are passed through unchanged. For
        number-typed labels only the entities accepted by ``is_valid_number``
        are kept, and the label is dropped entirely when none remain.
    """
    # Build the label -> type lookup once instead of scanning the settings
    # for every label. setdefault keeps the first setting per name, matching
    # the first-match semantics of a linear search.
    type_by_label = {}
    for setting in settings or ():
        if not isinstance(setting, dict):
            continue
        name = setting.get("name")
        if isinstance(name, str):
            type_by_label.setdefault(name.upper(), setting.get("type"))

    filtered_kpi = {}
    for label, entities in entity_list.items():
        if type_by_label.get(label) != "number":
            filtered_kpi[label] = entities
            continue

        valid_entities = []
        for entity in entities:
            if is_valid_number(str(entity["entity"])):
                valid_entities.append(entity)
            else:
                print(f"Invalid number: {entity}")

        if valid_entities:  # Only add the label if there are entities left
            filtered_kpi[label] = valid_entities
    return filtered_kpi
def is_valid_number(number):
    """Return True when *number* looks like a plain numeric value.

    A string is accepted when all of the following hold:

    * it contains at least one digit,
    * it has no two digit groups separated by a single whitespace
      character (e.g. "8 8 8" is rejected),
    * it consists solely of digits, "-", whitespace, "%", ",", "." and "€".

    Args:
        number: The candidate text; must be a ``str``.

    Returns:
        bool: Always a real boolean, never a match object or ``None``.
    """
    pattern = r'^[0-9\-\s%,.€]+$'
    if not any(char.isdigit() for char in number):
        return False
    if re.search(r'\d+\s\d+', number):
        return False
    return re.fullmatch(pattern, number) is not None
def delete_exxeta_unknown(entity_list):
    """Remove entities whose value is a "not specified" placeholder.

    An entity counts as a placeholder when its "entity" value, converted to
    a string, lower-cased and with all spaces removed, equals
    "nichtangegeben" or "n/a". Every removed entity is reported on stdout.

    Args:
        entity_list: Mapping of label -> list of entity dicts. Despite the
            parameter name this is a dict; each entity dict must provide an
            "entity" key.

    Returns:
        dict: A new mapping holding only the remaining entities. Labels with
        no remaining entities are omitted.
    """
    placeholders = {"nichtangegeben", "n/a"}

    filtered_kpi = {}
    for label, entities in entity_list.items():
        kept = []
        for entity in entities:
            normalized = str(entity["entity"]).lower().replace(" ", "")
            if normalized in placeholders:
                print(f"filtered out: {entity}")
            else:
                kept.append(entity)
        if kept:  # Only add the label if there are entities left
            filtered_kpi[label] = kept
    return filtered_kpi
def delete_duplicate_entities(entity_list):
    """Keep only the first occurrence of each entity value per label.

    Two entities of the same label are duplicates when their "entity"
    values are equal after conversion to a string, lower-casing and removal
    of all spaces. Every dropped duplicate is reported on stdout.

    Args:
        entity_list: Mapping of label -> list of entity dicts. Despite the
            parameter name this is a dict; each entity dict must provide an
            "entity" key.

    Returns:
        dict: A new mapping with duplicates removed, preserving the original
        order of the first occurrences. Labels whose list is empty are
        omitted.
    """
    unique_entities = {}
    for label, entities in entity_list.items():
        seen = set()
        kept = []
        for entity in entities:
            key = str(entity["entity"]).lower().replace(" ", "")
            if key in seen:
                print(f"Duplicate entity: {entity}")
                continue
            seen.add(key)
            kept.append(entity)
        if kept:
            unique_entities[label] = kept
    return unique_entities
if __name__ == "__main__":
    # Manual smoke test: run the validation pipeline on hand-written sample
    # entities and print the result. The RENDITE samples cover placeholder
    # values, duplicates, non-numeric text, an empty string and a non-string
    # value.
    entities = [
        {"label": "PERSON", "entity": "John Doe", "status": "validated"},
        {"label": "PERSON", "entity": "Exxeta", "status": "invalid"},
        {"label": "ORG", "entity": "Google", "status": "invalid"},
        {"label": "FONDSNAME", "entity": "Microsoft", "status": "validated"},
        {"label": "FONDSNAME", "entity": "Amazon", "status": "invalid"},
        {"label": "FONDSNAME", "entity": "Apple", "status": "invalid"},
        {"label": "RENDITE", "entity": "8 8 8 8 8", "status": "validated"},
        {"label": "RENDITE", "entity": "N/A", "status": "validated"},
        {"label": "RENDITE", "entity": "nicht angegeben", "status": "validated"},
        {"label": "RENDITE", "entity": "uaieluae--t>", "status": "validated"},
        {"label": "RENDITE", "entity": "3,5", "status": "validated"},
        {"label": "RENDITE", "entity": "3,5", "status": "validated"},
        {"label": "RENDITE", "entity": "3 , 5", "status": "validated"},
        {"label": "RENDITE", "entity": "3%", "status": "validated"},
        {"label": "RENDITE", "entity": "", "status": "invalid"},
        {"label": "RENDITE", "entity": "2 mehr als 6", "status": "invalid"},
        {"label": "RENDITE", "entity": 2, "status": "invalid"},
    ]
    print(validate_entities(entities))

View File

@ -330,6 +330,9 @@ export function ConfigTable({ from }: ConfigTableProps) {
>
<span title={`Click to view details (ID: ${kennzahl.id})`}>
{kennzahl.name}
{kennzahl.mandatory && (
<span> *</span>
)}
</span>
</td>
<td style={{ padding: "12px" }}>

View File

@ -360,7 +360,7 @@ export function PitchBooksTable() {
{status === "completed" ? (
<Chip
icon={<CheckCircleIcon />}
label="Abgeschlossen"
label="Extraktion Abgeschlossen"
size="small"
sx={{
backgroundColor: "#e8f5e9",

View File

@ -121,7 +121,7 @@ export default function UploadPage() {
fontWeight: "bold",
color: "#383838",
marginBottom: 12,
marginTop: 6,
marginTop: 3,
}}
>
Pitchbook Extractor
@ -207,7 +207,7 @@ export default function UploadPage() {
onMouseEnter={() => router.preloadRoute({ to: "/pitchbooks" })}
onClick={() => navigate({ to: "/pitchbooks" })}
>
Alle Pitch Books anzeigen
Alle Pitchbooks anzeigen
</Button>
</Box>
</>

View File

@ -1,5 +1,5 @@
import ContentPasteIcon from "@mui/icons-material/ContentPaste";
import { Box, Button, Paper, Typography, Snackbar, Alert, IconButton } from "@mui/material";
import { Box, Button, Paper, Typography, Snackbar, Alert, IconButton, Tooltip } from "@mui/material";
import ArrowBackIcon from "@mui/icons-material/ArrowBack";
import { useSuspenseQuery } from "@tanstack/react-query";
import { createFileRoute, useNavigate } from "@tanstack/react-router";
@ -50,6 +50,8 @@ function ExtractedResultsPage() {
const { data: kpi } = useSuspenseQuery(kpiQueryOptions(pitchBook));
const { data: settings } = useSuspenseQuery(settingsQueryOptions());
const fundName = kpi["FONDSNAME"]?.[0]?.entity;
const status = useMemo(() => {
let hasRedBorders = false;
let hasYellowBorders = false;
@ -158,7 +160,9 @@ function ExtractedResultsPage() {
}}
/>
<Typography variant="h5" gutterBottom>
<strong>Extrahierte Kennzahlen</strong>
<strong>
{fundName ? `Kennzahlen extrahiert aus: ${fundName}` : "Extrahierte Kennzahlen"}
</strong>
</Typography>
</Box>
<Box
@ -235,17 +239,29 @@ function ExtractedResultsPage() {
gap={2}
sx={{ flexShrink: 0 }}
>
<Button variant="contained" sx={{ backgroundColor: "#383838" }}
onClick={handleCopyToClipboard}>
<ContentPasteIcon sx={{ fontSize: 18, mr: 1 }} />
{copied ? "Kopiert!" : "Kennzahlenzeile kopieren"}
</Button>
<Tooltip
title={
<>
<b>Kennzahlen kopieren</b>
<br />
Kopiert alle aktiven Kennzahlen als Excel-Zeile in die Zwischenablage. Kann direkt in Excel eingefügt werden.
</>
}
placement="top"
arrow
>
<Button variant="contained" sx={{ backgroundColor: "#383838" }}
onClick={handleCopyToClipboard}>
<ContentPasteIcon sx={{ fontSize: 18, mr: 1 }} />
{copied ? "Kopiert!" : "Kennzahlenzeile kopieren"}
</Button>
</Tooltip>
<Button
variant="contained"
sx={{ backgroundColor: "#383838" }}
onClick={() => navigate({ to: "/" })}
>
Neu hochladen
Neues Pitchbook hochladen
</Button>
</Box>
</Box>

View File

@ -1,11 +1,11 @@
/**
 * Format a date string as `HH:mm DD.MM.YYYY`, shifted forward by two hours.
 *
 * The two-hour shift is applied to the Date object itself rather than to the
 * already extracted hour value, so late-evening times roll over into the next
 * day (and month/year) instead of producing hours such as "24" or "25".
 *
 * NOTE(review): the fixed +2h offset presumably compensates for timestamps
 * delivered in UTC and displayed in CEST — confirm; a timezone-aware
 * formatter would also be correct during winter time.
 *
 * @param dateString - Any string accepted by the `Date` constructor.
 * @returns The shifted timestamp formatted as `HH:mm DD.MM.YYYY`.
 */
export const formatDate = (dateString: string): string => {
  const date = new Date(dateString);
  // setHours normalises overflow, carrying hour values >= 24 into the date.
  date.setHours(date.getHours() + 2);

  const hours = String(date.getHours()).padStart(2, "0");
  const minutes = String(date.getMinutes()).padStart(2, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0"); // Months are zero-based
  const day = String(date.getDate()).padStart(2, "0");
  const year = date.getFullYear();

  return `${hours}:${minutes} ${day}.${month}.${year}`;
};