KPI_data angepasst

pull/94/head
Abdulrahman Dabbagh 2025-06-29 04:57:24 +02:00
parent 12783539b3
commit 360da3acb0
11 changed files with 234 additions and 133 deletions

View File

@ -29,13 +29,11 @@ def create_kpi_setting():
required_fields = [ required_fields = [
"name", "name",
"description",
"mandatory", "mandatory",
"type", "type",
"translation",
"example",
"position", "position",
"active", "active",
"examples",
] ]
for field in required_fields: for field in required_fields:
if field not in data: if field not in data:
@ -55,13 +53,11 @@ def create_kpi_setting():
new_kpi_setting = KPISettingModel( new_kpi_setting = KPISettingModel(
name=data["name"], name=data["name"],
description=data["description"],
mandatory=data["mandatory"], mandatory=data["mandatory"],
type=kpi_type, type=kpi_type,
translation=data["translation"],
example=data["example"],
position=data["position"], position=data["position"],
active=data["active"], active=data["active"],
examples=data.get("examples", []),
) )
db.session.add(new_kpi_setting) db.session.add(new_kpi_setting)
@ -84,9 +80,6 @@ def update_kpi_setting(id):
return jsonify({"error": "KPI Setting with this name already exists"}), 409 return jsonify({"error": "KPI Setting with this name already exists"}), 409
kpi_setting.name = data["name"] kpi_setting.name = data["name"]
if "description" in data:
kpi_setting.description = data["description"]
if "mandatory" in data: if "mandatory" in data:
kpi_setting.mandatory = data["mandatory"] kpi_setting.mandatory = data["mandatory"]
@ -100,18 +93,15 @@ def update_kpi_setting(id):
400, 400,
) )
if "translation" in data:
kpi_setting.translation = data["translation"]
if "example" in data:
kpi_setting.example = data["example"]
if "position" in data: if "position" in data:
kpi_setting.position = data["position"] kpi_setting.position = data["position"]
if "active" in data: if "active" in data:
kpi_setting.active = data["active"] kpi_setting.active = data["active"]
if "examples" in data:
kpi_setting.examples = data["examples"]
db.session.commit() db.session.commit()
return jsonify(kpi_setting.to_dict()), 200 return jsonify(kpi_setting.to_dict()), 200

View File

@ -2,6 +2,8 @@ from model.database import db
from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import Enum as SQLAlchemyEnum from sqlalchemy import Enum as SQLAlchemyEnum
from enum import Enum from enum import Enum
from sqlalchemy.dialects.postgresql import JSONB
from collections import OrderedDict
class KPISettingType(Enum): class KPISettingType(Enum):
@ -18,37 +20,31 @@ class KPISettingModel(db.Model):
id: Mapped[int] = mapped_column(primary_key=True) id: Mapped[int] = mapped_column(primary_key=True)
name: Mapped[str] = mapped_column(unique=True) name: Mapped[str] = mapped_column(unique=True)
description: Mapped[str]
mandatory: Mapped[bool] mandatory: Mapped[bool]
type: Mapped[KPISettingType] = mapped_column( type: Mapped[KPISettingType] = mapped_column(
SQLAlchemyEnum(KPISettingType, native_enum=True) SQLAlchemyEnum(KPISettingType, native_enum=True)
) )
translation: Mapped[str]
example: Mapped[str]
position: Mapped[int] position: Mapped[int]
active: Mapped[bool] active: Mapped[bool]
examples: Mapped[list] = mapped_column(JSONB, default=[])
def to_dict(self): def to_dict(self):
return { return OrderedDict(
"id": self.id, [
"name": self.name, ("id", self.id),
"description": self.description, ("name", self.name),
"mandatory": self.mandatory, ("mandatory", self.mandatory),
"type": self.type.value, ("type", self.type.value),
"translation": self.translation, ("position", self.position),
"example": self.example, ("examples", self.examples),
"position": self.position, ("active", self.active),
"active": self.active, ]
} )
def __init__( def __init__(self, name, mandatory, type, position, active, examples=None):
self, name, description, mandatory, type, translation, example, position, active
):
self.name = name self.name = name
self.description = description
self.mandatory = mandatory self.mandatory = mandatory
self.type = type self.type = type
self.translation = translation
self.example = example
self.position = position self.position = position
self.active = active self.active = active
self.examples = examples or []

View File

@ -10,153 +10,243 @@ def seed_default_kpi_settings():
default_kpi_settings = [ default_kpi_settings = [
{ {
"name": "Fondsname", "name": "Fondsname",
"description": "Der vollständige Name des Investmentfonds",
"mandatory": True, "mandatory": True,
"type": KPISettingType.STRING, "type": KPISettingType.STRING,
"translation": "Fund Name",
"example": "Alpha Real Estate Fund I",
"position": 1, "position": 1,
"active": True, "active": True,
"examples": [
{
"sentence": "Der Fonds trägt den Namen Alpha Real Estate Fund I.",
"value": "Alpha Real Estate Fund I",
},
{
"sentence": "Im Pitchbook wird der Fondsname als Alpha Real Estate Fund I angegeben.",
"value": "Alpha Real Estate Fund I",
},
],
}, },
{ {
"name": "Fondsmanager", "name": "Fondsmanager",
"description": "Verantwortlicher Manager für die Fondsverwaltung",
"mandatory": True, "mandatory": True,
"type": KPISettingType.STRING, "type": KPISettingType.STRING,
"translation": "Fund Manager",
"example": "Max Mustermann",
"position": 2, "position": 2,
"active": True, "active": True,
"examples": [
{
"sentence": "Fondsmanager des Projekts ist Max Mustermann.",
"value": "Max Mustermann",
},
{
"sentence": "Die Verwaltung liegt bei Max Mustermann.",
"value": "Max Mustermann",
},
],
}, },
{ {
"name": "AIFM", "name": "AIFM",
"description": "Alternative Investment Fund Manager",
"mandatory": True, "mandatory": True,
"type": KPISettingType.STRING, "type": KPISettingType.STRING,
"translation": "AIFM",
"example": "Alpha Investment Management GmbH",
"position": 3, "position": 3,
"active": True, "active": True,
"examples": [
{
"sentence": "AIFM ist die Alpha Investment Management GmbH.",
"value": "Alpha Investment Management GmbH",
},
{
"sentence": "Die Alpha Investment Management GmbH fungiert als AIFM.",
"value": "Alpha Investment Management GmbH",
},
],
}, },
{ {
"name": "Datum", "name": "Datum",
"description": "Stichtag der Datenerfassung",
"mandatory": True, "mandatory": True,
"type": KPISettingType.DATE, "type": KPISettingType.DATE,
"translation": "Date",
"example": "05.05.2025",
"position": 4, "position": 4,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Daten basieren auf dem Stand vom 05.05.2025.",
"value": "05.05.2025",
},
{
"sentence": "Stichtag der Angaben ist der 05.05.2025.",
"value": "05.05.2025",
},
],
}, },
{ {
"name": "Risikoprofil", "name": "Risikoprofil",
"description": "Klassifizierung des Risikos des Fonds",
"mandatory": True, "mandatory": True,
"type": KPISettingType.STRING, "type": KPISettingType.STRING,
"translation": "Risk Profile",
"example": "Core/Core++",
"position": 5, "position": 5,
"active": True, "active": True,
"examples": [
{
"sentence": "Der Fonds hat das Risikoprofil Core/Core++.",
"value": "Core/Core++",
},
{
"sentence": "Einstufung des Fondsrisikos: Core/Core++.",
"value": "Core/Core++",
},
],
}, },
{ {
"name": "Artikel", "name": "Artikel",
"description": "Artikel 8 SFDR-Klassifizierung",
"mandatory": False, "mandatory": False,
"type": KPISettingType.BOOLEAN, "type": KPISettingType.BOOLEAN,
"translation": "Article",
"example": "Artikel 8",
"position": 6, "position": 6,
"active": True, "active": True,
"examples": [
{
"sentence": "Der Fonds erfüllt die Anforderungen von Artikel 8.",
"value": "Artikel 8",
},
{
"sentence": "Gemäß SFDR fällt dieser Fonds unter Artikel 8.",
"value": "Artikel 8",
},
],
}, },
{ {
"name": "Zielrendite", "name": "Zielrendite",
"description": "Angestrebte jährliche Rendite in Prozent",
"mandatory": True, "mandatory": True,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "Target Return",
"example": "6.5",
"position": 7, "position": 7,
"active": True, "active": True,
"examples": [
{
"sentence": "Die angestrebte Zielrendite liegt bei 6.5%.",
"value": "6.5%",
},
{"sentence": "Zielrendite des Fonds beträgt 6.5%.", "value": "6.5%"},
],
}, },
{ {
"name": "Rendite", "name": "Rendite",
"description": "Tatsächlich erzielte Rendite in Prozent",
"mandatory": False, "mandatory": False,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "Return",
"example": "5.8",
"position": 8, "position": 8,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Rendite für das Jahr beträgt 5.8%.",
"value": "5.8%",
},
{
"sentence": "Im letzten Jahr wurde eine Rendite von 5.8% erzielt.",
"value": "5.8%",
},
],
}, },
{ {
"name": "Zielausschüttung", "name": "Zielausschüttung",
"description": "Geplante Ausschüttung in Prozent",
"mandatory": False, "mandatory": False,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "Target Distribution",
"example": "4.0",
"position": 9, "position": 9,
"active": True, "active": True,
"examples": [
{"sentence": "Die Zielausschüttung beträgt 4.0%.", "value": "4.0%"},
{
"sentence": "Geplante Ausschüttung: 4.0% pro Jahr.",
"value": "4.0%",
},
],
}, },
{ {
"name": "Ausschüttung", "name": "Ausschüttung",
"description": "Tatsächliche Ausschüttung in Prozent",
"mandatory": False, "mandatory": False,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "Distribution",
"example": "3.8",
"position": 10, "position": 10,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Ausschüttung im Jahr 2024 lag bei 3.8%.",
"value": "3.8%",
},
{
"sentence": "Es wurde eine Ausschüttung von 3.8% vorgenommen.",
"value": "3.8%",
},
],
}, },
{ {
"name": "Laufzeit", "name": "Laufzeit",
"description": "Geplante Laufzeit des Fonds",
"mandatory": True, "mandatory": True,
"type": KPISettingType.STRING, "type": KPISettingType.STRING,
"translation": "Duration",
"example": "7 Jahre, 10, Evergreen",
"position": 11, "position": 11,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Laufzeit des Fonds beträgt 7 Jahre.",
"value": "7 Jahre",
},
{"sentence": "Geplante Dauer: Evergreen-Modell.", "value": "Evergreen"},
],
}, },
{ {
"name": "LTV", "name": "LTV",
"description": "Loan-to-Value Verhältnis in Prozent",
"mandatory": False, "mandatory": False,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "LTV",
"example": "65.0",
"position": 12, "position": 12,
"active": True, "active": True,
"examples": [
{"sentence": "Der LTV beträgt 65.0%.", "value": "65.0%"},
{"sentence": "Loan-to-Value-Ratio: 65.0%.", "value": "65.0%"},
],
}, },
{ {
"name": "Managementgebühren", "name": "Managementgebühren",
"description": "Jährliche Verwaltungsgebühren in Prozent",
"mandatory": True, "mandatory": True,
"type": KPISettingType.NUMBER, "type": KPISettingType.NUMBER,
"translation": "Management Fees",
"example": "1.5",
"position": 13, "position": 13,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Managementgebühren betragen jährlich 1.5%.",
"value": "1.5%",
},
{
"sentence": "Für die Verwaltung wird eine Gebühr von 1.5% erhoben.",
"value": "1.5%",
},
],
}, },
{ {
"name": "Sektorenallokation", "name": "Sektorenallokation",
"description": "Verteilung der Investments nach Sektoren",
"mandatory": False, "mandatory": False,
"type": KPISettingType.ARRAY, "type": KPISettingType.ARRAY,
"translation": "Sector Allocation",
"example": "Büro, Wohnen, Logistik, Studentenwohnen",
"position": 14, "position": 14,
"active": True, "active": True,
"examples": [
{
"sentence": "Die Sektorenallokation umfasst Büro, Wohnen und Logistik.",
"value": "Büro, Wohnen, Logistik",
},
{
"sentence": "Investiert wird in Büro, Logistik und Studentenwohnen.",
"value": "Büro, Logistik, Studentenwohnen",
},
],
}, },
{ {
"name": "Länderallokation", "name": "Länderallokation",
"description": "Geografische Verteilung der Investments",
"mandatory": False, "mandatory": False,
"type": KPISettingType.ARRAY, "type": KPISettingType.ARRAY,
"translation": "Country Allocation",
"example": "Deutschland,Frankreich, Österreich, Schweiz",
"position": 15, "position": 15,
"active": True, "active": True,
"examples": [
{
"sentence": "Investitionen erfolgen in Deutschland, Frankreich und Österreich.",
"value": "Deutschland, Frankreich, Österreich",
},
{
"sentence": "Die Länderallokation umfasst Deutschland, Schweiz und Frankreich.",
"value": "Deutschland, Schweiz, Frankreich",
},
],
}, },
] ]
@ -165,13 +255,11 @@ def seed_default_kpi_settings():
for kpi_data in default_kpi_settings: for kpi_data in default_kpi_settings:
kpi_setting = KPISettingModel( kpi_setting = KPISettingModel(
name=kpi_data["name"], name=kpi_data["name"],
description=kpi_data["description"],
mandatory=kpi_data["mandatory"], mandatory=kpi_data["mandatory"],
type=kpi_data["type"], type=kpi_data["type"],
translation=kpi_data["translation"],
example=kpi_data["example"],
position=kpi_data["position"], position=kpi_data["position"],
active=kpi_data["active"], active=kpi_data["active"],
examples=kpi_data.get("examples", []),
) )
db.session.add(kpi_setting) db.session.add(kpi_setting)

View File

@ -6,9 +6,12 @@ import json
app = Flask(__name__) app = Flask(__name__)
VALIDATE_SERVICE_URL = os.getenv("VALIDATE_SERVICE_URL", "http://localhost:5054/validate") VALIDATE_SERVICE_URL = os.getenv(
"VALIDATE_SERVICE_URL", "http://localhost:5054/validate"
)
@app.route('/extract', methods=['POST'])
@app.route("/extract", methods=["POST"])
def extract_text_from_ocr_json(): def extract_text_from_ocr_json():
json_data = request.get_json() json_data = request.get_json()
@ -16,19 +19,19 @@ def extract_text_from_ocr_json():
pages_data = json_data["extracted_text_per_page"] pages_data = json_data["extracted_text_per_page"]
entities_json = extract_with_exxeta(pages_data, pitchbook_id) entities_json = extract_with_exxeta(pages_data, pitchbook_id)
entities = json.loads(entities_json) if isinstance(entities_json, str) else entities_json entities = (
json.loads(entities_json) if isinstance(entities_json, str) else entities_json
)
validate_payload = { validate_payload = {"id": pitchbook_id, "service": "exxeta", "entities": entities}
"id": pitchbook_id,
"service": "exxeta",
"entities": entities
}
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}") print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}") print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
try: try:
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) response = requests.post(
VALIDATE_SERVICE_URL, json=validate_payload, timeout=600
)
print(f"[EXXETA] Validate service response: {response.status_code}") print(f"[EXXETA] Validate service response: {response.status_code}")
if response.status_code != 200: if response.status_code != 200:
print(f"[EXXETA] Validate service error: {response.text}") print(f"[EXXETA] Validate service error: {response.text}")

View File

@ -16,6 +16,7 @@ TIMEOUT = 180
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_dynamic_labels(): def get_dynamic_labels():
url = f"{COORDINATOR_URL}/api/kpi_setting/" url = f"{COORDINATOR_URL}/api/kpi_setting/"
try: try:
@ -28,6 +29,7 @@ def get_dynamic_labels():
logger.warning(f"Konnte dynamische Labels nicht laden: {e}") logger.warning(f"Konnte dynamische Labels nicht laden: {e}")
return [] return []
def extract_with_exxeta(pages_json, pitchbook_id): def extract_with_exxeta(pages_json, pitchbook_id):
results = [] results = []
@ -39,7 +41,10 @@ def extract_with_exxeta(pages_json, pitchbook_id):
for page_data in pages_json: for page_data in pages_json:
i += 1 i += 1
if i % 8 == 0: if i % 8 == 0:
requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 35 + 60/len(pages_json)*i}) requests.post(
COORDINATOR_URL + "/api/progress",
json={"id": pitchbook_id, "progress": 35 + 60 / len(pages_json) * i},
)
page_num = page_data.get("page") page_num = page_data.get("page")
text = page_data.get("text", "") text = page_data.get("text", "")
@ -100,23 +105,28 @@ def extract_with_exxeta(pages_json, pitchbook_id):
headers = { headers = {
"Content-Type": "application/json", "Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}" "Authorization": f"Bearer {EXXETA_API_KEY}",
} }
payload = { payload = {
"model": MODEL, "model": MODEL,
"messages": [ "messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst. Antworte ausschließlich mit einem validen JSON-Array."}, {
{"role": "user", "content": prompt} "role": "system",
"content": "Du bist ein Finanzanalyst. Antworte ausschließlich mit einem validen JSON-Array.",
},
{"role": "user", "content": prompt},
], ],
"temperature": 0.0 "temperature": 0.0,
} }
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
for attempt in range(1, MAX_RETRIES + 1): for attempt in range(1, MAX_RETRIES + 1):
try: try:
response = requests.post(url, headers=headers, json=payload, timeout=TIMEOUT) response = requests.post(
url, headers=headers, json=payload, timeout=TIMEOUT
)
response.raise_for_status() response.raise_for_status()
content = response.json()["choices"][0]["message"]["content"].strip() content = response.json()["choices"][0]["message"]["content"].strip()
if content.startswith("```json"): if content.startswith("```json"):
@ -140,9 +150,12 @@ def extract_with_exxeta(pages_json, pitchbook_id):
if attempt == MAX_RETRIES: if attempt == MAX_RETRIES:
results.extend([]) results.extend([])
requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 95}) requests.post(
COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 95}
)
return json.dumps(results, indent=2, ensure_ascii=False) return json.dumps(results, indent=2, ensure_ascii=False)
if __name__ == "__main__": if __name__ == "__main__":
print("📡 Test-Aufruf get_dynamic_labels:") print("📡 Test-Aufruf get_dynamic_labels:")
print(get_dynamic_labels()) print(get_dynamic_labels())

View File

@ -29,19 +29,17 @@ def convert_pdf_async(temp_path, pitchbook_id):
temp_path.unlink() # cleanup temp_path.unlink() # cleanup
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500 return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
with open(ocr_path, 'rb') as ocr_file: with open(ocr_path, "rb") as ocr_file:
ocr_file.seek(0) ocr_file.seek(0)
result = pdf_to_json(ocr_file) result = pdf_to_json(ocr_file)
payload = {"id": int(pitchbook_id), "extracted_text_per_page": result["pages"]}
payload = {
"id": int(pitchbook_id),
"extracted_text_per_page": result["pages"]
}
logger.info("Sending payload to EXXETA and SPACY services") logger.info("Sending payload to EXXETA and SPACY services")
requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 35}) requests.post(
COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 35}
)
try: try:
exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600) exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600)
logger.info(f"EXXETA response: {exxeta_response.status_code}") logger.info(f"EXXETA response: {exxeta_response.status_code}")
@ -54,14 +52,16 @@ def convert_pdf_async(temp_path, pitchbook_id):
except Exception as e: except Exception as e:
logger.error(f"Error calling SPACY: {e}") logger.error(f"Error calling SPACY: {e}")
files=[ files = [("file", ("", open(ocr_path, "rb"), "application/pdf"))]
('file',('',open(ocr_path,'rb'),'application/pdf'))
]
headers = {} headers = {}
try: try:
requests.put(
requests.put(f"{COORDINATOR_URL}/api/pitch_book/{pitchbook_id}", files=files, timeout=600, headers=headers) f"{COORDINATOR_URL}/api/pitch_book/{pitchbook_id}",
files=files,
timeout=600,
headers=headers,
)
logger.info("COORDINATOR response: Progress + File updated") logger.info("COORDINATOR response: Progress + File updated")
except Exception as e: except Exception as e:
logger.error(f"Error calling COORDINATOR: {e}") logger.error(f"Error calling COORDINATOR: {e}")
@ -72,7 +72,7 @@ def convert_pdf_async(temp_path, pitchbook_id):
logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True) logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
@app.route('/ocr', methods=['POST']) @app.route("/ocr", methods=["POST"])
def convert_extract_text_from_pdf(): def convert_extract_text_from_pdf():
if "file" not in request.files: if "file" not in request.files:
return {"error": "No file"}, 400 return {"error": "No file"}, 400
@ -85,7 +85,7 @@ def convert_extract_text_from_pdf():
if not pitchbook_id: if not pitchbook_id:
return {"error": "No ID"}, 400 return {"error": "No ID"}, 400
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
file.seek(0) file.seek(0)
temp_file.write(file.read()) temp_file.write(file.read())
temp_path = Path(temp_file.name) temp_path = Path(temp_file.name)
@ -93,10 +93,7 @@ def convert_extract_text_from_pdf():
thread = threading.Thread(target=convert_pdf_async, args=(temp_path, pitchbook_id)) thread = threading.Thread(target=convert_pdf_async, args=(temp_path, pitchbook_id))
thread.start() thread.start()
return { return {"status": "sent", "message": "PDF successfully OCR'd and processed"}, 200
"status": "sent",
"message": "PDF successfully OCR'd and processed"
}, 200
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -17,9 +17,10 @@ log_folder = TEMP_DIR / "logs"
output_folder.mkdir(exist_ok=True) output_folder.mkdir(exist_ok=True)
log_folder.mkdir(exist_ok=True) log_folder.mkdir(exist_ok=True)
def pdf_to_json(pdf_input): def pdf_to_json(pdf_input):
try: try:
if hasattr(pdf_input, 'read'): if hasattr(pdf_input, "read"):
pdf_input.seek(0) pdf_input.seek(0)
with pdfplumber.open(pdf_input) as pdf: with pdfplumber.open(pdf_input) as pdf:
@ -83,7 +84,9 @@ def ocr_pdf(input_file_path: Path):
if result.returncode == 0: if result.returncode == 0:
if output_file.exists(): if output_file.exists():
logger.info(f"OCR successful, output file size: {output_file.stat().st_size} bytes") logger.info(
f"OCR successful, output file size: {output_file.stat().st_size} bytes"
)
return output_file return output_file
else: else:
logger.error(f"OCR completed but output file not found: {output_file}") logger.error(f"OCR completed but output file not found: {output_file}")

View File

@ -40,7 +40,9 @@ def send_to_coordinator_service(processed_data, request_id):
def process_data_async(request_id, spacy_data, exxeta_data): def process_data_async(request_id, spacy_data, exxeta_data):
try: try:
requests.post(COORDINATOR_URL + "/api/progress", json={"id": request_id, "progress": 95}) requests.post(
COORDINATOR_URL + "/api/progress", json={"id": request_id, "progress": 95}
)
print(f"Start asynchronous processing for PitchBook: {request_id}") print(f"Start asynchronous processing for PitchBook: {request_id}")
# Perform merge # Perform merge
@ -96,7 +98,6 @@ def validate():
# If both datasets are present, start asynchronous processing # If both datasets are present, start asynchronous processing
if spacy_data is not None and exxeta_data is not None: if spacy_data is not None and exxeta_data is not None:
# Start asynchronous processing in a separate thread # Start asynchronous processing in a separate thread
processing_thread = threading.Thread( processing_thread = threading.Thread(
target=process_data_async, target=process_data_async,

View File

@ -27,7 +27,6 @@ def merge_entities(spacy_data, exxeta_data):
and s_entity_norm == e_entity_norm and s_entity_norm == e_entity_norm
and s_page == e_page and s_page == e_page
): ):
merged.append( merged.append(
{ {
"label": s["label"], "label": s["label"],

View File

@ -5,6 +5,8 @@ import os
# SETTINGS = [{"id": "Rendite", "type": "number"}] # SETTINGS = [{"id": "Rendite", "type": "number"}]
COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5000") COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5000")
def validate_entities(entities): def validate_entities(entities):
try: try:
response = requests.get(COORDINATOR_URL + "/api/kpi_setting/") response = requests.get(COORDINATOR_URL + "/api/kpi_setting/")
@ -42,7 +44,6 @@ def validate_entities(entities):
result.extend(item[1]) result.extend(item[1])
continue continue
# Filter not validated, if there are valid values # Filter not validated, if there are valid values
validated = False validated = False
for entity in item[1]: for entity in item[1]:
@ -61,11 +62,11 @@ def validate_entities(entities):
def validate_number(entity_list, settings): def validate_number(entity_list, settings):
filtered_kpi = {} filtered_kpi = {}
for label, entity_list in entity_list.items(): for label, entity_list in entity_list.items():
setting = next((s for s in settings if s["name"].upper() == label), None) setting = next((s for s in settings if s["name"].upper() == label), None)
if setting and setting["type"] == "number": if setting and setting["type"] == "number":
filtered_entities = [ filtered_entities = [
entity for entity in entity_list entity
for entity in entity_list
if is_valid_number(str(entity["entity"])) if is_valid_number(str(entity["entity"]))
] ]
for entity in entity_list: for entity in entity_list:
@ -80,8 +81,12 @@ def validate_number(entity_list, settings):
def is_valid_number(number): def is_valid_number(number):
pattern = r'^[0-9\-\s%,.€]+$' pattern = r"^[0-9\-\s%,.€]+$"
return any(char.isdigit() for char in number) and not re.search(r'\d+\s\d+', number) and re.fullmatch(pattern, number) return (
any(char.isdigit() for char in number)
and not re.search(r"\d+\s\d+", number)
and re.fullmatch(pattern, number)
)
def delete_exxeta_unknown(entity_list): def delete_exxeta_unknown(entity_list):
@ -89,11 +94,16 @@ def delete_exxeta_unknown(entity_list):
for label, entity_list in entity_list.items(): for label, entity_list in entity_list.items():
# Filter out entities with "nichtangegeben" or "n/a" (case-insensitive and stripped) # Filter out entities with "nichtangegeben" or "n/a" (case-insensitive and stripped)
filtered_entities = [ filtered_entities = [
entity for entity in entity_list entity
if str(entity["entity"]).lower().replace(" ", "") not in {"nichtangegeben", "n/a"} for entity in entity_list
if str(entity["entity"]).lower().replace(" ", "")
not in {"nichtangegeben", "n/a"}
] ]
for entity in entity_list: for entity in entity_list:
if str(entity["entity"]).lower().replace(" ", "") in {"nichtangegeben", "n/a"}: if str(entity["entity"]).lower().replace(" ", "") in {
"nichtangegeben",
"n/a",
}:
print(f"filtered out: {entity}") print(f"filtered out: {entity}")
if filtered_entities: # Only add the label if there are entities left if filtered_entities: # Only add the label if there are entities left
filtered_kpi[label] = filtered_entities filtered_kpi[label] = filtered_entities
@ -115,6 +125,7 @@ def delete_duplicate_entities(entity_list):
unique_entities[label] = filtered_entities unique_entities[label] = filtered_entities
return unique_entities return unique_entities
if __name__ == "__main__": if __name__ == "__main__":
entities = [ entities = [
# {"label": "PERSON", "entity": "John Doe", "status": "validated"}, # {"label": "PERSON", "entity": "John Doe", "status": "validated"},

View File

@ -122,7 +122,7 @@ export function KPIForm({ mode, initialData, onSave, onCancel, loading = false,
example: formData.example || '', example: formData.example || '',
position: formData.position ?? 0, position: formData.position ?? 0,
active: formData.active ?? true, active: formData.active ?? true,
examples: [{ sentence: '', value: '' }] examples: formData.examples ?? []
}); });
// Formular zurücksetzen: // Formular zurücksetzen:
setFormData(emptyKPI); setFormData(emptyKPI);