Merge pull request 'backend/flask-setup' (#38) from backend/flask-setup into main
Reviewed-on: #38
commit cc321fea4a
@ -0,0 +1,13 @@
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3
        files: ^project/backend/

  - repo: https://github.com/pycqa/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
        files: ^project/backend/
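These hooks only target files under `project/backend/`. As a quick sanity check they can be registered and run once over the whole tree (a minimal sketch, assuming `pre-commit` from requirements.txt is installed):

```bash
pre-commit install          # register the git hook defined by this config
pre-commit run --all-files  # run black and flake8 once without committing
```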
@ -0,0 +1,14 @@
services:
  backend:
    build: ./project/backend
    container_name: fundfuechse-backend
    ports:
      - "5000:5000"
    restart: always

  # frontend:
  #   build: ./project/frontend
  #   container_name: fundfuechse-frontend
  #   ports:
  #     - "3000:80"
  #   restart: always
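Together with the `/health` route added in `app.py` below, the compose service allows a quick smoke test (a sketch, assuming Docker Compose v2 and that it is run from the repository root):

```bash
docker compose up -d backend        # build and start only the backend service
curl http://localhost:5000/health   # should answer "OK"
```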
@ -0,0 +1,19 @@
# 1. Use a Python base image
FROM python:3.11-alpine

# 2. Set the working directory inside the container
WORKDIR /app

# 3. Copy requirements.txt and install packages
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 4. Copy the source code (e.g. app.py)
COPY . .

# 5. Start the Flask app
# production-style server with gunicorn
RUN pip install gunicorn

CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
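The image serves the app through gunicorn instead of Flask's development server. The same entry point can be tried outside Docker (sketch, assuming gunicorn is installed in the local environment):

```bash
pip install gunicorn
gunicorn --bind 0.0.0.0:5000 app:app   # identical to the Dockerfile CMD
```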
@ -0,0 +1,85 @@
## Setup

### Prerequisites

- Python 3.11+
- pip
- Docker (Desktop)
- Optional: `pre-commit`

### Install dependencies

```bash
pip install -r requirements.txt

# code quality (check locally)
black app.py
flake8 app.py
```

## Running the application

### Option 1: Locally

1. Install dependencies:

```bash
pip install -r requirements.txt
```

2. Start the Flask app:

```bash
python app.py
```

3. Open in the browser:

```
http://localhost:5000/
```

---

### Option 2: With Docker

1. Build the image:

```bash
docker build -t fundfuechse-backend .
```

2. Start the container:

```bash
docker run -p 5000:5000 fundfuechse-backend
```

The API is then available at:

```
http://localhost:5000/
```

---

### Option 3: With docker-compose

```bash
docker-compose up --build
```

The service is then reachable at:

```
http://localhost:5000/
```

---

### Test call via curl (upload a PDF)

```bash
curl.exe -X POST -F "file=@Pitchbook 1.pdf" http://localhost:5000/upload
```
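Based on the `/upload` handler in `app.py` below, a successful upload answers with HTTP 200 and a JSON confirmation, roughly (illustrative, not captured output):

```
{"message": "Datei Pitchbook 1.pdf erfolgreich gespeichert!"}
```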
@ -0,0 +1,66 @@
from flask import Flask, jsonify
from flask import request
import os

app = Flask(__name__)


@app.route("/health")
def health_check():
    return "OK"


# returns an example config of the metrics (for the UI)
@app.route("/config", methods=["GET"])
def get_config():
    config = [
        {"name": "Fondname", "format": "Text", "required": True},
        {"name": "IRR", "format": "Prozent", "required": False},
    ]
    return jsonify(config)


# returns example extraction results
@app.route("/extraction_results", methods=["GET"])
def get_extraction_results():
    results = [
        {"label": "Fondname", "entity": "ABC Fonds", "page": 1, "status": "validated"},
        {
            "label": "IRR",
            "entity": "6,0%",
            "page": 3,
            "status": "single-source",
            "source": "spaCy",
        },
    ]
    return jsonify(results)


# create the upload folder if it does not exist
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)


# accepts a PDF file via POST and stores it
@app.route("/upload", methods=["POST"])
def upload_pdf():
    if "file" not in request.files:
        return {"error": "Keine Datei hochgeladen."}, 400

    file = request.files["file"]

    if file.filename == "":
        return {"error": "Dateiname fehlt."}, 400

    if not file.filename.endswith(".pdf"):
        return {"error": "Nur PDF-Dateien erlaubt."}, 400

    file_path = os.path.join(UPLOAD_FOLDER, file.filename)
    file.save(file_path)

    return {"message": f"Datei {file.filename} erfolgreich gespeichert!"}, 200


# important for Docker: host='0.0.0.0'
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0")
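The routes above can also be exercised without a running server via Flask's test client. A minimal sketch (the module name `app` matches this file; the test file name and the dummy PDF bytes are illustrative):

```python
# test_app_sketch.py - hypothetical quick check, not part of this PR
import io

from app import app

client = app.test_client()

# health and config endpoints
assert client.get("/health").data == b"OK"
assert client.get("/config").get_json()[0]["name"] == "Fondname"

# upload a tiny fake "PDF"; the handler only checks the .pdf extension
resp = client.post(
    "/upload",
    data={"file": (io.BytesIO(b"%PDF-1.4 dummy"), "test.pdf")},
    content_type="multipart/form-data",
)
assert resp.status_code == 200
```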
@ -0,0 +1,4 @@
Flask
black
flake8
pre-commit
@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator

-fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb')
+fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb")
 rsrcmgr = PDFResourceManager()
 laparams = LAParams()
 device = PDFPageAggregator(rsrcmgr, laparams=laparams)

@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device)
 pages = PDFPage.get_pages(fp)

 for page in pages:
-    print('Processing next page...')
+    print("Processing next page...")
     interpreter.process_page(page)
     layout = device.get_result()
     for lobj in layout:
         if isinstance(lobj, LTTextBox):
             x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
-            print('At %r is text: %s' % ((x, y), text))
+            print("At %r is text: %s" % ((x, y), text))
@ -28,18 +28,14 @@ if uploaded_file and suchwort:
         rects = page.search_for(suchwort)

         for rect in rects:
-            fundstellen.append({
-                "seite": page_num,
-                "rect": rect
-            })
+            fundstellen.append({"seite": page_num, "rect": rect})

     if fundstellen:
         st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")

         # Auswahl der Fundstelle
         auswahl = st.selectbox(
-            "Fundstelle auswählen:",
-            [f"Seite {f['seite'] + 1}" for f in fundstellen]
+            "Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen]
         )

         index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)
@ -38,7 +38,9 @@ for eintrag in kennzahlen:
                 highlight = page.add_highlight_annot(rect)
                 highlight.update()
             else:
-                st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
+                st.warning(
+                    f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)"
+                )
     except Exception as e:
         st.error(f" Fehler bei Eintrag {eintrag}: {e}")
@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1))
 # PDF anzeigen mit Scroll zu aktueller Seite
 st.subheader(f"Vorschau")
 with open(highlighted_path, "rb") as f:
-    base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+    base64_pdf = base64.b64encode(f.read()).decode("utf-8")

 # Seite direkt ansteuern
-pdf_display = f'''
+pdf_display = f"""
 <iframe
     src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
     width="100%" height="800px" type="application/pdf">
 </iframe>
-'''
+"""
 st.markdown(pdf_display, unsafe_allow_html=True)
@ -87,9 +87,9 @@ class Server:
         server_params = StdioServerParameters(
             command=command,
             args=self.config["args"],
-            env={**os.environ, **self.config["env"]}
-            if self.config.get("env")
-            else None,
+            env=(
+                {**os.environ, **self.config["env"]} if self.config.get("env") else None
+            ),
         )
         try:
             stdio_transport = await self.exit_stack.enter_async_context(
@ -244,15 +244,10 @@ class LLMClient:
         formatted_messages = []
         for msg in messages:
             # print(msg)
-            formatted_messages.append({
-                "role": msg["role"],
-                "content": msg["content"]
-            })
+            formatted_messages.append({"role": msg["role"], "content": msg["content"]})

         client = AzureOpenAI(
-            api_key=self.api_key,
-            api_version="2023-07-01-preview",
-            base_url=url
+            api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
         )
         response = client.chat.completions.create(
             messages=formatted_messages,
@ -412,12 +407,16 @@ class ChatSession:
                 "4. Use appropriate context from the user's question\n"
                 "5. Avoid simply repeating the raw data\n\n"
                 "Please use only the tools that are explicitly defined above."
             )

             messages = [{"role": "system", "content": system_message}]
-            messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting."
-                "For each value there is only one correct answer, try to find it with the tools provided."})
+            messages.append(
+                {
+                    "role": "assistant",
+                    "content": "You have to extract data from pdf files and have different tools for extracting."
+                    "For each value there is only one correct answer, try to find it with the tools provided.",
+                }
+            )

             while True:
                 try:
@ -455,7 +454,6 @@ class ChatSession:
                 # messages.append({"role": "assistant", "content": llm_response})
                 # logging.info("\nFinal response: %s", llm_response)

-
             except KeyboardInterrupt:
                 logging.info("\nExiting...")
                 break
@ -476,5 +474,6 @@ async def main() -> None:
     chat_session = ChatSession(servers, llm_client)
     await chat_session.start()

+
 if __name__ == "__main__":
     asyncio.run(main())
@ -8,54 +8,86 @@ mcp = FastMCP("Demo")
 risikoProfile = ["Core/Core+, Core", "Value Add"]
 risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]


 # Add an addition tool
 @mcp.tool()
 def add(a: int, b: int) -> int:
     """Add two numbers"""
     return a + b


 @mcp.tool()
 def getFromSpaCy() -> list:
     """Get data from SpaCy"""
-    return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"},
-            {"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}]
+    return [
+        {
+            "page": random.randint(1, 35),
+            "value": random.choice(risikoProfileSpacy),
+            "key": "Risiko",
+        },
+        {"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"},
+    ]


 @mcp.tool()
 def getFromChatGPT() -> list:
     """Get data from ChatGPT"""
-    return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"},
-            {"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}]
+    return [
+        {
+            "page": random.randint(1, 35),
+            "value": random.choice(risikoProfile),
+            "key": "Risiko",
+        },
+        {"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"},
+    ]


 @mcp.tool()
 def checkSpacyResult() -> dict:
     """This tool checks the result of SpaCy, ensuring it meets certain criteria."""
-    return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}
+    return {
+        "page": random.randint(1, 35),
+        "value": random.choice(risikoProfile),
+        "key": "Risiko",
+    }


 @mcp.tool()
 def getFromChatGPTSingle(value: str) -> dict:
     """This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
-    return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value}
+    return {
+        "page": random.randint(1, 35),
+        "value": random.choice(risikoProfile),
+        "key": value,
+    }


 context = ""


 @mcp.tool()
 def getContext() -> str:
     """This tool gets context information."""
     return context


 @mcp.tool()
 def setContext(value: str) -> None:
     """This tool sets context information."""
     global context
     context = value


 # Add a dynamic greeting resource
 @mcp.resource("greeting://{name}")
 def get_greeting(name: str) -> str:
     """Get a personalized greeting"""
     return f"Hello, {name}!"


 """ Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """


 @mcp.tool()
 def validate_entity(entity: str, label: str) -> dict:
     """Returns if the entity is valid based on hardcoded rules."""
@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict:
         return {"status": "valid", "entity": entity}
     return {"status": "invalid", "entity": entity}


 """ Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """


 @mcp.tool()
-def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]:
+def merge_spacy_exxeta(
+    spacy_result: list[dict], exxeta_result: list[dict]
+) -> list[dict]:
     """Merge two results, mark as validated if label/entity/page match."""
-    def norm(e): return e["entity"].lower().replace(" ", "")
+
+    def norm(e):
+        return e["entity"].lower().replace(" ", "")

     merged = []
     seen = set()
@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
     for s in spacy_result:
         s_norm = norm(s)
         s_page = s["page"]
-        match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None)
+        match = next(
+            (
+                e
+                for e in exxeta_result
+                if e["label"] == s["label"]
+                and norm(e) == s_norm
+                and e["page"] == s_page
+            ),
+            None,
+        )
         if match:
             merged.append({**s, "status": "validated"})
             seen.add((match["entity"], match["page"]))
@ -12,10 +12,12 @@ app = Flask(__name__)
 UPLOAD_FOLDER = Path("pitchbooks")
 UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

+
 @app.route("/")
 def home():
     return "Backend is running!"

+
 @app.route("/upload", methods=["POST"])
 def upload():
     file = request.files.get("file")

@ -44,5 +46,6 @@ def upload():

     return "status: complete\n"

+
 if __name__ == "__main__":
     app.run(debug=True)
@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo"
 OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
 OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

+
 def extract_with_exxeta(pages_json):
     results = []
@ -19,15 +20,15 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
||||||
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
'- "Core, Core+" → entity: "Core, Core+"\n'
|
||||||
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
'- "Core/Core+" → entity: "Core/Core+"\n'
|
||||||
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
|
|
@ -35,16 +36,19 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,16 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""
|
return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else ""
|
||||||
|
|
||||||
|
|
||||||
def load_json(path: Path):
|
def load_json(path: Path):
|
||||||
with path.open("r", encoding="utf-8") as f:
|
with path.open("r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(filter_label=None):
|
def merge_and_validate_entities(filter_label=None):
|
||||||
base = Path(__file__).resolve().parent.parent
|
base = Path(__file__).resolve().parent.parent
|
||||||
spacy_path = base / "spacy_service/output/spacy-results.json"
|
spacy_path = base / "spacy_service/output/spacy-results.json"
|
||||||
|
|
@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None):
|
||||||
s_page = s["page"]
|
s_page = s["page"]
|
||||||
|
|
||||||
match = next(
|
match = next(
|
||||||
(e for e in exxeta_data
|
(
|
||||||
if e["label"] == s["label"] and
|
e
|
||||||
normalize_entity(e["entity"]) == s_norm and
|
for e in exxeta_data
|
||||||
e["page"] == s_page),
|
if e["label"] == s["label"]
|
||||||
None
|
and normalize_entity(e["entity"]) == s_norm
|
||||||
|
and e["page"] == s_page
|
||||||
|
),
|
||||||
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent
|
||||||
OUTPUT_FOLDER = BASE_DIR / "output"
|
OUTPUT_FOLDER = BASE_DIR / "output"
|
||||||
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def run_ocr_and_extract(pdf_path: str):
|
def run_ocr_and_extract(pdf_path: str):
|
||||||
pdf_path = Path(pdf_path)
|
pdf_path = Path(pdf_path)
|
||||||
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
||||||
|
|
@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type", "pdfa",
|
"--output-type",
|
||||||
"--language", "deu+eng",
|
"pdfa",
|
||||||
|
"--language",
|
||||||
|
"deu+eng",
|
||||||
str(pdf_path),
|
str(pdf_path),
|
||||||
str(output_pdf)
|
str(output_pdf),
|
||||||
]
|
]
|
||||||
|
|
||||||
result = subprocess.run(cmd, capture_output=True)
|
result = subprocess.run(cmd, capture_output=True)
|
||||||
|
|
@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
||||||
|
|
||||||
with pdfplumber.open(output_pdf) as pdf:
|
with pdfplumber.open(output_pdf) as pdf:
|
||||||
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
pages = [
|
||||||
|
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
||||||
|
for i, page in enumerate(pdf.pages)
|
||||||
|
]
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
return {
|
return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)}
|
||||||
"ocr_pdf": str(output_pdf),
|
|
||||||
"json_path": str(json_path)
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
||||||
nlp = spacy.load(model_path)
|
nlp = spacy.load(model_path)
|
||||||
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
|
input_pdf_path = (
|
||||||
|
Path(__file__).resolve().parent
|
||||||
|
/ ".."
|
||||||
|
/ "ocr_pdf_service"
|
||||||
|
/ "output"
|
||||||
|
/ "pitchbook-OCR.pdf"
|
||||||
|
)
|
||||||
input_pdf = Path(input_pdf_path)
|
input_pdf = Path(input_pdf_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,11 +31,7 @@ def extract_with_spacy(pages_json):
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
results.append({
|
results.append({"label": ent.label_, "entity": ent.text, "page": page_num})
|
||||||
"label": ent.label_,
|
|
||||||
"entity": ent.text,
|
|
||||||
"page": page_num
|
|
||||||
})
|
|
||||||
|
|
||||||
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY,
|
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=BASE_URL
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,10 +40,7 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{"role": "system", "content": "Always respond with a valid JSON object"},
|
||||||
"role": "system",
|
|
||||||
"content": "Always respond with a valid JSON object"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -71,11 +68,12 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:""" + pdf_text
|
Here ist the text:"""
|
||||||
}
|
+ pdf_text,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"}
|
response_format={"type": "json_object"},
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -86,5 +84,4 @@ response = client.chat.completions.create(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY,
|
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=BASE_URL
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,10 +40,7 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{"role": "system", "content": "Always respond with a valid JSON object"},
|
||||||
"role": "system",
|
|
||||||
"content": "Always respond with a valid JSON object"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -71,11 +68,12 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:""" + pdf_text
|
Here ist the text:"""
|
||||||
}
|
+ pdf_text,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"}
|
response_format={"type": "json_object"},
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -86,5 +84,4 @@ response = client.chat.completions.create(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,2 @@
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}

@ -0,0 +1,17 @@
import streamlit as st
import json

st.title("Neue Kennzahl annotieren")

text = st.text_area("Text", "Das geplante Projektvolumen beträgt 120 Mio. €.")
start = st.number_input("Start-Position", min_value=0, max_value=len(text), value=28)
end = st.number_input("End-Position", min_value=0, max_value=len(text), value=44)
label = st.text_input("Label (z. B. KENNZAHL)", "KENNZAHL")

if st.button("Speichern"):
    example = {"text": text, "entities": [[start, end, label]]}

    with open("annotated_data.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")

    st.success("✅ Annotation gespeichert!")
@ -15,11 +15,9 @@ for page_number in range(len(doc)):
|
||||||
text = page.get_text()
|
text = page.get_text()
|
||||||
spacy_doc = nlp(text)
|
spacy_doc = nlp(text)
|
||||||
for ent in spacy_doc.ents:
|
for ent in spacy_doc.ents:
|
||||||
results.append({
|
results.append(
|
||||||
"label": ent.label_,
|
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1}
|
||||||
"entity": ent.text.strip(),
|
)
|
||||||
"page": page_number + 1
|
|
||||||
})
|
|
||||||
|
|
||||||
with open("entities_output.json", "w", encoding="utf-8") as f:
|
with open("entities_output.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
|
||||||
|
|
@ -226,9 +226,5 @@ TRAINING_DATA = [
|
||||||
(
|
(
|
||||||
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
||||||
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
||||||
)
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA):
|
||||||
for start, end, label in annot["entities"]:
|
for start, end, label in annot["entities"]:
|
||||||
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
||||||
if span is None:
|
if span is None:
|
||||||
print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
print(
|
||||||
|
f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
ents.append(span)
|
ents.append(span)
|
||||||
print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
print(
|
||||||
|
f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
||||||
|
)
|
||||||
# label the text with the ents
|
# label the text with the ents
|
||||||
doc.ents = ents
|
doc.ents = ents
|
||||||
db.add(doc)
|
db.add(doc)
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,9 @@ class Server:
|
||||||
server_params = StdioServerParameters(
|
server_params = StdioServerParameters(
|
||||||
command=command,
|
command=command,
|
||||||
args=self.config["args"],
|
args=self.config["args"],
|
||||||
env={**os.environ, **self.config["env"]}
|
env=(
|
||||||
if self.config.get("env")
|
{**os.environ, **self.config["env"]} if self.config.get("env") else None
|
||||||
else None,
|
),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
stdio_transport = await self.exit_stack.enter_async_context(
|
stdio_transport = await self.exit_stack.enter_async_context(
|
||||||
|
|
@ -244,15 +244,10 @@ class LLMClient:
|
||||||
formatted_messages = []
|
formatted_messages = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
print(msg)
|
print(msg)
|
||||||
formatted_messages.append({
|
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
|
||||||
"role": msg["role"],
|
|
||||||
"content": msg["content"]
|
|
||||||
})
|
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=self.api_key,
|
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=url
|
|
||||||
)
|
)
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=formatted_messages,
|
messages=formatted_messages,
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
# server.py
|
# server.py
|
||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
# Create an MCP server
|
# Create an MCP server
|
||||||
mcp = FastMCP("Demo")
|
mcp = FastMCP("Demo")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities():
|
def load_spacy_entities():
|
||||||
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def get_page_text(pages, page_number):
|
def get_page_text(pages, page_number):
|
||||||
for page in pages:
|
for page in pages:
|
||||||
if page.get("page") == page_number:
|
if page.get("page") == page_number:
|
||||||
return page.get("text", "")
|
return page.get("text", "")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity):
|
def normalize_entity(entity):
|
||||||
return ' '.join(entity.replace('\n', ' ').split())
|
return " ".join(entity.replace("\n", " ").split())
|
||||||
|
|
||||||
|
|
||||||
def validate_entity_with_exxeta(entity, page_num, text):
|
def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
prompt = (
|
prompt = (
|
||||||
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
||||||
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
||||||
f"Ziel-Formulierung:\n"
|
f"Ziel-Formulierung:\n"
|
||||||
f"\"{entity}\"\n\n"
|
f'"{entity}"\n\n'
|
||||||
f"Validierungsregeln:\n"
|
f"Validierungsregeln:\n"
|
||||||
f"- Groß- und Kleinschreibung ignorieren.\n"
|
f"- Groß- und Kleinschreibung ignorieren.\n"
|
||||||
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
||||||
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
||||||
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
||||||
f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
|
f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n'
|
||||||
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
||||||
f"OCR-Text auf Seite {page_num}:\n{text}"
|
f"OCR-Text auf Seite {page_num}:\n{text}"
|
||||||
)
|
)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities()
|
spacy_entities = load_spacy_entities()
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -81,17 +90,20 @@ def run():
|
||||||
page_text = get_page_text(pitchbook_pages, page)
|
page_text = get_page_text(pitchbook_pages, page)
|
||||||
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
||||||
|
|
||||||
validated_results.append({
|
validated_results.append(
|
||||||
|
{
|
||||||
"label": entity_data.get("label"),
|
"label": entity_data.get("label"),
|
||||||
"entity": raw_entity,
|
"entity": raw_entity,
|
||||||
"page": page,
|
"page": page,
|
||||||
"validated": is_valid
|
"validated": is_valid,
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
||||||
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -10,19 +10,23 @@ KPI_SERVICE_MAP = {
|
||||||
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_exxeta_entities(path):
|
def load_exxeta_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def normalize(text):
|
def normalize(text):
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
return text.strip().lower().replace(" ", "").replace("/", "/")
|
return text.strip().lower().replace(" ", "").replace("/", "/")
|
||||||
|
|
||||||
|
|
||||||
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if se_entity == ee_entity:
|
if se_entity == ee_entity:
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "validated"
|
"validation_status": "validated",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
matched = True
|
matched = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not matched:
|
if not matched:
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "spacy-only"
|
"validation_status": "spacy-only",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": ee["entity"],
|
"entity": ee["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "exxeta-only"
|
"validation_status": "exxeta-only",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def save_results(results, filename):
|
def save_results(results, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
||||||
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
||||||
|
|
@ -96,5 +108,6 @@ def run():
|
||||||
save_results(all_results, "mcp_validated_result.json")
|
save_results(all_results, "mcp_validated_result.json")
|
||||||
print("✅ Validation complete! Output: mcp_validated_result.json")
|
print("✅ Validation complete! Output: mcp_validated_result.json")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import json
|
||||||
|
|
||||||
MODEL = "gpt-35-turbo"
|
MODEL = "gpt-35-turbo"
|
||||||
|
|
||||||
|
|
||||||
def extract_risikoprofil_from_exxeta(pages_json):
|
def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -16,33 +17,35 @@ def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
||||||
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
'- "Core, Core+" → entity: "Core, Core+"\n'
|
||||||
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
'- "Core/Core+" → entity: "Core/Core+"\n'
|
||||||
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
if not entity_str:
|
if not entity_str:
|
||||||
return ""
|
return ""
|
||||||
normalized = entity_str.replace('\n', ' ')
|
normalized = entity_str.replace("\n", " ")
|
||||||
normalized = ''.join(normalized.lower().split())
|
normalized = "".join(normalized.lower().split())
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(spacy_data, exxeta_data):
|
def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
merged = []
|
merged = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
e_page = e["page"]
|
e_page = e["page"]
|
||||||
|
|
||||||
# Match if normalized entity and page match
|
# Match if normalized entity and page match
|
||||||
if (s["label"] == e["label"] and
|
if (
|
||||||
s_entity_norm == e_entity_norm and
|
s["label"] == e["label"]
|
||||||
s_page == e_page):
|
and s_entity_norm == e_entity_norm
|
||||||
|
and s_page == e_page
|
||||||
|
):
|
||||||
|
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "validated"
|
"status": "validated",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
seen.add((e["entity"], e_page))
|
seen.add((e["entity"], e_page))
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no match found, add as single-source
|
# If no match found, add as single-source
|
||||||
if not found:
|
if not found:
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "spacy"
|
"source": "spacy",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Add remaining Exxeta entities not already processed
|
# Add remaining Exxeta entities not already processed
|
||||||
for e in exxeta_data:
|
for e in exxeta_data:
|
||||||
if (e["entity"], e["page"]) not in seen:
|
if (e["entity"], e["page"]) not in seen:
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": e["label"],
|
"label": e["label"],
|
||||||
"entity": e["entity"],
|
"entity": e["entity"],
|
||||||
"page": e["page"],
|
"page": e["page"],
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "exxeta"
|
"source": "exxeta",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities
|
||||||
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
path = Path(PITCHBOOK_PATH)
|
path = Path(PITCHBOOK_PATH)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def save_json(data, filename):
|
def save_json(data, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def sort_by_page_number(entities):
|
def sort_by_page_number(entities):
|
||||||
return sorted(entities, key=lambda x: x.get("page", 0))
|
return sorted(entities, key=lambda x: x.get("page", 0))
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_PATH)
|
spacy_entities = load_spacy_entities(SPACY_PATH)
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -33,5 +37,6 @@ def run():
|
||||||
print("- merged_result.json")
|
print("- merged_result.json")
|
||||||
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -11,15 +11,20 @@ log_folder = Path("logs")
|
||||||
for folder in [output_folder, log_folder]:
|
for folder in [output_folder, log_folder]:
|
||||||
folder.mkdir(parents=True, exist_ok=True)
|
folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_json(pdf_path: Path):
|
def extract_text_to_json(pdf_path: Path):
|
||||||
json_path = output_folder / f"{pdf_path.stem}.json"
|
json_path = output_folder / f"{pdf_path.stem}.json"
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
pages = [
|
||||||
|
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
||||||
|
for i, page in enumerate(pdf.pages)
|
||||||
|
]
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
print(f"📄 Text JSON saved: {json_path.name}")
|
print(f"📄 Text JSON saved: {json_path.name}")
|
||||||
|
|
||||||
|
|
||||||
def ocr_pdf(input_file: Path):
|
def ocr_pdf(input_file: Path):
|
||||||
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
||||||
log_file = log_folder / f"{input_file.stem}.log"
|
log_file = log_folder / f"{input_file.stem}.log"
|
||||||
|
|
@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type", "pdfa",
|
"--output-type",
|
||||||
"--language", "deu+eng",
|
"pdfa",
|
||||||
"--sidecar", str(sidecar_txt),
|
"--language",
|
||||||
|
"deu+eng",
|
||||||
|
"--sidecar",
|
||||||
|
str(sidecar_txt),
|
||||||
str(input_file),
|
str(input_file),
|
||||||
str(output_file)
|
str(output_file),
|
||||||
]
|
]
|
||||||
|
|
||||||
with open(log_file, "w") as log:
|
with open(log_file, "w") as log:
|
||||||
|
|
@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path):
|
||||||
else:
|
else:
|
||||||
print(f"❌ OCR failed. See log: {log_file}")
|
print(f"❌ OCR failed. See log: {log_file}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if not input_folder.exists():
|
if not input_folder.exists():
|
||||||
print("Input folder does not exist!")
|
print("Input folder does not exist!")
|
||||||
|
|
|
||||||
|
|
@ -34,14 +34,14 @@ for ent in doc_ner.ents:
|
||||||
break
|
break
|
||||||
|
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
ner_text_results.append({
|
ner_text_results.append(
|
||||||
"label": ent.label_,
|
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number}
|
||||||
"entity": ent.text.strip(),
|
)
|
||||||
"page": page_number
|
|
||||||
})
|
|
||||||
|
|
||||||
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
||||||
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
|
(output_dir / "ner_text.json").write_text(
|
||||||
|
json.dumps(ner_text_results, indent=2, ensure_ascii=False)
|
||||||
|
)
|
||||||
|
|
||||||
# 2. NER on table cells
|
# 2. NER on table cells
|
||||||
table_ner_results = []
|
table_ner_results = []
|
||||||
|
|
@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1):
|
||||||
doc_cell = nlp(cell)
|
doc_cell = nlp(cell)
|
||||||
for ent in doc_cell.ents:
|
for ent in doc_cell.ents:
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
table_ner_results.append({
|
table_ner_results.append(
|
||||||
|
{
|
||||||
"label": ent.label_,
|
"label": ent.label_,
|
||||||
"entity": ent.text.strip(),
|
"entity": ent.text.strip(),
|
||||||
"page": page_number,
|
"page": page_number,
|
||||||
"table": i
|
"table": i,
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
||||||
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
|
(output_dir / "ner_tables.json").write_text(
|
||||||
|
json.dumps(table_ner_results, indent=2, ensure_ascii=False)
|
||||||
|
)
|
||||||
|
|
||||||
print("✅ Done! Extracted data saved to /output")
|
print("✅ Done! Extracted data saved to /output")
|
||||||