From cd6c66a1fdc40ffa2fbff3ac2f2986e0fb7677f9 Mon Sep 17 00:00:00 2001 From: Abdulraahman Dabbagh <1924466@stud.hs-mannheim.de> Date: Sun, 25 May 2025 10:47:16 +0200 Subject: [PATCH] Set up Flask backend (Ticket #4) --- .pre-commit-config.yaml | 2 + prototypes/PDFMiner/test.py | 6 +- prototypes/PyMuPdf/PyMuPdf_st.py | 10 +-- prototypes/PyMuPdf/prototype.py | 10 +-- prototypes/arc1_prototype/client.py | 49 +++++++------ prototypes/arc1_prototype/server.py | 68 ++++++++++++++++--- prototypes/arc2_prototype/app.py | 5 +- .../arc2_prototype/exxeta_service/config.py | 2 +- .../exxeta_service/exxeta_client.py | 40 ++++++----- .../merge_validate_service/validator.py | 18 +++-- .../ocr_pdf_service/ocr_runner.py | 19 +++--- .../spacy_service/spacy_extractor.py | 16 +++-- prototypes/exxeta/index.py | 49 +++++++------ prototypes/exxetaGPT/index.py | 49 +++++++------ .../fine_tuning_spaCy/convert_to_spacy.py | 4 +- prototypes/fine_tuning_spaCy/test_model.py | 10 ++- prototypes/fine_tuning_spaCy/training_data.py | 56 +++++++-------- .../fine_tuning_spaCy/training_model.py | 8 ++- prototypes/first-mcp-python/client.py | 37 +++++----- prototypes/first-mcp-python/main.py | 1 + prototypes/mcp_validate-arc1/config.py | 2 +- .../mcp_spacy_validate_with_exxeta.py | 40 +++++++---- prototypes/mcp_validate-arc1/mcp_validate.py | 49 ++++++++----- prototypes/merge_validate-arc2/config.py | 2 +- prototypes/merge_validate-arc2/exxeta_api.py | 41 +++++------ prototypes/merge_validate-arc2/merge_logic.py | 61 ++++++++++------- .../merge_validate-arc2/merge_validate.py | 7 +- .../merge_validate-arc2/spacy_extract.py | 3 +- prototypes/ocr/ocr.py | 21 ++++-- prototypes/pdfplumber/tabellentext_holen.py | 4 +- prototypes/spacy-layout/extract_pitchbooks.py | 34 ++++++---- 31 files changed, 416 insertions(+), 307 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9fcd752..dd98e70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,10 @@ repos: hooks: - id: black language_version: python3 + files: ^project/backend/ - repo: https://github.com/pycqa/flake8 rev: 6.1.0 hooks: - id: flake8 + files: ^project/backend/ diff --git a/prototypes/PDFMiner/test.py b/prototypes/PDFMiner/test.py index 148046a..7db4ada 100644 --- a/prototypes/PDFMiner/test.py +++ b/prototypes/PDFMiner/test.py @@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.converter import PDFPageAggregator -fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb') +fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb") rsrcmgr = PDFResourceManager() laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) @@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device) pages = PDFPage.get_pages(fp) for page in pages: - print('Processing next page...') + print("Processing next page...") interpreter.process_page(page) layout = device.get_result() for lobj in layout: if isinstance(lobj, LTTextBox): x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text() - print('At %r is text: %s' % ((x, y), text)) + print("At %r is text: %s" % ((x, y), text)) diff --git a/prototypes/PyMuPdf/PyMuPdf_st.py b/prototypes/PyMuPdf/PyMuPdf_st.py index 3af7af1..6da9cea 100644 --- a/prototypes/PyMuPdf/PyMuPdf_st.py +++ b/prototypes/PyMuPdf/PyMuPdf_st.py @@ -1,5 +1,5 @@ ######################################################### -#Run: in Terminal -> streamlit run PyMuPdf_st.py +# Run: in Terminal -> streamlit run PyMuPdf_st.py
######################################################### import streamlit as st @@ -28,18 +28,14 @@ if uploaded_file and suchwort: rects = page.search_for(suchwort) for rect in rects: - fundstellen.append({ - "seite": page_num, - "rect": rect - }) + fundstellen.append({"seite": page_num, "rect": rect}) if fundstellen: st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.") # Auswahl der Fundstelle auswahl = st.selectbox( - "Fundstelle auswählen:", - [f"Seite {f['seite'] + 1}" for f in fundstellen] + "Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen] ) index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl) diff --git a/prototypes/PyMuPdf/prototype.py b/prototypes/PyMuPdf/prototype.py index 138f80c..c268f4e 100644 --- a/prototypes/PyMuPdf/prototype.py +++ b/prototypes/PyMuPdf/prototype.py @@ -38,7 +38,9 @@ for eintrag in kennzahlen: highlight = page.add_highlight_annot(rect) highlight.update() else: - st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)") + st.warning( + f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)" + ) except Exception as e: st.error(f" Fehler bei Eintrag {eintrag}: {e}") @@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1)) # PDF anzeigen mit Scroll zu aktueller Seite st.subheader(f"Vorschau") with open(highlighted_path, "rb") as f: - base64_pdf = base64.b64encode(f.read()).decode('utf-8') + base64_pdf = base64.b64encode(f.read()).decode("utf-8") # Seite direkt ansteuern -pdf_display = f''' +pdf_display = f""" -''' +""" st.markdown(pdf_display, unsafe_allow_html=True) diff --git a/prototypes/arc1_prototype/client.py b/prototypes/arc1_prototype/client.py index 519d514..b3ae21b 100644 --- a/prototypes/arc1_prototype/client.py +++ b/prototypes/arc1_prototype/client.py @@ -87,9 +87,9 @@ class Server: server_params = StdioServerParameters( command=command, args=self.config["args"], - env={**os.environ, **self.config["env"]} - if self.config.get("env") - else None, + env=( + {**os.environ, **self.config["env"]} if self.config.get("env") else None + ), ) try: stdio_transport = await self.exit_stack.enter_async_context( @@ -244,28 +244,23 @@ class LLMClient: formatted_messages = [] for msg in messages: # print(msg) - formatted_messages.append({ - "role": msg["role"], - "content": msg["content"] - }) + formatted_messages.append({"role": msg["role"], "content": msg["content"]}) client = AzureOpenAI( - api_key=self.api_key, - api_version="2023-07-01-preview", - base_url=url + api_key=self.api_key, api_version="2023-07-01-preview", base_url=url ) response = client.chat.completions.create( - messages=formatted_messages, - model="gpt-4o-mini", - # response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) + messages=formatted_messages, + model="gpt-4o-mini", + # response_format={"type": "json_object"} + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False + ) if response.choices[0].message.content: # print("response: " + response.choices[0].message.content) return response.choices[0].message.content @@ -412,12 +407,16 @@ class ChatSession: "4. Use appropriate context from the user's question\n" "5. Avoid simply repeating the raw data\n\n" "Please use only the tools that are explicitly defined above." 
- ) messages = [{"role": "system", "content": system_message}] - messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting." - "For each value there is only one correct answer, try to find it with the tools provided."}) + messages.append( + { + "role": "assistant", + "content": "You have to extract data from pdf files and have different tools for extracting." + "For each value there is only one correct answer, try to find it with the tools provided.", + } + ) while True: try: @@ -455,7 +454,6 @@ class ChatSession: # messages.append({"role": "assistant", "content": llm_response}) # logging.info("\nFinal response: %s", llm_response) - except KeyboardInterrupt: logging.info("\nExiting...") break @@ -476,5 +474,6 @@ async def main() -> None: chat_session = ChatSession(servers, llm_client) await chat_session.start() + if __name__ == "__main__": asyncio.run(main()) diff --git a/prototypes/arc1_prototype/server.py b/prototypes/arc1_prototype/server.py index a588659..1788914 100644 --- a/prototypes/arc1_prototype/server.py +++ b/prototypes/arc1_prototype/server.py @@ -8,54 +8,86 @@ mcp = FastMCP("Demo") risikoProfile = ["Core/Core+, Core", "Value Add"] risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"] + # Add an addition tool @mcp.tool() def add(a: int, b: int) -> int: """Add two numbers""" return a + b + @mcp.tool() def getFromSpaCy() -> list: """Get data from SpaCy""" - return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"}, - {"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}] + return [ + { + "page": random.randint(1, 35), + "value": random.choice(risikoProfileSpacy), + "key": "Risiko", + }, + {"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"}, + ] + @mcp.tool() def getFromChatGPT() -> list: """Get data from ChatGPT""" - return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}, - {"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}] + return [ + { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": "Risiko", + }, + {"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"}, + ] + @mcp.tool() def checkSpacyResult() -> dict: """This tool checks the result of SpaCy, ensuring it meets certain criteria.""" - return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"} + return { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": "Risiko", + } + @mcp.tool() def getFromChatGPTSingle(value: str) -> dict: """This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated""" - return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value} + return { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": value, + } + context = "" + @mcp.tool() def getContext() -> str: """This tool gets context information.""" return context + @mcp.tool() def setContext(value: str) -> None: """This tool sets context information.""" global context context = value + # Add a dynamic greeting resource @mcp.resource("greeting://{name}") def get_greeting(name: str) -> str: """Get a personalized greeting""" return f"Hello, {name}!" + """ Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. 
""" + + @mcp.tool() def validate_entity(entity: str, label: str) -> dict: """Returns if the entity is valid based on hardcoded rules.""" @@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict: return {"status": "valid", "entity": entity} return {"status": "invalid", "entity": entity} + """ Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """ + + @mcp.tool() -def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]: +def merge_spacy_exxeta( + spacy_result: list[dict], exxeta_result: list[dict] +) -> list[dict]: """Merge two results, mark as validated if label/entity/page match.""" - def norm(e): return e["entity"].lower().replace(" ", "") + + def norm(e): + return e["entity"].lower().replace(" ", "") merged = [] seen = set() @@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l for s in spacy_result: s_norm = norm(s) s_page = s["page"] - match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None) + match = next( + ( + e + for e in exxeta_result + if e["label"] == s["label"] + and norm(e) == s_norm + and e["page"] == s_page + ), + None, + ) if match: merged.append({**s, "status": "validated"}) seen.add((match["entity"], match["page"])) @@ -88,4 +136,4 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l for e in exxeta_result: if (e["entity"], e["page"]) not in seen: merged.append({**e, "status": "exxeta_only"}) - return merged \ No newline at end of file + return merged diff --git a/prototypes/arc2_prototype/app.py b/prototypes/arc2_prototype/app.py index 0f73b37..1aa0707 100644 --- a/prototypes/arc2_prototype/app.py +++ b/prototypes/arc2_prototype/app.py @@ -12,10 +12,12 @@ app = Flask(__name__) UPLOAD_FOLDER = Path("pitchbooks") UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) + @app.route("/") def home(): return "Backend is running!" 
+ @app.route("/upload", methods=["POST"]) def upload(): file = request.files.get("file") @@ -44,5 +46,6 @@ def upload(): return "status: complete\n" + if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/prototypes/arc2_prototype/exxeta_service/config.py b/prototypes/arc2_prototype/exxeta_service/config.py index 7085986..df5494f 100644 --- a/prototypes/arc2_prototype/exxeta_service/config.py +++ b/prototypes/arc2_prototype/exxeta_service/config.py @@ -1,2 +1,2 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" -EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" \ No newline at end of file +EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" diff --git a/prototypes/arc2_prototype/exxeta_service/exxeta_client.py b/prototypes/arc2_prototype/exxeta_service/exxeta_client.py index a66832d..7a1c74e 100644 --- a/prototypes/arc2_prototype/exxeta_service/exxeta_client.py +++ b/prototypes/arc2_prototype/exxeta_service/exxeta_client.py @@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo" OUTPUT_FOLDER = Path(__file__).resolve().parent / "output" OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) + def extract_with_exxeta(pages_json): results = [] @@ -18,33 +19,36 @@ def extract_with_exxeta(pages_json): continue prompt = ( - "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" - "Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n" - "Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, " - "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" - "Beispiele:\n" - "- \"Core, Core+\" → entity: \"Core, Core+\"\n" - "- \"Core/Core+\" → entity: \"Core/Core+\"\n" - "- \"Core and Core+\" → entity: \"Core and Core+\"\n\n" - "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" - f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n" - "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" - "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" - "TEXT:\n" + text + "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" + 'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n' + 'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, ' + "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" + "Beispiele:\n" + '- "Core, Core+" → entity: "Core, Core+"\n' + '- "Core/Core+" → entity: "Core/Core+"\n' + '- "Core and Core+" → entity: "Core and Core+"\n\n' + "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" + f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n' + "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" + "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" + "TEXT:\n" + text ) headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - 
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -77,4 +81,4 @@ def extract_with_exxeta(pages_json): with open(out_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) - return results \ No newline at end of file + return results diff --git a/prototypes/arc2_prototype/merge_validate_service/validator.py b/prototypes/arc2_prototype/merge_validate_service/validator.py index f5045e4..e355dff 100644 --- a/prototypes/arc2_prototype/merge_validate_service/validator.py +++ b/prototypes/arc2_prototype/merge_validate_service/validator.py @@ -1,13 +1,16 @@ from pathlib import Path import json + def normalize_entity(entity_str): - return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else "" + return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else "" + def load_json(path: Path): with path.open("r", encoding="utf-8") as f: return json.load(f) + def merge_and_validate_entities(filter_label=None): base = Path(__file__).resolve().parent.parent spacy_path = base / "spacy_service/output/spacy-results.json" @@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None): s_page = s["page"] match = next( - (e for e in exxeta_data - if e["label"] == s["label"] and - normalize_entity(e["entity"]) == s_norm and - e["page"] == s_page), - None + ( + e + for e in exxeta_data + if e["label"] == s["label"] + and normalize_entity(e["entity"]) == s_norm + and e["page"] == s_page + ), + None, ) if match: diff --git a/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py b/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py index 0db3dc0..c0c8729 100644 --- a/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py +++ b/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py @@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent OUTPUT_FOLDER = BASE_DIR / "output" OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) + def run_ocr_and_extract(pdf_path: str): pdf_path = Path(pdf_path) output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf" @@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str): cmd = [ "ocrmypdf", "--force-ocr", - "--output-type", "pdfa", - "--language", "deu+eng", + "--output-type", + "pdfa", + "--language", + "deu+eng", str(pdf_path), - str(output_pdf) + str(output_pdf), ] result = subprocess.run(cmd, capture_output=True) @@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str): raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}") with pdfplumber.open(output_pdf) as pdf: - pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)] + pages = [ + {"page": i + 1, "text": (page.extract_text() or "").strip()} + for i, page in enumerate(pdf.pages) + ] with open(json_path, "w", encoding="utf-8") as f: json.dump(pages, f, indent=2, ensure_ascii=False) - return { - "ocr_pdf": str(output_pdf), - "json_path": str(json_path) - } \ No newline at end of file + return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)} diff --git a/prototypes/arc2_prototype/spacy_service/spacy_extractor.py b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py index 
28d5a34..bf0bd6e 100644 --- a/prototypes/arc2_prototype/spacy_service/spacy_extractor.py +++ b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py @@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) model_path = os.path.join(os.path.dirname(__file__), "models", "model-last") nlp = spacy.load(model_path) -input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf" +input_pdf_path = ( + Path(__file__).resolve().parent + / ".." + / "ocr_pdf_service" + / "output" + / "pitchbook-OCR.pdf" +) input_pdf = Path(input_pdf_path) doc = fitz.open(input_pdf) @@ -26,14 +32,10 @@ def extract_with_spacy(pages_json): doc = nlp(text) for ent in doc.ents: - results.append({ - "label": ent.label_, - "entity": ent.text, - "page": page_num - }) + results.append({"label": ent.label_, "entity": ent.text, "page": page_num}) output_path = OUTPUT_FOLDER / f"spacy-results.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) - return results \ No newline at end of file + return results diff --git a/prototypes/exxeta/index.py b/prototypes/exxeta/index.py index 32e0cd7..c1a023e 100644 --- a/prototypes/exxeta/index.py +++ b/prototypes/exxeta/index.py @@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" API_KEY = os.getenv("API_KEY") client = AzureOpenAI( - api_key=API_KEY, - api_version="2023-07-01-preview", - base_url=BASE_URL - ) + api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL +) + + def extract_text_from_pdf(file_path): """Extract text content from a PDF file using PyMuPDF (fitz).""" all_text = "" @@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf" pdf_text = extract_text_from_pdf(file_path) response = client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "Always respond with a valid JSON object" - }, - { - "role": "user", - "content": """extract the values from the text. let not found values empty: + messages=[ + {"role": "system", "content": "Always respond with a valid JSON object"}, + { + "role": "user", + "content": """extract the values from the text. 
let not found values empty: -Fondsname -Fondsmanager -Name Kapitalverwaltungsgesellschaft @@ -71,20 +68,20 @@ response = client.chat.completions.create( - the page where this value was found - a confidence score, how confident the model is about the value (low, medium, high) - Here ist the text:""" + pdf_text - } - ], - model="gpt-4o-mini", - response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) - + Here ist the text:""" + + pdf_text, + }, + ], + model="gpt-4o-mini", + response_format={"type": "json_object"}, + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False +) print(response.choices[0].message.content) diff --git a/prototypes/exxetaGPT/index.py b/prototypes/exxetaGPT/index.py index 32e0cd7..c1a023e 100644 --- a/prototypes/exxetaGPT/index.py +++ b/prototypes/exxetaGPT/index.py @@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" API_KEY = os.getenv("API_KEY") client = AzureOpenAI( - api_key=API_KEY, - api_version="2023-07-01-preview", - base_url=BASE_URL - ) + api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL +) + + def extract_text_from_pdf(file_path): """Extract text content from a PDF file using PyMuPDF (fitz).""" all_text = "" @@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf" pdf_text = extract_text_from_pdf(file_path) response = client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "Always respond with a valid JSON object" - }, - { - "role": "user", - "content": """extract the values from the text. let not found values empty: + messages=[ + {"role": "system", "content": "Always respond with a valid JSON object"}, + { + "role": "user", + "content": """extract the values from the text. 
let not found values empty: -Fondsname -Fondsmanager -Name Kapitalverwaltungsgesellschaft @@ -71,20 +68,20 @@ response = client.chat.completions.create( - the page where this value was found - a confidence score, how confident the model is about the value (low, medium, high) - Here ist the text:""" + pdf_text - } - ], - model="gpt-4o-mini", - response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) - + Here ist the text:""" + + pdf_text, + }, + ], + model="gpt-4o-mini", + response_format={"type": "json_object"}, + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False +) print(response.choices[0].message.content) diff --git a/prototypes/fine_tuning_spaCy/convert_to_spacy.py b/prototypes/fine_tuning_spaCy/convert_to_spacy.py index 8e94729..25e74b3 100644 --- a/prototypes/fine_tuning_spaCy/convert_to_spacy.py +++ b/prototypes/fine_tuning_spaCy/convert_to_spacy.py @@ -2,7 +2,7 @@ import spacy from spacy.tokens import DocBin from training_data import TRAINING_DATA -nlp = spacy.blank("de") +nlp = spacy.blank("de") doc_bin = DocBin() for text, annotations in TRAINING_DATA: @@ -17,4 +17,4 @@ for text, annotations in TRAINING_DATA: doc.ents = ents doc_bin.add(doc) -doc_bin.to_disk("data/train.spacy") \ No newline at end of file +doc_bin.to_disk("data/train.spacy") diff --git a/prototypes/fine_tuning_spaCy/test_model.py b/prototypes/fine_tuning_spaCy/test_model.py index 37ff4a4..7b51586 100644 --- a/prototypes/fine_tuning_spaCy/test_model.py +++ b/prototypes/fine_tuning_spaCy/test_model.py @@ -15,13 +15,11 @@ for page_number in range(len(doc)): text = page.get_text() spacy_doc = nlp(text) for ent in spacy_doc.ents: - results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number + 1 - }) + results.append( + {"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1} + ) with open("entities_output.json", "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) -print("✅ Extraction completed. Results saved to 'entities_output.json'") \ No newline at end of file +print("✅ Extraction completed. 
Results saved to 'entities_output.json'") diff --git a/prototypes/fine_tuning_spaCy/training_data.py b/prototypes/fine_tuning_spaCy/training_data.py index f51aebd..6e4b6be 100644 --- a/prototypes/fine_tuning_spaCy/training_data.py +++ b/prototypes/fine_tuning_spaCy/training_data.py @@ -71,33 +71,33 @@ TRAINING_DATA = [ "core, core+, value-added", {"entities": [[0, 24, "RISIKOPROFIL"]]}, ), - ( - "Manage to Core: max 20%", - {"entities": [[10, 14, "RISIKOPROFIL"]]}, - ), - ( - "Benefits of the core/ core+ segment", - {"entities": [[16, 27, "RISIKOPROFIL"]]}, - ), - ( - "Drawbacks of the core/ core+ segment", - {"entities": [[17, 28, "RISIKOPROFIL"]]}, - ), - ( - "Why a Core / Core + investment program?", - {"entities": [[6, 19, "RISIKOPROFIL"]]}, - ), - ( - "Different risk profile (core, core+, value-added)", - {"entities": [[24, 48, "RISIKOPROFIL"]]}, - ), - ( + ( + "Manage to Core: max 20%", + {"entities": [[10, 14, "RISIKOPROFIL"]]}, + ), + ( + "Benefits of the core/ core+ segment", + {"entities": [[16, 27, "RISIKOPROFIL"]]}, + ), + ( + "Drawbacks of the core/ core+ segment", + {"entities": [[17, 28, "RISIKOPROFIL"]]}, + ), + ( + "Why a Core / Core + investment program?", + {"entities": [[6, 19, "RISIKOPROFIL"]]}, + ), + ( + "Different risk profile (core, core+, value-added)", + {"entities": [[24, 48, "RISIKOPROFIL"]]}, + ), + ( "INK MGallery Hotel Area: Amsterdam Core Tenant: Closed in 2018", {"entities": [[35, 39, "RISIKOPROFIL"]]}, - ), - ( - "A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.", - {"entities": [[34, 48, "RISIKOPROFIL"]]}, + ), + ( + "A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.", + {"entities": [[34, 48, "RISIKOPROFIL"]]}, ), ( "Navigate the diversity of the Core/Core+ investment opportunities in European Prime Cities", @@ -226,9 +226,5 @@ TRAINING_DATA = [ ( "Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. 
Strategie - Übersicht Risikoprofil Core+", {"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]}, - ) + ), ] - - - - diff --git a/prototypes/fine_tuning_spaCy/training_model.py b/prototypes/fine_tuning_spaCy/training_model.py index 80b120b..4778961 100644 --- a/prototypes/fine_tuning_spaCy/training_model.py +++ b/prototypes/fine_tuning_spaCy/training_model.py @@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA): for start, end, label in annot["entities"]: span = doc.char_span(start, end, label=label, alignment_mode="contract") if span is None: - print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}") + print( + f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}" + ) else: ents.append(span) - print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}") + print( + f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}" + ) # label the text with the ents doc.ents = ents db.add(doc) diff --git a/prototypes/first-mcp-python/client.py b/prototypes/first-mcp-python/client.py index 3c0fe7b..697bd88 100644 --- a/prototypes/first-mcp-python/client.py +++ b/prototypes/first-mcp-python/client.py @@ -87,9 +87,9 @@ class Server: server_params = StdioServerParameters( command=command, args=self.config["args"], - env={**os.environ, **self.config["env"]} - if self.config.get("env") - else None, + env=( + {**os.environ, **self.config["env"]} if self.config.get("env") else None + ), ) try: stdio_transport = await self.exit_stack.enter_async_context( @@ -244,28 +244,23 @@ class LLMClient: formatted_messages = [] for msg in messages: print(msg) - formatted_messages.append({ - "role": msg["role"], - "content": msg["content"] - }) + formatted_messages.append({"role": msg["role"], "content": msg["content"]}) client = AzureOpenAI( - api_key=self.api_key, - api_version="2023-07-01-preview", - base_url=url + api_key=self.api_key, api_version="2023-07-01-preview", base_url=url ) response = client.chat.completions.create( - messages=formatted_messages, - model="gpt-4o-mini", - # response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) + messages=formatted_messages, + model="gpt-4o-mini", + # response_format={"type": "json_object"} + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False + ) if response.choices[0].message.content: print("response: " + response.choices[0].message.content) return response.choices[0].message.content diff --git a/prototypes/first-mcp-python/main.py b/prototypes/first-mcp-python/main.py index 3a17140..a69e6c5 100644 --- a/prototypes/first-mcp-python/main.py +++ b/prototypes/first-mcp-python/main.py @@ -1,5 +1,6 @@ # server.py from mcp.server.fastmcp import FastMCP + # Create an MCP server mcp = FastMCP("Demo") diff --git a/prototypes/mcp_validate-arc1/config.py b/prototypes/mcp_validate-arc1/config.py index 3b27716..46f5ef7 100644 --- a/prototypes/mcp_validate-arc1/config.py +++ b/prototypes/mcp_validate-arc1/config.py @@ -1,3 +1,3 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" -MODEL_ID = "gpt-35-turbo" \ No newline at end of file +MODEL_ID = "gpt-35-turbo" diff --git 
a/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py index ec97452..f3b04e6 100644 --- a/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py +++ b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py @@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json" OUTPUT_PATH = "mcp_spacy_validated_result.json" + def load_spacy_entities(): with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f: return json.load(f) + def load_pitchbook_pages(): with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f: return json.load(f) + def get_page_text(pages, page_number): for page in pages: if page.get("page") == page_number: return page.get("text", "") return "" + def normalize_entity(entity): - return ' '.join(entity.replace('\n', ' ').split()) + return " ".join(entity.replace("\n", " ").split()) + def validate_entity_with_exxeta(entity, page_num, text): prompt = ( f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n" f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n" f"Ziel-Formulierung:\n" - f"\"{entity}\"\n\n" + f'"{entity}"\n\n' f"Validierungsregeln:\n" f"- Groß- und Kleinschreibung ignorieren.\n" f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n" f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n" f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n" - f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n" + f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n' f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n" f"OCR-Text auf Seite {page_num}:\n{text}" ) headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - {"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text): print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}") return False + def run(): spacy_entities = load_spacy_entities() pitchbook_pages = load_pitchbook_pages() @@ -81,17 +90,20 @@ def run(): page_text = get_page_text(pitchbook_pages, page) is_valid = validate_entity_with_exxeta(entity, page, page_text) - validated_results.append({ - "label": entity_data.get("label"), - "entity": raw_entity, - "page": page, - "validated": is_valid - }) + validated_results.append( + { + "label": entity_data.get("label"), + "entity": raw_entity, + "page": page, + "validated": is_valid, + } + ) with open(OUTPUT_PATH, "w", encoding="utf-8") as f: json.dump(validated_results, f, indent=2, ensure_ascii=False) print(f"✅ Validation complete! 
Results saved to {OUTPUT_PATH}") + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git a/prototypes/mcp_validate-arc1/mcp_validate.py b/prototypes/mcp_validate-arc1/mcp_validate.py index 699e2ee..4dea0e5 100644 --- a/prototypes/mcp_validate-arc1/mcp_validate.py +++ b/prototypes/mcp_validate-arc1/mcp_validate.py @@ -10,19 +10,23 @@ KPI_SERVICE_MAP = { SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json" + def load_spacy_entities(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) + def load_exxeta_entities(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) + def normalize(text): if not text: return "" return text.strip().lower().replace(" ", "").replace("/", "/") + def validate_kpi(kpi, spacy_entities, exxeta_entities): results = [] @@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities): for ee in exxeta_entries: ee_entity = normalize(ee["entity"]) if se_entity == ee_entity: - results.append({ - "kpi": kpi, - "entity": se["entity"], - "page": page, - "validation_status": "validated" - }) + results.append( + { + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "validated", + } + ) matched = True break if not matched: - results.append({ - "kpi": kpi, - "entity": se["entity"], - "page": page, - "validation_status": "spacy-only" - }) + results.append( + { + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "spacy-only", + } + ) for ee in exxeta_entries: ee_entity = normalize(ee["entity"]) if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries): - results.append({ - "kpi": kpi, - "entity": ee["entity"], - "page": page, - "validation_status": "exxeta-only" - }) + results.append( + { + "kpi": kpi, + "entity": ee["entity"], + "page": page, + "validation_status": "exxeta-only", + } + ) return results + def save_results(results, filename): with open(filename, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) + def run(): spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH) exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH) @@ -96,5 +108,6 @@ def run(): save_results(all_results, "mcp_validated_result.json") print("✅ Validation complete! 
Output: mcp_validated_result.json") + if __name__ == "__main__": run() diff --git a/prototypes/merge_validate-arc2/config.py b/prototypes/merge_validate-arc2/config.py index 3b27716..46f5ef7 100644 --- a/prototypes/merge_validate-arc2/config.py +++ b/prototypes/merge_validate-arc2/config.py @@ -1,3 +1,3 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" -MODEL_ID = "gpt-35-turbo" \ No newline at end of file +MODEL_ID = "gpt-35-turbo" diff --git a/prototypes/merge_validate-arc2/exxeta_api.py b/prototypes/merge_validate-arc2/exxeta_api.py index 3d8f8b2..97fa88b 100644 --- a/prototypes/merge_validate-arc2/exxeta_api.py +++ b/prototypes/merge_validate-arc2/exxeta_api.py @@ -4,6 +4,7 @@ import json MODEL = "gpt-35-turbo" + def extract_risikoprofil_from_exxeta(pages_json): results = [] @@ -15,34 +16,36 @@ def extract_risikoprofil_from_exxeta(pages_json): continue prompt = ( - "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" - "Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n" - "Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, " - "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" - "Beispiele:\n" - "- \"Core, Core+\" → entity: \"Core, Core+\"\n" - "- \"Core/Core+\" → entity: \"Core/Core+\"\n" - "- \"Core and Core+\" → entity: \"Core and Core+\"\n\n" - "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" - f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n" - "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" - "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" - "TEXT:\n" + text + "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" + 'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n' + 'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, ' + "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" + "Beispiele:\n" + '- "Core, Core+" → entity: "Core, Core+"\n' + '- "Core/Core+" → entity: "Core/Core+"\n' + '- "Core and Core+" → entity: "Core and Core+"\n\n' + "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" + f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n' + "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" + "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" + "TEXT:\n" + text ) - headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. 
Antworte nur mit validen JSON-Arrays.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -71,4 +74,4 @@ def extract_risikoprofil_from_exxeta(pages_json): except Exception as e: print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}") - return results \ No newline at end of file + return results diff --git a/prototypes/merge_validate-arc2/merge_logic.py b/prototypes/merge_validate-arc2/merge_logic.py index b6cf2c3..38d2d48 100644 --- a/prototypes/merge_validate-arc2/merge_logic.py +++ b/prototypes/merge_validate-arc2/merge_logic.py @@ -1,10 +1,11 @@ def normalize_entity(entity_str): if not entity_str: return "" - normalized = entity_str.replace('\n', ' ') - normalized = ''.join(normalized.lower().split()) + normalized = entity_str.replace("\n", " ") + normalized = "".join(normalized.lower().split()) return normalized + def merge_and_validate_entities(spacy_data, exxeta_data): merged = [] seen = set() @@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data): e_page = e["page"] # Match if normalized entity and page match - if (s["label"] == e["label"] and - s_entity_norm == e_entity_norm and - s_page == e_page): + if ( + s["label"] == e["label"] + and s_entity_norm == e_entity_norm + and s_page == e_page + ): - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "validated" - }) + merged.append( + { + "label": s["label"], + "entity": s["entity"], + "page": s_page, + "status": "validated", + } + ) seen.add((e["entity"], e_page)) found = True break # If no match found, add as single-source if not found: - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "single-source", - "source": "spacy" - }) + merged.append( + { + "label": s["label"], + "entity": s["entity"], + "page": s_page, + "status": "single-source", + "source": "spacy", + } + ) # Add remaining Exxeta entities not already processed for e in exxeta_data: if (e["entity"], e["page"]) not in seen: - merged.append({ - "label": e["label"], - "entity": e["entity"], - "page": e["page"], - "status": "single-source", - "source": "exxeta" - }) + merged.append( + { + "label": e["label"], + "entity": e["entity"], + "page": e["page"], + "status": "single-source", + "source": "exxeta", + } + ) - return merged \ No newline at end of file + return merged diff --git a/prototypes/merge_validate-arc2/merge_validate.py b/prototypes/merge_validate-arc2/merge_validate.py index e9b2886..b3cea82 100644 --- a/prototypes/merge_validate-arc2/merge_validate.py +++ b/prototypes/merge_validate-arc2/merge_validate.py @@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities SPACY_PATH = "../fine_tuning_spaCy/entities_output.json" PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json" + def load_pitchbook_pages(): path = Path(PITCHBOOK_PATH) with open(path, "r", encoding="utf-8") as f: return json.load(f) + def save_json(data, filename): with open(filename, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) + def sort_by_page_number(entities): return sorted(entities, key=lambda x: x.get("page", 0)) + def run(): spacy_entities = load_spacy_entities(SPACY_PATH) pitchbook_pages = load_pitchbook_pages() @@ -33,5 +37,6 @@ def run(): print("- merged_result.json") print(f"- Total entities in merged result: {len(merged_sorted)}") + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git 
a/prototypes/merge_validate-arc2/spacy_extract.py b/prototypes/merge_validate-arc2/spacy_extract.py index 0ccc818..f6e9075 100644 --- a/prototypes/merge_validate-arc2/spacy_extract.py +++ b/prototypes/merge_validate-arc2/spacy_extract.py @@ -1,7 +1,8 @@ import json from pathlib import Path + def load_spacy_entities(path): path = Path(path) with open(path, "r", encoding="utf-8") as f: - return json.load(f) \ No newline at end of file + return json.load(f) diff --git a/prototypes/ocr/ocr.py b/prototypes/ocr/ocr.py index 3149035..6e4f0e8 100644 --- a/prototypes/ocr/ocr.py +++ b/prototypes/ocr/ocr.py @@ -11,15 +11,20 @@ log_folder = Path("logs") for folder in [output_folder, log_folder]: folder.mkdir(parents=True, exist_ok=True) + def extract_text_to_json(pdf_path: Path): json_path = output_folder / f"{pdf_path.stem}.json" with pdfplumber.open(pdf_path) as pdf: - pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)] + pages = [ + {"page": i + 1, "text": (page.extract_text() or "").strip()} + for i, page in enumerate(pdf.pages) + ] with open(json_path, "w", encoding="utf-8") as f: json.dump(pages, f, indent=2, ensure_ascii=False) print(f"📄 Text JSON saved: {json_path.name}") + def ocr_pdf(input_file: Path): output_file = output_folder / f"{input_file.stem}-OCR.pdf" log_file = log_folder / f"{input_file.stem}.log" @@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path): cmd = [ "ocrmypdf", "--force-ocr", - "--output-type", "pdfa", - "--language", "deu+eng", - "--sidecar", str(sidecar_txt), + "--output-type", + "pdfa", + "--language", + "deu+eng", + "--sidecar", + str(sidecar_txt), str(input_file), - str(output_file) + str(output_file), ] with open(log_file, "w") as log: @@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path): else: print(f"❌ OCR failed. 
See log: {log_file}") + if __name__ == "__main__": if not input_folder.exists(): print("Input folder does not exist!") @@ -54,4 +63,4 @@ if __name__ == "__main__": else: for pdf in pdfs: print(f"Processing: {pdf.name}") - ocr_pdf(pdf) \ No newline at end of file + ocr_pdf(pdf) diff --git a/prototypes/pdfplumber/tabellentext_holen.py b/prototypes/pdfplumber/tabellentext_holen.py index 7666df8..58513fd 100644 --- a/prototypes/pdfplumber/tabellentext_holen.py +++ b/prototypes/pdfplumber/tabellentext_holen.py @@ -1,4 +1,4 @@ -import pdfplumber +import pdfplumber pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf" @@ -10,7 +10,7 @@ pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf" # # Print the extracted text with preserved structure # print(f"Page {page.page_number}:\n{page_text}\n") -with pdfplumber.open(pdf_path) as pdf: +with pdfplumber.open(pdf_path) as pdf: for i, page in enumerate(pdf.pages): tables = page.extract_tables() diff --git a/prototypes/spacy-layout/extract_pitchbooks.py b/prototypes/spacy-layout/extract_pitchbooks.py index 7cd2139..37674df 100644 --- a/prototypes/spacy-layout/extract_pitchbooks.py +++ b/prototypes/spacy-layout/extract_pitchbooks.py @@ -1,6 +1,6 @@ # https://github.com/explosion/spacy-layout ### Run with: python extract_pitchbooks.py -import spacy +import spacy from spacy_layout import spaCyLayout from pathlib import Path import pandas as pd @@ -34,14 +34,14 @@ for ent in doc_ner.ents: break if ent.text.strip(): - ner_text_results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number - }) + ner_text_results.append( + {"label": ent.label_, "entity": ent.text.strip(), "page": page_number} + ) print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json") -(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False)) +(output_dir / "ner_text.json").write_text( + json.dumps(ner_text_results, indent=2, ensure_ascii=False) +) # 2. NER on table cells table_ner_results = [] @@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1): doc_cell = nlp(cell) for ent in doc_cell.ents: if ent.text.strip(): - table_ner_results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number, - "table": i - }) + table_ner_results.append( + { + "label": ent.label_, + "entity": ent.text.strip(), + "page": page_number, + "table": i, + } + ) print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json") -(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False)) +(output_dir / "ner_tables.json").write_text( + json.dumps(table_ner_results, indent=2, ensure_ascii=False) +) -print("✅ Done! Extracted data saved to /output") \ No newline at end of file +print("✅ Done! Extracted data saved to /output")