From cd6c66a1fdc40ffa2fbff3ac2f2986e0fb7677f9 Mon Sep 17 00:00:00 2001 From: Abdulraahman Dabbagh <1924466@stud.hs-mannheim.de> Date: Sun, 25 May 2025 10:47:16 +0200 Subject: [PATCH] Set up Flask backend (Ticket #4) --- .pre-commit-config.yaml | 2 + prototypes/PDFMiner/test.py | 6 +- prototypes/PyMuPdf/PyMuPdf_st.py | 10 +-- prototypes/PyMuPdf/prototype.py | 10 +-- prototypes/arc1_prototype/client.py | 49 +++++++------ prototypes/arc1_prototype/server.py | 68 ++++++++++++++++--- prototypes/arc2_prototype/app.py | 5 +- .../arc2_prototype/exxeta_service/config.py | 2 +- .../exxeta_service/exxeta_client.py | 40 ++++++----- .../merge_validate_service/validator.py | 18 +++-- .../ocr_pdf_service/ocr_runner.py | 19 +++--- .../spacy_service/spacy_extractor.py | 16 +++-- prototypes/exxeta/index.py | 49 +++++++------ prototypes/exxetaGPT/index.py | 49 +++++++------ .../fine_tuning_spaCy/convert_to_spacy.py | 4 +- prototypes/fine_tuning_spaCy/test_model.py | 10 ++- prototypes/fine_tuning_spaCy/training_data.py | 56 +++++++-------- .../fine_tuning_spaCy/training_model.py | 8 ++- prototypes/first-mcp-python/client.py | 37 +++++----- prototypes/first-mcp-python/main.py | 1 + prototypes/mcp_validate-arc1/config.py | 2 +- .../mcp_spacy_validate_with_exxeta.py | 40 +++++++---- prototypes/mcp_validate-arc1/mcp_validate.py | 49 ++++++++----- prototypes/merge_validate-arc2/config.py | 2 +- prototypes/merge_validate-arc2/exxeta_api.py | 41 +++++------ prototypes/merge_validate-arc2/merge_logic.py | 61 ++++++++++------- .../merge_validate-arc2/merge_validate.py | 7 +- .../merge_validate-arc2/spacy_extract.py | 3 +- prototypes/ocr/ocr.py | 21 ++++-- prototypes/pdfplumber/tabellentext_holen.py | 4 +- prototypes/spacy-layout/extract_pitchbooks.py | 34 ++++++---- 31 files changed, 416 insertions(+), 307 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9fcd752..dd98e70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,10 @@ repos: hooks: - id: black language_version: python3 + files: ^project/backend/ - repo: https://github.com/pycqa/flake8 rev: 6.1.0 hooks: - id: flake8 + files: ^project/backend/ diff --git a/prototypes/PDFMiner/test.py b/prototypes/PDFMiner/test.py index 148046a..7db4ada 100644 --- a/prototypes/PDFMiner/test.py +++ b/prototypes/PDFMiner/test.py @@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.converter import PDFPageAggregator -fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb') +fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb") rsrcmgr = PDFResourceManager() laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) @@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device) pages = PDFPage.get_pages(fp) for page in pages: - print('Processing next page...') + print("Processing next page...") interpreter.process_page(page) layout = device.get_result() for lobj in layout: if isinstance(lobj, LTTextBox): x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text() - print('At %r is text: %s' % ((x, y), text)) + print("At %r is text: %s" % ((x, y), text)) diff --git a/prototypes/PyMuPdf/PyMuPdf_st.py b/prototypes/PyMuPdf/PyMuPdf_st.py index 3af7af1..6da9cea 100644 --- a/prototypes/PyMuPdf/PyMuPdf_st.py +++ b/prototypes/PyMuPdf/PyMuPdf_st.py @@ -1,5 +1,5 @@ ######################################################### -#Run: in Terminal -> streamlit run PyMuPdf_st.py +# Run: in Terminal -> streamlit run PyMuPdf_st.py
######################################################### import streamlit as st @@ -28,18 +28,14 @@ if uploaded_file and suchwort: rects = page.search_for(suchwort) for rect in rects: - fundstellen.append({ - "seite": page_num, - "rect": rect - }) + fundstellen.append({"seite": page_num, "rect": rect}) if fundstellen: st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.") # Auswahl der Fundstelle auswahl = st.selectbox( - "Fundstelle auswählen:", - [f"Seite {f['seite'] + 1}" for f in fundstellen] + "Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen] ) index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl) diff --git a/prototypes/PyMuPdf/prototype.py b/prototypes/PyMuPdf/prototype.py index 138f80c..c268f4e 100644 --- a/prototypes/PyMuPdf/prototype.py +++ b/prototypes/PyMuPdf/prototype.py @@ -38,7 +38,9 @@ for eintrag in kennzahlen: highlight = page.add_highlight_annot(rect) highlight.update() else: - st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)") + st.warning( + f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)" + ) except Exception as e: st.error(f" Fehler bei Eintrag {eintrag}: {e}") @@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1)) # PDF anzeigen mit Scroll zu aktueller Seite st.subheader(f"Vorschau") with open(highlighted_path, "rb") as f: - base64_pdf = base64.b64encode(f.read()).decode('utf-8') + base64_pdf = base64.b64encode(f.read()).decode("utf-8") # Seite direkt ansteuern -pdf_display = f''' +pdf_display = f""" -''' +""" st.markdown(pdf_display, unsafe_allow_html=True) diff --git a/prototypes/arc1_prototype/client.py b/prototypes/arc1_prototype/client.py index 519d514..b3ae21b 100644 --- a/prototypes/arc1_prototype/client.py +++ b/prototypes/arc1_prototype/client.py @@ -87,9 +87,9 @@ class Server: server_params = StdioServerParameters( command=command, args=self.config["args"], - env={**os.environ, **self.config["env"]} - if self.config.get("env") - else None, + env=( + {**os.environ, **self.config["env"]} if self.config.get("env") else None + ), ) try: stdio_transport = await self.exit_stack.enter_async_context( @@ -244,28 +244,23 @@ class LLMClient: formatted_messages = [] for msg in messages: # print(msg) - formatted_messages.append({ - "role": msg["role"], - "content": msg["content"] - }) + formatted_messages.append({"role": msg["role"], "content": msg["content"]}) client = AzureOpenAI( - api_key=self.api_key, - api_version="2023-07-01-preview", - base_url=url + api_key=self.api_key, api_version="2023-07-01-preview", base_url=url ) response = client.chat.completions.create( - messages=formatted_messages, - model="gpt-4o-mini", - # response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) + messages=formatted_messages, + model="gpt-4o-mini", + # response_format={"type": "json_object"} + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False + ) if response.choices[0].message.content: # print("response: " + response.choices[0].message.content) return response.choices[0].message.content @@ -412,12 +407,16 @@ class ChatSession: "4. Use appropriate context from the user's question\n" "5. Avoid simply repeating the raw data\n\n" "Please use only the tools that are explicitly defined above." 
- ) messages = [{"role": "system", "content": system_message}] - messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting." - "For each value there is only one correct answer, try to find it with the tools provided."}) + messages.append( + { + "role": "assistant", + "content": "You have to extract data from pdf files and have different tools for extracting." + "For each value there is only one correct answer, try to find it with the tools provided.", + } + ) while True: try: @@ -455,7 +454,6 @@ class ChatSession: # messages.append({"role": "assistant", "content": llm_response}) # logging.info("\nFinal response: %s", llm_response) - except KeyboardInterrupt: logging.info("\nExiting...") break @@ -476,5 +474,6 @@ async def main() -> None: chat_session = ChatSession(servers, llm_client) await chat_session.start() + if __name__ == "__main__": asyncio.run(main()) diff --git a/prototypes/arc1_prototype/server.py b/prototypes/arc1_prototype/server.py index a588659..1788914 100644 --- a/prototypes/arc1_prototype/server.py +++ b/prototypes/arc1_prototype/server.py @@ -8,54 +8,86 @@ mcp = FastMCP("Demo") risikoProfile = ["Core/Core+, Core", "Value Add"] risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"] + # Add an addition tool @mcp.tool() def add(a: int, b: int) -> int: """Add two numbers""" return a + b + @mcp.tool() def getFromSpaCy() -> list: """Get data from SpaCy""" - return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"}, - {"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}] + return [ + { + "page": random.randint(1, 35), + "value": random.choice(risikoProfileSpacy), + "key": "Risiko", + }, + {"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"}, + ] + @mcp.tool() def getFromChatGPT() -> list: """Get data from ChatGPT""" - return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}, - {"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}] + return [ + { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": "Risiko", + }, + {"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"}, + ] + @mcp.tool() def checkSpacyResult() -> dict: """This tool checks the result of SpaCy, ensuring it meets certain criteria.""" - return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"} + return { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": "Risiko", + } + @mcp.tool() def getFromChatGPTSingle(value: str) -> dict: """This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated""" - return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value} + return { + "page": random.randint(1, 35), + "value": random.choice(risikoProfile), + "key": value, + } + context = "" + @mcp.tool() def getContext() -> str: """This tool gets context information.""" return context + @mcp.tool() def setContext(value: str) -> None: """This tool sets context information.""" global context context = value + # Add a dynamic greeting resource @mcp.resource("greeting://{name}") def get_greeting(name: str) -> str: """Get a personalized greeting""" return f"Hello, {name}!" + """ Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. 
""" + + @mcp.tool() def validate_entity(entity: str, label: str) -> dict: """Returns if the entity is valid based on hardcoded rules.""" @@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict: return {"status": "valid", "entity": entity} return {"status": "invalid", "entity": entity} + """ Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """ + + @mcp.tool() -def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]: +def merge_spacy_exxeta( + spacy_result: list[dict], exxeta_result: list[dict] +) -> list[dict]: """Merge two results, mark as validated if label/entity/page match.""" - def norm(e): return e["entity"].lower().replace(" ", "") + + def norm(e): + return e["entity"].lower().replace(" ", "") merged = [] seen = set() @@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l for s in spacy_result: s_norm = norm(s) s_page = s["page"] - match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None) + match = next( + ( + e + for e in exxeta_result + if e["label"] == s["label"] + and norm(e) == s_norm + and e["page"] == s_page + ), + None, + ) if match: merged.append({**s, "status": "validated"}) seen.add((match["entity"], match["page"])) @@ -88,4 +136,4 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l for e in exxeta_result: if (e["entity"], e["page"]) not in seen: merged.append({**e, "status": "exxeta_only"}) - return merged \ No newline at end of file + return merged diff --git a/prototypes/arc2_prototype/app.py b/prototypes/arc2_prototype/app.py index 0f73b37..1aa0707 100644 --- a/prototypes/arc2_prototype/app.py +++ b/prototypes/arc2_prototype/app.py @@ -12,10 +12,12 @@ app = Flask(__name__) UPLOAD_FOLDER = Path("pitchbooks") UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) + @app.route("/") def home(): return "Backend is running!" 
+ @app.route("/upload", methods=["POST"]) def upload(): file = request.files.get("file") @@ -44,5 +46,6 @@ def upload(): return "status: complete\n" + if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/prototypes/arc2_prototype/exxeta_service/config.py b/prototypes/arc2_prototype/exxeta_service/config.py index 7085986..df5494f 100644 --- a/prototypes/arc2_prototype/exxeta_service/config.py +++ b/prototypes/arc2_prototype/exxeta_service/config.py @@ -1,2 +1,2 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" -EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" \ No newline at end of file +EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" diff --git a/prototypes/arc2_prototype/exxeta_service/exxeta_client.py b/prototypes/arc2_prototype/exxeta_service/exxeta_client.py index a66832d..7a1c74e 100644 --- a/prototypes/arc2_prototype/exxeta_service/exxeta_client.py +++ b/prototypes/arc2_prototype/exxeta_service/exxeta_client.py @@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo" OUTPUT_FOLDER = Path(__file__).resolve().parent / "output" OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) + def extract_with_exxeta(pages_json): results = [] @@ -18,33 +19,36 @@ def extract_with_exxeta(pages_json): continue prompt = ( - "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" - "Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n" - "Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, " - "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" - "Beispiele:\n" - "- \"Core, Core+\" → entity: \"Core, Core+\"\n" - "- \"Core/Core+\" → entity: \"Core/Core+\"\n" - "- \"Core and Core+\" → entity: \"Core and Core+\"\n\n" - "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" - f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n" - "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" - "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" - "TEXT:\n" + text + "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" + 'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n' + 'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, ' + "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" + "Beispiele:\n" + '- "Core, Core+" → entity: "Core, Core+"\n' + '- "Core/Core+" → entity: "Core/Core+"\n' + '- "Core and Core+" → entity: "Core and Core+"\n\n' + "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" + f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n' + "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" + "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" + "TEXT:\n" + text ) headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - 
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -77,4 +81,4 @@ def extract_with_exxeta(pages_json): with open(out_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) - return results \ No newline at end of file + return results diff --git a/prototypes/arc2_prototype/merge_validate_service/validator.py b/prototypes/arc2_prototype/merge_validate_service/validator.py index f5045e4..e355dff 100644 --- a/prototypes/arc2_prototype/merge_validate_service/validator.py +++ b/prototypes/arc2_prototype/merge_validate_service/validator.py @@ -1,13 +1,16 @@ from pathlib import Path import json + def normalize_entity(entity_str): - return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else "" + return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else "" + def load_json(path: Path): with path.open("r", encoding="utf-8") as f: return json.load(f) + def merge_and_validate_entities(filter_label=None): base = Path(__file__).resolve().parent.parent spacy_path = base / "spacy_service/output/spacy-results.json" @@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None): s_page = s["page"] match = next( - (e for e in exxeta_data - if e["label"] == s["label"] and - normalize_entity(e["entity"]) == s_norm and - e["page"] == s_page), - None + ( + e + for e in exxeta_data + if e["label"] == s["label"] + and normalize_entity(e["entity"]) == s_norm + and e["page"] == s_page + ), + None, ) if match: diff --git a/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py b/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py index 0db3dc0..c0c8729 100644 --- a/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py +++ b/prototypes/arc2_prototype/ocr_pdf_service/ocr_runner.py @@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent OUTPUT_FOLDER = BASE_DIR / "output" OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) + def run_ocr_and_extract(pdf_path: str): pdf_path = Path(pdf_path) output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf" @@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str): cmd = [ "ocrmypdf", "--force-ocr", - "--output-type", "pdfa", - "--language", "deu+eng", + "--output-type", + "pdfa", + "--language", + "deu+eng", str(pdf_path), - str(output_pdf) + str(output_pdf), ] result = subprocess.run(cmd, capture_output=True) @@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str): raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}") with pdfplumber.open(output_pdf) as pdf: - pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)] + pages = [ + {"page": i + 1, "text": (page.extract_text() or "").strip()} + for i, page in enumerate(pdf.pages) + ] with open(json_path, "w", encoding="utf-8") as f: json.dump(pages, f, indent=2, ensure_ascii=False) - return { - "ocr_pdf": str(output_pdf), - "json_path": str(json_path) - } \ No newline at end of file + return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)} diff --git a/prototypes/arc2_prototype/spacy_service/spacy_extractor.py b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py index 
28d5a34..bf0bd6e 100644 --- a/prototypes/arc2_prototype/spacy_service/spacy_extractor.py +++ b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py @@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) model_path = os.path.join(os.path.dirname(__file__), "models", "model-last") nlp = spacy.load(model_path) -input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf" +input_pdf_path = ( + Path(__file__).resolve().parent + / ".." + / "ocr_pdf_service" + / "output" + / "pitchbook-OCR.pdf" +) input_pdf = Path(input_pdf_path) doc = fitz.open(input_pdf) @@ -26,14 +32,10 @@ def extract_with_spacy(pages_json): doc = nlp(text) for ent in doc.ents: - results.append({ - "label": ent.label_, - "entity": ent.text, - "page": page_num - }) + results.append({"label": ent.label_, "entity": ent.text, "page": page_num}) output_path = OUTPUT_FOLDER / f"spacy-results.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) - return results \ No newline at end of file + return results diff --git a/prototypes/exxeta/index.py b/prototypes/exxeta/index.py index 32e0cd7..c1a023e 100644 --- a/prototypes/exxeta/index.py +++ b/prototypes/exxeta/index.py @@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" API_KEY = os.getenv("API_KEY") client = AzureOpenAI( - api_key=API_KEY, - api_version="2023-07-01-preview", - base_url=BASE_URL - ) + api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL +) + + def extract_text_from_pdf(file_path): """Extract text content from a PDF file using PyMuPDF (fitz).""" all_text = "" @@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf" pdf_text = extract_text_from_pdf(file_path) response = client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "Always respond with a valid JSON object" - }, - { - "role": "user", - "content": """extract the values from the text. let not found values empty: + messages=[ + {"role": "system", "content": "Always respond with a valid JSON object"}, + { + "role": "user", + "content": """extract the values from the text. 
let not found values empty: -Fondsname -Fondsmanager -Name Kapitalverwaltungsgesellschaft @@ -71,20 +68,20 @@ response = client.chat.completions.create( - the page where this value was found - a confidence score, how confident the model is about the value (low, medium, high) - Here ist the text:""" + pdf_text - } - ], - model="gpt-4o-mini", - response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) - + Here ist the text:""" + + pdf_text, + }, + ], + model="gpt-4o-mini", + response_format={"type": "json_object"}, + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False +) print(response.choices[0].message.content) diff --git a/prototypes/exxetaGPT/index.py b/prototypes/exxetaGPT/index.py index 32e0cd7..c1a023e 100644 --- a/prototypes/exxetaGPT/index.py +++ b/prototypes/exxetaGPT/index.py @@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" API_KEY = os.getenv("API_KEY") client = AzureOpenAI( - api_key=API_KEY, - api_version="2023-07-01-preview", - base_url=BASE_URL - ) + api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL +) + + def extract_text_from_pdf(file_path): """Extract text content from a PDF file using PyMuPDF (fitz).""" all_text = "" @@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf" pdf_text = extract_text_from_pdf(file_path) response = client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "Always respond with a valid JSON object" - }, - { - "role": "user", - "content": """extract the values from the text. let not found values empty: + messages=[ + {"role": "system", "content": "Always respond with a valid JSON object"}, + { + "role": "user", + "content": """extract the values from the text. 
let not found values empty: -Fondsname -Fondsmanager -Name Kapitalverwaltungsgesellschaft @@ -71,20 +68,20 @@ response = client.chat.completions.create( - the page where this value was found - a confidence score, how confident the model is about the value (low, medium, high) - Here ist the text:""" + pdf_text - } - ], - model="gpt-4o-mini", - response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) - + Here ist the text:""" + + pdf_text, + }, + ], + model="gpt-4o-mini", + response_format={"type": "json_object"}, + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False +) print(response.choices[0].message.content) diff --git a/prototypes/fine_tuning_spaCy/convert_to_spacy.py b/prototypes/fine_tuning_spaCy/convert_to_spacy.py index 8e94729..25e74b3 100644 --- a/prototypes/fine_tuning_spaCy/convert_to_spacy.py +++ b/prototypes/fine_tuning_spaCy/convert_to_spacy.py @@ -2,7 +2,7 @@ import spacy from spacy.tokens import DocBin from training_data import TRAINING_DATA -nlp = spacy.blank("de") +nlp = spacy.blank("de") doc_bin = DocBin() for text, annotations in TRAINING_DATA: @@ -17,4 +17,4 @@ for text, annotations in TRAINING_DATA: doc.ents = ents doc_bin.add(doc) -doc_bin.to_disk("data/train.spacy") \ No newline at end of file +doc_bin.to_disk("data/train.spacy") diff --git a/prototypes/fine_tuning_spaCy/test_model.py b/prototypes/fine_tuning_spaCy/test_model.py index 37ff4a4..7b51586 100644 --- a/prototypes/fine_tuning_spaCy/test_model.py +++ b/prototypes/fine_tuning_spaCy/test_model.py @@ -15,13 +15,11 @@ for page_number in range(len(doc)): text = page.get_text() spacy_doc = nlp(text) for ent in spacy_doc.ents: - results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number + 1 - }) + results.append( + {"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1} + ) with open("entities_output.json", "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) -print("✅ Extraction completed. Results saved to 'entities_output.json'") \ No newline at end of file +print("✅ Extraction completed. 
Results saved to 'entities_output.json'") diff --git a/prototypes/fine_tuning_spaCy/training_data.py b/prototypes/fine_tuning_spaCy/training_data.py index f51aebd..6e4b6be 100644 --- a/prototypes/fine_tuning_spaCy/training_data.py +++ b/prototypes/fine_tuning_spaCy/training_data.py @@ -71,33 +71,33 @@ TRAINING_DATA = [ "core, core+, value-added", {"entities": [[0, 24, "RISIKOPROFIL"]]}, ), - ( - "Manage to Core: max 20%", - {"entities": [[10, 14, "RISIKOPROFIL"]]}, - ), - ( - "Benefits of the core/ core+ segment", - {"entities": [[16, 27, "RISIKOPROFIL"]]}, - ), - ( - "Drawbacks of the core/ core+ segment", - {"entities": [[17, 28, "RISIKOPROFIL"]]}, - ), - ( - "Why a Core / Core + investment program?", - {"entities": [[6, 19, "RISIKOPROFIL"]]}, - ), - ( - "Different risk profile (core, core+, value-added)", - {"entities": [[24, 48, "RISIKOPROFIL"]]}, - ), - ( + ( + "Manage to Core: max 20%", + {"entities": [[10, 14, "RISIKOPROFIL"]]}, + ), + ( + "Benefits of the core/ core+ segment", + {"entities": [[16, 27, "RISIKOPROFIL"]]}, + ), + ( + "Drawbacks of the core/ core+ segment", + {"entities": [[17, 28, "RISIKOPROFIL"]]}, + ), + ( + "Why a Core / Core + investment program?", + {"entities": [[6, 19, "RISIKOPROFIL"]]}, + ), + ( + "Different risk profile (core, core+, value-added)", + {"entities": [[24, 48, "RISIKOPROFIL"]]}, + ), + ( "INK MGallery Hotel Area: Amsterdam Core Tenant: Closed in 2018", {"entities": [[35, 39, "RISIKOPROFIL"]]}, - ), - ( - "A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.", - {"entities": [[34, 48, "RISIKOPROFIL"]]}, + ), + ( + "A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.", + {"entities": [[34, 48, "RISIKOPROFIL"]]}, ), ( "Navigate the diversity of the Core/Core+ investment opportunities in European Prime Cities", @@ -226,9 +226,5 @@ TRAINING_DATA = [ ( "Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. 
Strategie - Übersicht Risikoprofil Core+", {"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]}, - ) + ), ] - - - - diff --git a/prototypes/fine_tuning_spaCy/training_model.py b/prototypes/fine_tuning_spaCy/training_model.py index 80b120b..4778961 100644 --- a/prototypes/fine_tuning_spaCy/training_model.py +++ b/prototypes/fine_tuning_spaCy/training_model.py @@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA): for start, end, label in annot["entities"]: span = doc.char_span(start, end, label=label, alignment_mode="contract") if span is None: - print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}") + print( + f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}" + ) else: ents.append(span) - print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}") + print( + f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}" + ) # label the text with the ents doc.ents = ents db.add(doc) diff --git a/prototypes/first-mcp-python/client.py b/prototypes/first-mcp-python/client.py index 3c0fe7b..697bd88 100644 --- a/prototypes/first-mcp-python/client.py +++ b/prototypes/first-mcp-python/client.py @@ -87,9 +87,9 @@ class Server: server_params = StdioServerParameters( command=command, args=self.config["args"], - env={**os.environ, **self.config["env"]} - if self.config.get("env") - else None, + env=( + {**os.environ, **self.config["env"]} if self.config.get("env") else None + ), ) try: stdio_transport = await self.exit_stack.enter_async_context( @@ -244,28 +244,23 @@ class LLMClient: formatted_messages = [] for msg in messages: print(msg) - formatted_messages.append({ - "role": msg["role"], - "content": msg["content"] - }) + formatted_messages.append({"role": msg["role"], "content": msg["content"]}) client = AzureOpenAI( - api_key=self.api_key, - api_version="2023-07-01-preview", - base_url=url + api_key=self.api_key, api_version="2023-07-01-preview", base_url=url ) response = client.chat.completions.create( - messages=formatted_messages, - model="gpt-4o-mini", - # response_format={"type": "json_object"} - # temperature=0.7, - # top_p=0.95, - # frequency_penalty=0, - # presence_penalty=0, - # max_tokens=800, - # stop="", - # stream=False - ) + messages=formatted_messages, + model="gpt-4o-mini", + # response_format={"type": "json_object"} + # temperature=0.7, + # top_p=0.95, + # frequency_penalty=0, + # presence_penalty=0, + # max_tokens=800, + # stop="", + # stream=False + ) if response.choices[0].message.content: print("response: " + response.choices[0].message.content) return response.choices[0].message.content diff --git a/prototypes/first-mcp-python/main.py b/prototypes/first-mcp-python/main.py index 3a17140..a69e6c5 100644 --- a/prototypes/first-mcp-python/main.py +++ b/prototypes/first-mcp-python/main.py @@ -1,5 +1,6 @@ # server.py from mcp.server.fastmcp import FastMCP + # Create an MCP server mcp = FastMCP("Demo") diff --git a/prototypes/mcp_validate-arc1/config.py b/prototypes/mcp_validate-arc1/config.py index 3b27716..46f5ef7 100644 --- a/prototypes/mcp_validate-arc1/config.py +++ b/prototypes/mcp_validate-arc1/config.py @@ -1,3 +1,3 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" -MODEL_ID = "gpt-35-turbo" \ No newline at end of file +MODEL_ID = "gpt-35-turbo" diff --git 
a/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py index ec97452..f3b04e6 100644 --- a/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py +++ b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py @@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json" OUTPUT_PATH = "mcp_spacy_validated_result.json" + def load_spacy_entities(): with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f: return json.load(f) + def load_pitchbook_pages(): with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f: return json.load(f) + def get_page_text(pages, page_number): for page in pages: if page.get("page") == page_number: return page.get("text", "") return "" + def normalize_entity(entity): - return ' '.join(entity.replace('\n', ' ').split()) + return " ".join(entity.replace("\n", " ").split()) + def validate_entity_with_exxeta(entity, page_num, text): prompt = ( f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n" f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n" f"Ziel-Formulierung:\n" - f"\"{entity}\"\n\n" + f'"{entity}"\n\n' f"Validierungsregeln:\n" f"- Groß- und Kleinschreibung ignorieren.\n" f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n" f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n" f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n" - f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n" + f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n' f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n" f"OCR-Text auf Seite {page_num}:\n{text}" ) headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - {"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text): print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}") return False + def run(): spacy_entities = load_spacy_entities() pitchbook_pages = load_pitchbook_pages() @@ -81,17 +90,20 @@ def run(): page_text = get_page_text(pitchbook_pages, page) is_valid = validate_entity_with_exxeta(entity, page, page_text) - validated_results.append({ - "label": entity_data.get("label"), - "entity": raw_entity, - "page": page, - "validated": is_valid - }) + validated_results.append( + { + "label": entity_data.get("label"), + "entity": raw_entity, + "page": page, + "validated": is_valid, + } + ) with open(OUTPUT_PATH, "w", encoding="utf-8") as f: json.dump(validated_results, f, indent=2, ensure_ascii=False) print(f"✅ Validation complete! 
Results saved to {OUTPUT_PATH}") + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git a/prototypes/mcp_validate-arc1/mcp_validate.py b/prototypes/mcp_validate-arc1/mcp_validate.py index 699e2ee..4dea0e5 100644 --- a/prototypes/mcp_validate-arc1/mcp_validate.py +++ b/prototypes/mcp_validate-arc1/mcp_validate.py @@ -10,19 +10,23 @@ KPI_SERVICE_MAP = { SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json" + def load_spacy_entities(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) + def load_exxeta_entities(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) + def normalize(text): if not text: return "" return text.strip().lower().replace(" ", "").replace("/", "/") + def validate_kpi(kpi, spacy_entities, exxeta_entities): results = [] @@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities): for ee in exxeta_entries: ee_entity = normalize(ee["entity"]) if se_entity == ee_entity: - results.append({ - "kpi": kpi, - "entity": se["entity"], - "page": page, - "validation_status": "validated" - }) + results.append( + { + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "validated", + } + ) matched = True break if not matched: - results.append({ - "kpi": kpi, - "entity": se["entity"], - "page": page, - "validation_status": "spacy-only" - }) + results.append( + { + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "spacy-only", + } + ) for ee in exxeta_entries: ee_entity = normalize(ee["entity"]) if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries): - results.append({ - "kpi": kpi, - "entity": ee["entity"], - "page": page, - "validation_status": "exxeta-only" - }) + results.append( + { + "kpi": kpi, + "entity": ee["entity"], + "page": page, + "validation_status": "exxeta-only", + } + ) return results + def save_results(results, filename): with open(filename, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) + def run(): spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH) exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH) @@ -96,5 +108,6 @@ def run(): save_results(all_results, "mcp_validated_result.json") print("✅ Validation complete! 
Output: mcp_validated_result.json") + if __name__ == "__main__": run() diff --git a/prototypes/merge_validate-arc2/config.py b/prototypes/merge_validate-arc2/config.py index 3b27716..46f5ef7 100644 --- a/prototypes/merge_validate-arc2/config.py +++ b/prototypes/merge_validate-arc2/config.py @@ -1,3 +1,3 @@ EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" -MODEL_ID = "gpt-35-turbo" \ No newline at end of file +MODEL_ID = "gpt-35-turbo" diff --git a/prototypes/merge_validate-arc2/exxeta_api.py b/prototypes/merge_validate-arc2/exxeta_api.py index 3d8f8b2..97fa88b 100644 --- a/prototypes/merge_validate-arc2/exxeta_api.py +++ b/prototypes/merge_validate-arc2/exxeta_api.py @@ -4,6 +4,7 @@ import json MODEL = "gpt-35-turbo" + def extract_risikoprofil_from_exxeta(pages_json): results = [] @@ -15,34 +16,36 @@ def extract_risikoprofil_from_exxeta(pages_json): continue prompt = ( - "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" - "Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n" - "Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, " - "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" - "Beispiele:\n" - "- \"Core, Core+\" → entity: \"Core, Core+\"\n" - "- \"Core/Core+\" → entity: \"Core/Core+\"\n" - "- \"Core and Core+\" → entity: \"Core and Core+\"\n\n" - "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" - f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n" - "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" - "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" - "TEXT:\n" + text + "Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n" + 'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n' + 'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, ' + "bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n" + "Beispiele:\n" + '- "Core, Core+" → entity: "Core, Core+"\n' + '- "Core/Core+" → entity: "Core/Core+"\n' + '- "Core and Core+" → entity: "Core and Core+"\n\n' + "Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n" + f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n' + "Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n" + "Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n" + "TEXT:\n" + text ) - headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {EXXETA_API_KEY}" + "Authorization": f"Bearer {EXXETA_API_KEY}", } payload = { "model": MODEL, "messages": [ - {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. 
Antworte nur mit validen JSON-Arrays.", + }, + {"role": "user", "content": prompt}, ], - "temperature": 0.0 + "temperature": 0.0, } url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" @@ -71,4 +74,4 @@ def extract_risikoprofil_from_exxeta(pages_json): except Exception as e: print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}") - return results \ No newline at end of file + return results diff --git a/prototypes/merge_validate-arc2/merge_logic.py b/prototypes/merge_validate-arc2/merge_logic.py index b6cf2c3..38d2d48 100644 --- a/prototypes/merge_validate-arc2/merge_logic.py +++ b/prototypes/merge_validate-arc2/merge_logic.py @@ -1,10 +1,11 @@ def normalize_entity(entity_str): if not entity_str: return "" - normalized = entity_str.replace('\n', ' ') - normalized = ''.join(normalized.lower().split()) + normalized = entity_str.replace("\n", " ") + normalized = "".join(normalized.lower().split()) return normalized + def merge_and_validate_entities(spacy_data, exxeta_data): merged = [] seen = set() @@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data): e_page = e["page"] # Match if normalized entity and page match - if (s["label"] == e["label"] and - s_entity_norm == e_entity_norm and - s_page == e_page): + if ( + s["label"] == e["label"] + and s_entity_norm == e_entity_norm + and s_page == e_page + ): - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "validated" - }) + merged.append( + { + "label": s["label"], + "entity": s["entity"], + "page": s_page, + "status": "validated", + } + ) seen.add((e["entity"], e_page)) found = True break # If no match found, add as single-source if not found: - merged.append({ - "label": s["label"], - "entity": s["entity"], - "page": s_page, - "status": "single-source", - "source": "spacy" - }) + merged.append( + { + "label": s["label"], + "entity": s["entity"], + "page": s_page, + "status": "single-source", + "source": "spacy", + } + ) # Add remaining Exxeta entities not already processed for e in exxeta_data: if (e["entity"], e["page"]) not in seen: - merged.append({ - "label": e["label"], - "entity": e["entity"], - "page": e["page"], - "status": "single-source", - "source": "exxeta" - }) + merged.append( + { + "label": e["label"], + "entity": e["entity"], + "page": e["page"], + "status": "single-source", + "source": "exxeta", + } + ) - return merged \ No newline at end of file + return merged diff --git a/prototypes/merge_validate-arc2/merge_validate.py b/prototypes/merge_validate-arc2/merge_validate.py index e9b2886..b3cea82 100644 --- a/prototypes/merge_validate-arc2/merge_validate.py +++ b/prototypes/merge_validate-arc2/merge_validate.py @@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities SPACY_PATH = "../fine_tuning_spaCy/entities_output.json" PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json" + def load_pitchbook_pages(): path = Path(PITCHBOOK_PATH) with open(path, "r", encoding="utf-8") as f: return json.load(f) + def save_json(data, filename): with open(filename, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) + def sort_by_page_number(entities): return sorted(entities, key=lambda x: x.get("page", 0)) + def run(): spacy_entities = load_spacy_entities(SPACY_PATH) pitchbook_pages = load_pitchbook_pages() @@ -33,5 +37,6 @@ def run(): print("- merged_result.json") print(f"- Total entities in merged result: {len(merged_sorted)}") + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git 
a/prototypes/merge_validate-arc2/spacy_extract.py b/prototypes/merge_validate-arc2/spacy_extract.py index 0ccc818..f6e9075 100644 --- a/prototypes/merge_validate-arc2/spacy_extract.py +++ b/prototypes/merge_validate-arc2/spacy_extract.py @@ -1,7 +1,8 @@ import json from pathlib import Path + def load_spacy_entities(path): path = Path(path) with open(path, "r", encoding="utf-8") as f: - return json.load(f) \ No newline at end of file + return json.load(f) diff --git a/prototypes/ocr/ocr.py b/prototypes/ocr/ocr.py index 3149035..6e4f0e8 100644 --- a/prototypes/ocr/ocr.py +++ b/prototypes/ocr/ocr.py @@ -11,15 +11,20 @@ log_folder = Path("logs") for folder in [output_folder, log_folder]: folder.mkdir(parents=True, exist_ok=True) + def extract_text_to_json(pdf_path: Path): json_path = output_folder / f"{pdf_path.stem}.json" with pdfplumber.open(pdf_path) as pdf: - pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)] + pages = [ + {"page": i + 1, "text": (page.extract_text() or "").strip()} + for i, page in enumerate(pdf.pages) + ] with open(json_path, "w", encoding="utf-8") as f: json.dump(pages, f, indent=2, ensure_ascii=False) print(f"📄 Text JSON saved: {json_path.name}") + def ocr_pdf(input_file: Path): output_file = output_folder / f"{input_file.stem}-OCR.pdf" log_file = log_folder / f"{input_file.stem}.log" @@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path): cmd = [ "ocrmypdf", "--force-ocr", - "--output-type", "pdfa", - "--language", "deu+eng", - "--sidecar", str(sidecar_txt), + "--output-type", + "pdfa", + "--language", + "deu+eng", + "--sidecar", + str(sidecar_txt), str(input_file), - str(output_file) + str(output_file), ] with open(log_file, "w") as log: @@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path): else: print(f"❌ OCR failed. 
See log: {log_file}") + if __name__ == "__main__": if not input_folder.exists(): print("Input folder does not exist!") @@ -54,4 +63,4 @@ if __name__ == "__main__": else: for pdf in pdfs: print(f"Processing: {pdf.name}") - ocr_pdf(pdf) \ No newline at end of file + ocr_pdf(pdf) diff --git a/prototypes/pdfplumber/tabellentext_holen.py b/prototypes/pdfplumber/tabellentext_holen.py index 7666df8..58513fd 100644 --- a/prototypes/pdfplumber/tabellentext_holen.py +++ b/prototypes/pdfplumber/tabellentext_holen.py @@ -1,4 +1,4 @@ -import pdfplumber +import pdfplumber pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf" @@ -10,7 +10,7 @@ pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf" # # Print the extracted text with preserved structure # print(f"Page {page.page_number}:\n{page_text}\n") -with pdfplumber.open(pdf_path) as pdf: +with pdfplumber.open(pdf_path) as pdf: for i, page in enumerate(pdf.pages): tables = page.extract_tables() diff --git a/prototypes/spacy-layout/extract_pitchbooks.py b/prototypes/spacy-layout/extract_pitchbooks.py index 7cd2139..37674df 100644 --- a/prototypes/spacy-layout/extract_pitchbooks.py +++ b/prototypes/spacy-layout/extract_pitchbooks.py @@ -1,6 +1,6 @@ # https://github.com/explosion/spacy-layout ### Run with: python extract_pitchbooks.py -import spacy +import spacy from spacy_layout import spaCyLayout from pathlib import Path import pandas as pd @@ -34,14 +34,14 @@ for ent in doc_ner.ents: break if ent.text.strip(): - ner_text_results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number - }) + ner_text_results.append( + {"label": ent.label_, "entity": ent.text.strip(), "page": page_number} + ) print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json") -(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False)) +(output_dir / "ner_text.json").write_text( + json.dumps(ner_text_results, indent=2, ensure_ascii=False) +) # 2. NER on table cells table_ner_results = [] @@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1): doc_cell = nlp(cell) for ent in doc_cell.ents: if ent.text.strip(): - table_ner_results.append({ - "label": ent.label_, - "entity": ent.text.strip(), - "page": page_number, - "table": i - }) + table_ner_results.append( + { + "label": ent.label_, + "entity": ent.text.strip(), + "page": page_number, + "table": i, + } + ) print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json") -(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False)) +(output_dir / "ner_tables.json").write_text( + json.dumps(table_ner_results, indent=2, ensure_ascii=False) +) -print("✅ Done! Extracted data saved to /output") \ No newline at end of file +print("✅ Done! Extracted data saved to /output")