2025-05-25 14:49:01 +02:00
7 changed files with 388 additions and 1 deletions
--- a/project/backend/.gitignore
+++ b/project/backend/.gitignore
@ -0,0 +1,175 @@
+# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
+
+# Logs
+
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Caches
+
+.cache
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+
+lib-cov
+
+# Coverage directory used by tools like istanbul
+
+coverage
+*.lcov
+
+# nyc test coverage
+
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+
+bower_components
+
+# node-waf configuration
+
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+
+build/Release
+
+# Dependency directories
+
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+
+web_modules/
+
+# TypeScript cache
+
+*.tsbuildinfo
+
+# Optional npm cache directory
+
+.npm
+
+# Optional eslint cache
+
+.eslintcache
+
+# Optional stylelint cache
+
+.stylelintcache
+
+# Microbundle cache
+
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+
+.node_repl_history
+
+# Output of 'npm pack'
+
+*.tgz
+
+# Yarn Integrity file
+
+.yarn-integrity
+
+# dotenv environment variable files
+
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# parcel-bundler cache (https://parceljs.org/)
+
+.parcel-cache
+
+# Next.js build output
+
+.next
+out
+
+# Nuxt.js build / generate output
+
+.nuxt
+dist
+
+# Gatsby files
+
+# Comment in the public line in if your project uses Gatsby and not Next.js
+
+# https://nextjs.org/blog/next-9-1#public-directory-support
+
+# public
+
+# vuepress build output
+
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+
+.temp
+
+# Docusaurus cache and generated files
+
+.docusaurus
+
+# Serverless directories
+
+.serverless/
+
+# FuseBox cache
+
+.fusebox/
+
+# DynamoDB Local files
+
+.dynamodb/
+
+# TernJS port file
+
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+
+.vscode-test
+
+# yarn v2
+
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+
+# IntelliJ based IDEs
+.idea
+
+# Finder (MacOS) folder config
+.DS_Store
--- a/project/backend/exxetaGPT/Dockerfile
+++ b/project/backend/exxetaGPT/Dockerfile
@ -0,0 +1,14 @@
+# Container image for the ExxetaGPT extraction service.
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install dependencies before copying the sources so this layer can be reused
+# from the build cache as long as requirements.txt is unchanged.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# NOTE(review): this copies the whole build context; presumably a local .env
+# (holding the API key) should be excluded via .dockerignore - confirm.
+COPY . .
+
+# Disable Python's output buffering so output reaches the container logs immediately.
+ENV PYTHONUNBUFFERED=1
+
+# Port that app.py passes to app.run().
+EXPOSE 5050
+
+CMD ["python", "app.py"]
--- a/project/backend/exxetaGPT/Readme.md
+++ b/project/backend/exxetaGPT/Readme.md
@ -0,0 +1,41 @@
+# ExxetaGPT Microservice
+
+## Lokaler Start (ohne Container)
+
+### 1. Voraussetzungen
+
+- Python 3.11+
+- Virtuelle Umgebung (empfohlen)
+
+```bash
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+### 2. .env-Datei erstellen
+Lege im Projektverzeichnis eine `.env`-Datei mit dem Exxeta-API-Key an. Der Service liest ihn aus der Variable `API_KEY`:
+
+```
+API_KEY=<dein Exxeta-JWT>
+```
+
+(Der API-Key ist ein JWT von Exxeta – nicht veröffentlichen!)
+
+### 3. Starten
+```bash
+python app.py
+```
+
+## Verwendung als Docker-Container
+
+### 1. Build
+```bash
+docker build -t exxeta-gpt .
+```
+
+### 2. Starten
+```bash
+docker run -p 5050:5050 --env-file .env exxeta-gpt
+```
+
+## Beispielaufruf:
+```bash
+curl -X POST http://localhost:5050/extract \
+  -H "Content-Type: application/json" \
+  -d @text-per-page.json
+```
--- a/project/backend/exxetaGPT/app.py
+++ b/project/backend/exxetaGPT/app.py
@ -0,0 +1,15 @@
+from flask import Flask, request, jsonify
+from services.extractExxeta import extract_with_exxeta
+
+app = Flask(__name__)
+
+@app.route('/extract', methods=['POST'])
+def extract_text_from_ocr_json():
+    json_ocr = request.get_json()
+    json_data = extract_with_exxeta(json_ocr)
+    return jsonify(json_data), 200
+    #TO DO: Anbindung an das Merge & Validate Service
+
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=5050, debug=True)
--- a/project/backend/exxetaGPT/requirements.txt
+++ b/project/backend/exxetaGPT/requirements.txt
@ -0,0 +1,3 @@
+python-dotenv
+flask
+requests
--- a/project/backend/exxetaGPT/services/extractExxeta.py
+++ b/project/backend/exxetaGPT/services/extractExxeta.py
@ -0,0 +1,140 @@
+import json
+import logging
+import os
+import time
+
+import requests
+from dotenv import load_dotenv
+
+# Model name; used both in the request payload and in the deployment URL path.
+MODEL = "gpt-4o-mini"
+EXXETA_BASE_URL= "https://ai.exxeta.com/api/v2/azure/openai"
+# Load variables from a .env file before the API key is read below.
+load_dotenv()
+# Sent as the Bearer token on every request; None when API_KEY is not set.
+EXXETA_API_KEY = os.getenv("API_KEY")
+
+# Maximum number of request attempts per page.
+MAX_RETRIES = 3
+# Timeout passed to requests.post.
+TIMEOUT = 30
+
+def extract_with_exxeta(pages_json):
+    results = []
+
+    for page_data in pages_json:
+        page_num = page_data.get("page")
+        text = page_data.get("text", "").strip()
+
+        if not text:
+            continue
+
+        if page_num == 1:
+            prompt = (
+                "Bitte extrahiere gezielt folgende drei Kennzahlen aus dem folgenden Pitchbook-Text:\n\n"
+                "- FONDSNAME\n"
+                "- FONDSMANAGER\n"
+                "- DATUM\n\n"
+                "Extrahiere **nur** diese drei Werte, wenn sie im Text explizit genannt werden.\n\n"
+                "WICHTIG:\n"
+                "- Gib exakt eine Entität pro Kennzahl an.\n"
+                "- Falls eine Information nicht eindeutig erkennbar ist, lass sie weg.\n"
+                "- Gib die Antwort als **JSON-Array** im folgenden Format zurück:\n\n"
+                "[\n"
+                "  {\n"
+                "    \"label\": \"FONDSNAME\",\n"
+                "    \"entity\": \"...\",\n"
+                f"    \"page\": {page_num},\n"
+                "  },\n"
+                "  ...\n"
+                "]\n\n"
+                "Nur JSON-Antwort – keine Kommentare, keine Erklärungen.\n\n"
+                f"TEXT:\n{text}"
+            )
+        else:
+            prompt = (
+                "Bitte extrahiere relevante Fondskennzahlen aus dem folgenden Pitchbook-Text. "
+                "Analysiere den Text sorgfältig, um **nur exakt benannte und relevante Werte** zu extrahieren.\n\n"
+            
+                "ZU EXTRAHIERENDE KENNZAHLEN (immer exakt wie unten angegeben):\n"
+                "- FONDSNAME\n"
+                "- FONDSMANAGER\n"
+                "- AIFM (z. B. Name Kapitalverwaltungsgesellschaft)\n"
+                "- DATUM\n"
+                "- RISIKOPROFIL (z. B. CORE, CORE+, VALUE-ADDED, OPPORTUNISTISCH)\n"
+                "- ARTIKEL (z. B. ARTIKEL 6, 8, 9)\n"
+                "- ZIELRENDITE\n"
+                "- RENDITE\n"
+                "- ZIELAUSSCHÜTTUNG\n"
+                "- AUSSCHÜTTUNG\n"
+                "- LAUFZEIT\n"
+                "- LTV\n"
+                "- MANAGEMENTGEBÜHREN (ggf. mit Staffelung und Bezug auf NAV/GAV)\n"
+                "- SEKTORENALLOKATION (z. B. BÜRO, LOGISTIK, WOHNEN... inkl. %-Angaben)\n"
+                "- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n"
+            
+                "WICHTIG:\n"
+                "- Gib **nur eine Entität pro Kennzahl** an – keine Listen oder Interpretationen.\n"
+                "- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n"
+                "- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n"
+                "- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n"
+                "- Jeder gefundene Wert muss einem der obigen Label **eindeutig zuordenbar** sein.\n\n"
+            
+                "FORMAT:\n"
+                "Antworte als **reines JSON-Array** mit folgendem Format:\n"
+                "[\n"
+                "  {\n"
+                "    \"label\": \"Kennzahlname (exakt wie oben)\",\n"
+                "    \"entity\": \"Wert aus dem Text (exakt im Original)\",\n"
+                f"    \"page\": {page_num},\n"
+                "  },\n"
+                "  ...\n"
+                "]\n\n"
+            
+                f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n"
+                f"Nur JSON-Antwort – keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
+                f"TEXT:\n{text}"
+            )
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {EXXETA_API_KEY}"
+        }
+
+        payload = {
+            "model": MODEL,
+            "messages": [
+                {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": 0.0
+        }
+
+        url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
+
+        for attempt in range(1, MAX_RETRIES + 1):
+            try:
+                response = requests.post(url, headers=headers, json=payload, timeout=TIMEOUT)
+                response.raise_for_status()
+
+                content = response.json()["choices"][0]["message"]["content"]
+                content = content.strip()
+
+                if content.startswith("```json"):
+                    content = content.split("```json")[1]
+                if content.endswith("```"):
+                    content = content.split("```")[0]
+                content = content.strip()
+
+                try:
+                    page_results = json.loads(content)
+                except json.JSONDecodeError:
+                    page_results = []
+
+                if isinstance(page_results, list):
+                    results.extend(page_results)
+                break
+
+            except requests.exceptions.RequestException as e:
+                if attempt == MAX_RETRIES:
+                    results.extend([])
+            except Exception as e:
+                if attempt == MAX_RETRIES:
+                    results.extend([])
+
+    json_result = json.dumps(results, indent=2, ensure_ascii=False)
+    return json_result
--- a/prototypes/arc2_prototype/spacy_service/spacy_extractor.py
+++ b/prototypes/arc2_prototype/spacy_service/spacy_extractor.py
@ -11,7 +11,6 @@ model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
 nlp = spacy.load(model_path)
 input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
 input_pdf = Path(input_pdf_path)
-doc = fitz.open(input_pdf)


 def extract_with_spacy(pages_json):