Add ExxetaGPT microservice with Docker support and local .env configuration. Implement Flask-based API in app.py. Integrate extractExxeta.py with ExxetaGPT. Add Dockerfile for containerized deployment.
parent
cfb67439ba
commit
cc5e57b953
@@ -0,0 +1,175 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Caches
.cache

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store

@@ -0,0 +1,14 @@
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV PYTHONUNBUFFERED=1

EXPOSE 5050

CMD ["python", "app.py"]

@@ -0,0 +1,41 @@
# ExxetaGPT Microservice

## Running locally (without a container)

### 1. Prerequisites

- Python 3.11+
- Virtual environment (recommended)

```bash
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

### 2. Create a .env file

Create a .env file in the project directory that contains the Exxeta API key.
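For example (a minimal sketch: the variable name `API_KEY` is the one read by `services/extractExxeta.py`, the value is a placeholder):

```
# .env (do not commit this file; it is already listed in .gitignore)
API_KEY=<your Exxeta JWT>
```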

(The API key is a JWT issued by Exxeta. Do not publish it!)

### 3. Start

```bash
python app.py
```

## Running as a Docker container

### 1. Build

```bash
docker build -t exxeta-gpt .
```

### 2. Run

```bash
docker run -p 5050:5050 --env-file .env exxeta-gpt
```

## Example request

```bash
curl -X POST http://localhost:5050/extract \
  -H "Content-Type: application/json" \
  -d @text-per-page.json
```
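
The request body is the OCR output as a JSON array with one entry per page; `extract_with_exxeta` reads only the `page` and `text` fields of each entry. A minimal sketch of `text-per-page.json` (all values are placeholders):

```json
[
  { "page": 1, "text": "Beispiel Real Estate Fund I, verwaltet von Beispiel Invest KVG ..." },
  { "page": 2, "text": "Zielrendite: 5,5 % p.a., LTV: max. 50 % ..." }
]
```

The response is a JSON array of the extracted entities in the label/entity/page format requested in the prompts, for example:

```json
[
  { "label": "FONDSNAME", "entity": "Beispiel Real Estate Fund I", "page": 1 },
  { "label": "ZIELRENDITE", "entity": "5,5 % p.a.", "page": 2 }
]
```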
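
The same request from Python, as a minimal sketch (URL and file name match the curl example above):

```python
import json

import requests

# Load the OCR pages and send them to the extraction endpoint.
with open("text-per-page.json", encoding="utf-8") as f:
    pages = json.load(f)

resp = requests.post("http://localhost:5050/extract", json=pages, timeout=120)
resp.raise_for_status()

# A list of {"label": ..., "entity": ..., "page": ...} entries.
print(resp.json())
```
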
@@ -0,0 +1,15 @@
from flask import Flask, request, jsonify
from services.extractExxeta import extract_with_exxeta

app = Flask(__name__)

@app.route('/extract', methods=['POST'])
def extract_text_from_ocr_json():
    # Expects the OCR result (a JSON array of pages) and returns the extracted entities.
    json_ocr = request.get_json()
    json_data = extract_with_exxeta(json_ocr)
    return jsonify(json_data), 200
# TODO: connect this service to the Merge & Validate service


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050, debug=True)

@@ -0,0 +1,3 @@
python-dotenv
flask
requests

@@ -0,0 +1,140 @@
import requests
import json
import os
import time

from dotenv import load_dotenv

MODEL = "gpt-4o-mini"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
load_dotenv()
EXXETA_API_KEY = os.getenv("API_KEY")

MAX_RETRIES = 3
TIMEOUT = 30


def extract_with_exxeta(pages_json):
    results = []

    for page_data in pages_json:
        page_num = page_data.get("page")
        text = page_data.get("text", "").strip()

        if not text:
            continue

        if page_num == 1:
            prompt = (
                "Bitte extrahiere gezielt folgende drei Kennzahlen aus dem folgenden Pitchbook-Text:\n\n"
                "- FONDSNAME\n"
                "- FONDSMANAGER\n"
                "- DATUM\n\n"
                "Extrahiere **nur** diese drei Werte, wenn sie im Text explizit genannt werden.\n\n"
                "WICHTIG:\n"
                "- Gib exakt eine Entität pro Kennzahl an.\n"
                "- Falls eine Information nicht eindeutig erkennbar ist, lass sie weg.\n"
                "- Gib die Antwort als **JSON-Array** im folgenden Format zurück:\n\n"
                "[\n"
                "  {\n"
                "    \"label\": \"FONDSNAME\",\n"
                "    \"entity\": \"...\",\n"
                f"    \"page\": {page_num}\n"
                "  },\n"
                "  ...\n"
                "]\n\n"
                "Nur JSON-Antwort – keine Kommentare, keine Erklärungen.\n\n"
                f"TEXT:\n{text}"
            )
        else:
            prompt = (
                "Bitte extrahiere relevante Fondskennzahlen aus dem folgenden Pitchbook-Text. "
                "Analysiere den Text sorgfältig, um **nur exakt benannte und relevante Werte** zu extrahieren.\n\n"

                "ZU EXTRAHIERENDE KENNZAHLEN (immer exakt wie unten angegeben):\n"
                "- FONDSNAME\n"
                "- FONDSMANAGER\n"
                "- AIFM (z. B. Name Kapitalverwaltungsgesellschaft)\n"
                "- DATUM\n"
                "- RISIKOPROFIL (z. B. CORE, CORE+, VALUE-ADDED, OPPORTUNISTISCH)\n"
                "- ARTIKEL (z. B. ARTIKEL 6, 8, 9)\n"
                "- ZIELRENDITE\n"
                "- RENDITE\n"
                "- ZIELAUSSCHÜTTUNG\n"
                "- AUSSCHÜTTUNG\n"
                "- LAUFZEIT\n"
                "- LTV\n"
                "- MANAGEMENTGEBÜHREN (ggf. mit Staffelung und Bezug auf NAV/GAV)\n"
                "- SEKTORENALLOKATION (z. B. BÜRO, LOGISTIK, WOHNEN... inkl. %-Angaben)\n"
                "- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n"

                "WICHTIG:\n"
                "- Gib **nur eine Entität pro Kennzahl** an – keine Listen oder Interpretationen.\n"
                "- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n"
                "- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n"
                "- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n"
                "- Jeder gefundene Wert muss einem der obigen Label **eindeutig zuordenbar** sein.\n\n"

                "FORMAT:\n"
                "Antworte als **reines JSON-Array** mit folgendem Format:\n"
                "[\n"
                "  {\n"
                "    \"label\": \"Kennzahlname (exakt wie oben)\",\n"
                "    \"entity\": \"Wert aus dem Text (exakt im Original)\",\n"
                f"    \"page\": {page_num}\n"
                "  },\n"
                "  ...\n"
                "]\n\n"

                "Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n"
                "Nur JSON-Antwort – keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
                f"TEXT:\n{text}"
            )

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {EXXETA_API_KEY}"
        }

        payload = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.0
        }

        url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=TIMEOUT)
                response.raise_for_status()

                content = response.json()["choices"][0]["message"]["content"]
                content = content.strip()

                # Strip an optional ```json ... ``` fence around the model output.
                if content.startswith("```json"):
                    content = content.split("```json")[1]
                if content.endswith("```"):
                    content = content.split("```")[0]
                content = content.strip()

                try:
                    page_results = json.loads(content)
                except json.JSONDecodeError:
                    page_results = []

                if isinstance(page_results, list):
                    results.extend(page_results)
                    break

            except requests.exceptions.RequestException:
                # Network/HTTP error: give up on this page after the last attempt,
                # otherwise back off briefly and retry.
                if attempt == MAX_RETRIES:
                    break
                time.sleep(2 * attempt)
            except Exception:
                # Unexpected response shape etc.: same retry policy.
                if attempt == MAX_RETRIES:
                    break
                time.sleep(2 * attempt)

    # Return the list of extracted entities; the Flask route serializes it with jsonify.
    return results

@@ -11,7 +11,6 @@ model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
nlp = spacy.load(model_path)
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
input_pdf = Path(input_pdf_path)
doc = fitz.open(input_pdf)


def extract_with_spacy(pages_json):