Compare commits
No commits in common. "908050a2fb2f018bcab2321e0bf1580592869085" and "fd06fc1821b2fbab0c986072dbdc3318a3b553cc" have entirely different histories.
908050a2fb
...
fd06fc1821
|
|
@ -2,10 +2,4 @@
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
# Python virtual environments
|
# Python virtual environments
|
||||||
**/.venv/
|
.venv/
|
||||||
**/venv
|
|
||||||
**/__pycache__
|
|
||||||
**/.env
|
|
||||||
**/node_modules
|
|
||||||
**/build
|
|
||||||
**/dist
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
repos:
|
|
||||||
- repo: https://github.com/psf/black
|
|
||||||
rev: 23.3.0
|
|
||||||
hooks:
|
|
||||||
- id: black
|
|
||||||
language_version: python3
|
|
||||||
files: ^project/backend/
|
|
||||||
|
|
||||||
- repo: https://github.com/pycqa/flake8
|
|
||||||
rev: 6.1.0
|
|
||||||
hooks:
|
|
||||||
- id: flake8
|
|
||||||
files: ^project/backend/
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
services:
|
|
||||||
backend:
|
|
||||||
build: ./project/backend
|
|
||||||
container_name: fundfuechse-backend
|
|
||||||
ports:
|
|
||||||
- "5000:5000"
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
# frontend:
|
|
||||||
# build: ./project/frontend
|
|
||||||
# container_name: fundfuechse-frontend
|
|
||||||
# ports:
|
|
||||||
# - "3000:80"
|
|
||||||
# restart: always
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
API_KEY=
|
|
||||||
DATABASE_URL=postgresql://admin:admin@db:5432
|
|
||||||
POSTGRES_PASSWORD=admin
|
|
||||||
POSTGRES_USER=admin
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
# 1. Base image: slim Alpine build of Python 3.11
FROM python:3.11-alpine

# 2. Set the working directory inside the container
WORKDIR /app

# 3. Production-style WSGI server (gunicorn) instead of the Flask dev server
RUN pip install gunicorn

# 4. Copy requirements.txt first and install packages
#    (separate layer so dependency installs are cached across code changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt


# 5. Copy the source code (e.g. app.py)
COPY . .

# Unbuffered stdout/stderr so logs show up immediately in `docker logs`
ENV PYTHONUNBUFFERED=1
EXPOSE 5000

CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
|
|
||||||
|
|
@ -1,85 +0,0 @@
|
||||||
## Setup
|
|
||||||
|
|
||||||
### Voraussetzungen
|
|
||||||
|
|
||||||
- Python 3.11+
|
|
||||||
- pip
|
|
||||||
- Docker (Desktop)
|
|
||||||
- Optional: `pre-commit`
|
|
||||||
|
|
||||||
### Abhängigkeiten installieren
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# Codequalität (lokal prüfen)
|
|
||||||
black app.py
|
|
||||||
flake8 app.py
|
|
||||||
|
|
||||||
|
|
||||||
## Anwendung starten
|
|
||||||
|
|
||||||
### Option 1: Lokal
|
|
||||||
|
|
||||||
1. Abhängigkeiten installieren:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Flask-App starten:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python app.py
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Aufrufen im Browser:
|
|
||||||
|
|
||||||
```
|
|
||||||
http://localhost:5000/
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Option 2: Mit Docker
|
|
||||||
|
|
||||||
1. Image bauen:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker build -t fundfuechse-backend .
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Container starten:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker run -p 5000:5000 fundfuechse-backend
|
|
||||||
```
|
|
||||||
|
|
||||||
Die API läuft dann unter:
|
|
||||||
|
|
||||||
```
|
|
||||||
http://localhost:5000/
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Option 3: Mit docker-compose
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose up --build
|
|
||||||
```
|
|
||||||
|
|
||||||
Danach ist der Service erreichbar unter:
|
|
||||||
|
|
||||||
```
|
|
||||||
http://localhost:5000/
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Testaufruf per curl (PDF hochladen)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl.exe -X POST -F "file=@Pitchbook 1.pdf" http://localhost:5000/upload
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
||||||
from flask import Flask
|
|
||||||
import os
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from controller import register_routes
|
|
||||||
from model.database import init_db
|
|
||||||
|
|
||||||
# Flask application entry point: configures the database, registers the API
# blueprints, and exposes a health endpoint for container healthchecks.
app = Flask(__name__)

# Pull configuration from a local .env file (e.g. DATABASE_URL).
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")  # NOTE(review): None if .env is missing — confirm deployment always provides it

app.config["SQLALCHEMY_DATABASE_URI"] = DATABASE_URL
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = True
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024  # 100 MB upload cap (PDF uploads)

# Bind SQLAlchemy to this app and create any missing tables.
init_db(app)

# Attach all API blueprints (KPI settings, pitch books, spaCy files).
register_routes(app)


@app.route("/health")
def health_check():
    """Liveness probe; always returns the plain string "OK"."""
    return "OK"


# host='0.0.0.0' is required so the server is reachable from outside the
# Docker container.
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0")
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
from controller.spacy_contoller import spacy_controller
|
|
||||||
from controller.kpi_setting_controller import kpi_setting_controller
|
|
||||||
from controller.pitch_book_controller import pitch_book_controller
|
|
||||||
|
|
||||||
|
|
||||||
def register_routes(app):
    """Attach every API blueprint of this service to *app*."""
    for blueprint in (
        kpi_setting_controller,
        pitch_book_controller,
        spacy_controller,
    ):
        app.register_blueprint(blueprint)
|
|
||||||
|
|
@ -1,116 +0,0 @@
|
||||||
from flask import Blueprint, request, jsonify
|
|
||||||
from model.database import db
|
|
||||||
from model.kpi_setting_model import KPISettingModel, KPISettingType
|
|
||||||
|
|
||||||
|
|
||||||
kpi_setting_controller = Blueprint(
|
|
||||||
"kpi_settings", __name__, url_prefix="/api/kpi_setting"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@kpi_setting_controller.route("/", methods=["GET"])
def get_all_kpi_settings():
    """Return every KPI setting as a JSON list."""
    settings = KPISettingModel.query.all()
    payload = [setting.to_dict() for setting in settings]
    return jsonify(payload), 200
|
|
||||||
|
|
||||||
|
|
||||||
@kpi_setting_controller.route("/<int:id>", methods=["GET"])
def get_kpi_setting(id):
    """Return one KPI setting by primary key, or a 404."""
    setting = KPISettingModel.query.get_or_404(id)
    return jsonify(setting.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@kpi_setting_controller.route("/", methods=["POST"])
def create_kpi_setting():
    """Create a KPI setting from the JSON request body.

    Returns 400 for a missing body, missing field, or invalid type;
    409 for a duplicate name; 201 with the created record on success.
    """
    data = request.json
    if not data:
        return jsonify({"error": "No data provided"}), 400

    # Every one of these keys must be present; report the first one missing.
    for field in ("name", "description", "mandatory", "type", "translation", "example"):
        if field not in data:
            return jsonify({"error": f"Missing required field: {field}"}), 400

    # Names are unique across KPI settings.
    if KPISettingModel.query.filter_by(name=data["name"]).first():
        return jsonify({"error": "KPI Setting with this name already exists"}), 409

    # The type must map onto a KPISettingType member.
    try:
        kpi_type = KPISettingType(data["type"])
    except ValueError:
        valid_types = [t.value for t in KPISettingType]
        return jsonify({"error": f"Invalid type. Must be one of: {valid_types}"}), 400

    setting = KPISettingModel(
        name=data["name"],
        description=data["description"],
        mandatory=data["mandatory"],
        type=kpi_type,
        translation=data["translation"],
        example=data["example"],
    )
    db.session.add(setting)
    db.session.commit()
    return jsonify(setting.to_dict()), 201
|
|
||||||
|
|
||||||
|
|
||||||
@kpi_setting_controller.route("/<int:id>", methods=["PUT"])
def update_kpi_setting(id):
    """Partially update a KPI setting; only keys present in the JSON
    body are changed.

    Returns 400 for a missing body or invalid type, 409 for a duplicate
    name, 200 with the updated record on success.
    """
    setting = KPISettingModel.query.get_or_404(id)
    data = request.json
    if not data:
        return jsonify({"error": "No data provided"}), 400

    # A rename must not collide with an existing setting's name.
    if "name" in data and data["name"] != setting.name:
        if KPISettingModel.query.filter_by(name=data["name"]).first():
            return jsonify({"error": "KPI Setting with this name already exists"}), 409
        setting.name = data["name"]

    if "description" in data:
        setting.description = data["description"]
    if "mandatory" in data:
        setting.mandatory = data["mandatory"]

    if "type" in data:
        try:
            setting.type = KPISettingType(data["type"])
        except ValueError:
            valid_types = [t.value for t in KPISettingType]
            return (
                jsonify({"error": f"Invalid type. Must be one of: {valid_types}"}),
                400,
            )

    if "translation" in data:
        setting.translation = data["translation"]
    if "example" in data:
        setting.example = data["example"]

    db.session.commit()
    return jsonify(setting.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@kpi_setting_controller.route("/<int:id>", methods=["DELETE"])
def delete_kpi_setting(id):
    """Delete a KPI setting by primary key (404 if it does not exist)."""
    setting = KPISettingModel.query.get_or_404(id)
    db.session.delete(setting)
    db.session.commit()
    return jsonify({"message": f"KPI Setting {id} deleted successfully"}), 200
|
|
||||||
|
|
@ -1,97 +0,0 @@
|
||||||
from flask import Blueprint, request, jsonify, send_file
|
|
||||||
from model.database import db
|
|
||||||
from model.pitch_book_model import PitchBookModel
|
|
||||||
from io import BytesIO
|
|
||||||
from werkzeug.utils import secure_filename
|
|
||||||
import puremagic
|
|
||||||
|
|
||||||
|
|
||||||
pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book")
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/", methods=["GET"])
def get_all_files():
    """Return metadata for every stored pitch book (no file bytes)."""
    books = PitchBookModel.query.all()
    return jsonify([book.to_dict() for book in books]), 200
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/<int:id>", methods=["GET"])
def get_file(id):
    """Return metadata for one pitch book, or a 404."""
    book = PitchBookModel.query.get_or_404(id)
    return jsonify(book.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/<int:id>/download", methods=["GET"])
def download_file(id):
    """Stream the stored PDF bytes back as a downloadable attachment."""
    book = PitchBookModel.query.get_or_404(id)
    buffer = BytesIO(book.file)
    return send_file(buffer, download_name=book.filename, as_attachment=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/", methods=["POST"])
def upload_file():
    """Upload a PDF pitch book.

    Expects a multipart field named "file". Returns 201 with the new
    record, or 400 when the part is missing, empty, or not a PDF.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part in the request"}), 400

    uploaded_file = request.files["file"]
    if uploaded_file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    # Read the payload once; the same bytes are validated and stored.
    file_data = uploaded_file.read()

    # Validate by content, not extension. puremagic raises for data it
    # cannot identify, so unidentifiable content also lands at the 400 below.
    try:
        is_pdf = puremagic.from_string(file_data, mime=True) == "application/pdf"
    except Exception as e:
        print(e)
        is_pdf = False

    # BUG FIX: previously a non-PDF with a *recognizable* mime type (e.g. a
    # PNG) skipped the success branch, fell off the end of the function, and
    # returned None — a 500 instead of a 400. DB errors were likewise masked
    # as "Invalid file format"; they now propagate as real server errors.
    if not is_pdf:
        return jsonify({"error": "Invalid file format. Only PDF files are accepted"}), 400

    new_file = PitchBookModel(
        filename=secure_filename(uploaded_file.filename or ""), file=file_data
    )
    db.session.add(new_file)
    db.session.commit()
    return jsonify(new_file.to_dict()), 201
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/<int:id>", methods=["PUT"])
def update_file(id):
    """Partially update a stored pitch book.

    Accepts an optional multipart "file" part (content must be a PDF to
    replace the stored bytes) and an optional form field "kpi".
    Returns 200 with the updated record.
    """
    file = PitchBookModel.query.get_or_404(id)

    if "file" in request.files:
        uploaded_file = request.files["file"]
        if uploaded_file.filename != "":
            # NOTE(review): the filename is updated before the content is
            # validated, and without secure_filename (unlike upload_file) —
            # an invalid PDF still renames the record. Confirm intended.
            file.filename = uploaded_file.filename

            # Read file data once
            file_data = uploaded_file.read()
            try:
                if (
                    uploaded_file
                    and puremagic.from_string(file_data, mime=True) == "application/pdf"
                ):
                    file.file = file_data
            except Exception as e:
                # Validation errors are swallowed: the request still succeeds
                # without replacing the stored bytes.
                print(e)

    if "kpi" in request.form:
        file.kpi = request.form.get("kpi")

    db.session.commit()

    return jsonify(file.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@pitch_book_controller.route("/<int:id>", methods=["DELETE"])
def delete_file(id):
    """Remove a stored pitch book by primary key (404 if absent)."""
    book = PitchBookModel.query.get_or_404(id)
    db.session.delete(book)
    db.session.commit()
    return jsonify({"message": f"File {id} deleted successfully"}), 200
|
|
||||||
|
|
@ -1,93 +0,0 @@
|
||||||
from flask import Blueprint, request, jsonify, send_file
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
from model.spacy_model import SpacyModel
|
|
||||||
import puremagic
|
|
||||||
from werkzeug.utils import secure_filename
|
|
||||||
from model.database import db
|
|
||||||
|
|
||||||
|
|
||||||
spacy_controller = Blueprint("spacy", __name__, url_prefix="/api/spacy")
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/", methods=["GET"])
def get_all_files():
    """Return metadata for every stored spaCy file (no file bytes)."""
    stored = SpacyModel.query.all()
    return jsonify([entry.to_dict() for entry in stored]), 200
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/<int:id>", methods=["GET"])
def get_file(id):
    """Return metadata for one stored spaCy file, or a 404."""
    entry = SpacyModel.query.get_or_404(id)
    return jsonify(entry.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/<int:id>/download", methods=["GET"])
def download_file(id):
    """Stream the stored file bytes back as a downloadable attachment."""
    entry = SpacyModel.query.get_or_404(id)
    buffer = BytesIO(entry.file)
    return send_file(buffer, download_name=entry.filename, as_attachment=True)
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/", methods=["POST"])
def upload_file():
    """Upload a PDF for spaCy processing.

    Expects a multipart field named "file". Returns 201 with the new
    record, or 400 when the part is missing, empty, or not a PDF.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part in the request"}), 400

    uploaded_file = request.files["file"]
    if uploaded_file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    # Read the payload once; the same bytes are validated and stored.
    file_data = uploaded_file.read()

    # BUG FIX: this endpoint accepted *any* content while its error message
    # promises "Only PDF files are accepted", and `puremagic` was imported
    # for exactly this check (the pitch_book controller performs it).
    # Validate the content by magic bytes, mirroring the sibling controller.
    # (The stray debug `print(request)` was also removed.)
    try:
        is_pdf = puremagic.from_string(file_data, mime=True) == "application/pdf"
    except Exception as e:
        print(e)
        is_pdf = False

    if not is_pdf:
        return jsonify({"error": "Invalid file format. Only PDF files are accepted"}), 400

    new_file = SpacyModel(filename=secure_filename(uploaded_file.filename or ""), file=file_data)
    db.session.add(new_file)
    db.session.commit()
    return jsonify(new_file.to_dict()), 201
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/<int:id>", methods=["PUT"])
def update_file(id):
    """Partially update a stored spaCy file.

    Accepts an optional multipart "file" part (content must be a PDF to
    replace the stored bytes) and an optional form field "kpi".
    Returns 200 with the updated record.
    """
    file = SpacyModel.query.get_or_404(id)

    if "file" in request.files:
        uploaded_file = request.files["file"]
        if uploaded_file.filename != "":
            # NOTE(review): the filename is updated before the content is
            # validated and without secure_filename — an invalid PDF still
            # renames the record. Confirm intended.
            file.filename = uploaded_file.filename

            # Read file data once
            file_data = uploaded_file.read()
            try:
                if (
                    uploaded_file
                    and puremagic.from_string(file_data, mime=True) == "application/pdf"
                ):
                    file.file = file_data
            except Exception as e:
                # Validation errors are swallowed: the request still succeeds
                # without replacing the stored bytes.
                print(e)

    # NOTE(review): SpacyModel (as defined in this file) has no `kpi`
    # column; this assignment creates a plain instance attribute that is
    # never persisted — confirm whether the column is missing or this
    # branch is dead code copied from the pitch_book controller.
    if "kpi" in request.form:
        file.kpi = request.form.get("kpi")

    db.session.commit()

    return jsonify(file.to_dict()), 200
|
|
||||||
|
|
||||||
|
|
||||||
@spacy_controller.route("/<int:id>", methods=["DELETE"])
def delete_file(id):
    """Remove a stored spaCy file by primary key (404 if absent)."""
    entry = SpacyModel.query.get_or_404(id)
    db.session.delete(entry)
    db.session.commit()
    return jsonify({"message": f"File {id} deleted successfully"}), 200
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
|
|
||||||
services:
|
|
||||||
db:
|
|
||||||
image: postgres
|
|
||||||
environment:
|
|
||||||
POSTGRES_PASSWORD: admin
|
|
||||||
POSTGRES_USER: admin
|
|
||||||
ports:
|
|
||||||
- "5432:5432"
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
from flask_sqlalchemy import SQLAlchemy
|
|
||||||
from sqlalchemy.orm import DeclarativeBase
|
|
||||||
|
|
||||||
|
|
||||||
class Base(DeclarativeBase):
    """Shared SQLAlchemy declarative base for all ORM models in this service."""

    pass
|
|
||||||
|
|
||||||
|
|
||||||
db = SQLAlchemy(model_class=Base)
|
|
||||||
|
|
||||||
|
|
||||||
def init_db(app):
    """Bind the module-level SQLAlchemy instance to *app* and create any
    tables not yet present.

    create_all only covers models that have been imported by the time
    this runs, so call it after all model modules are loaded.
    """
    db.init_app(app)
    with app.app_context():
        db.create_all()
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
||||||
from model.database import db
|
|
||||||
from sqlalchemy.orm import Mapped, mapped_column
|
|
||||||
from sqlalchemy import Enum as SQLAlchemyEnum
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
|
|
||||||
class KPISettingType(Enum):
    """Closed set of value kinds a KPI setting can take.

    Persisted as a native database enum on KPISettingModel.type; the
    member order is also the order shown in controller error messages,
    so do not reorder casually.
    """

    NUMBER = "number"
    STRING = "string"
    RANGE = "range"
    BOOLEAN = "boolean"
    ARRAY = "array"
|
|
||||||
|
|
||||||
|
|
||||||
class KPISettingModel(db.Model):
    """ORM model describing one configurable KPI setting.

    `name` is unique and used by the controllers for duplicate checks;
    rows are exposed to the API via to_dict().
    """

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(unique=True)
    description: Mapped[str]
    mandatory: Mapped[bool]
    # Stored as a native database enum over KPISettingType values.
    type: Mapped[KPISettingType] = mapped_column(
        SQLAlchemyEnum(KPISettingType, native_enum=True)
    )
    translation: Mapped[str]
    example: Mapped[str]

    def to_dict(self):
        """Return a JSON-serializable view (enum flattened to its value)."""
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "mandatory": self.mandatory,
            "type": self.type.value,
            "translation": self.translation,
            "example": self.example,
        }

    def __init__(self, name, description, mandatory, type, translation, example):
        # Explicit constructor mirroring the column set; `type` is expected
        # to already be a KPISettingType member (the controller converts and
        # validates it before constructing this model).
        self.name = name
        self.description = description
        self.mandatory = mandatory
        self.type = type
        self.translation = translation
        self.example = example
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
from model.database import db
|
|
||||||
from sqlalchemy.orm import Mapped, mapped_column
|
|
||||||
from sqlalchemy import LargeBinary
|
|
||||||
|
|
||||||
|
|
||||||
class PitchBookModel(db.Model):
    """ORM model storing an uploaded pitch-book PDF as a database blob.

    to_dict() deliberately omits the raw bytes; downloads go through the
    controller's /download endpoint instead.
    """

    id: Mapped[int] = mapped_column(primary_key=True)
    filename: Mapped[str] = mapped_column()
    # Raw PDF bytes kept directly in the database.
    file: Mapped[bytes] = mapped_column(LargeBinary)
    # Extracted KPI payload; None until a PUT sets it.
    kpi: Mapped[str | None]

    def to_dict(self):
        """Return a JSON-serializable view without the binary content."""
        return {"id": self.id, "filename": self.filename, "kpi": self.kpi}

    def __init__(self, filename, file):
        self.filename = filename
        self.file = file
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
from model.database import db
|
|
||||||
from sqlalchemy.orm import Mapped, mapped_column
|
|
||||||
from sqlalchemy import LargeBinary
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class SpacyModel(db.Model):
    """ORM model storing a file uploaded for spaCy processing.

    Mirrors PitchBookModel (filename + blob) but adds a creation
    timestamp and has no `kpi` column.
    """

    id: Mapped[int] = mapped_column(primary_key=True)
    filename: Mapped[str] = mapped_column()
    # Raw file bytes kept directly in the database.
    file: Mapped[bytes] = mapped_column(LargeBinary)
    # NOTE(review): datetime.utcnow is naive and deprecated in recent Python;
    # datetime.now(timezone.utc) would be preferred, but it changes the
    # isoformat() output ("+00:00" suffix) — confirm with API consumers first.
    created_at: Mapped[datetime] = mapped_column(default=datetime.utcnow)

    def to_dict(self):
        """Return a JSON-serializable view (timestamp as ISO 8601)."""
        return {
            "id": self.id,
            "filename": self.filename,
            "created_at": self.created_at.isoformat(),
        }

    def __init__(self, filename, file):
        self.filename = filename
        self.file = file
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
black==25.1.0
|
|
||||||
blinker==1.9.0
|
|
||||||
cfgv==3.4.0
|
|
||||||
click==8.2.1
|
|
||||||
distlib==0.3.9
|
|
||||||
filelock==3.18.0
|
|
||||||
flake8==7.2.0
|
|
||||||
Flask==3.1.1
|
|
||||||
Flask-SQLAlchemy==3.1.1
|
|
||||||
greenlet==3.2.2
|
|
||||||
identify==2.6.12
|
|
||||||
itsdangerous==2.2.0
|
|
||||||
Jinja2==3.1.6
|
|
||||||
MarkupSafe==3.0.2
|
|
||||||
mccabe==0.7.0
|
|
||||||
mypy_extensions==1.1.0
|
|
||||||
nodeenv==1.9.1
|
|
||||||
packaging==25.0
|
|
||||||
pathspec==0.12.1
|
|
||||||
platformdirs==4.3.8
|
|
||||||
pre_commit==4.2.0
|
|
||||||
psycopg2-binary==2.9.10
|
|
||||||
puremagic==1.29
|
|
||||||
pycodestyle==2.13.0
|
|
||||||
pyflakes==3.3.2
|
|
||||||
python-dotenv==1.1.0
|
|
||||||
PyYAML==6.0.2
|
|
||||||
SQLAlchemy==2.0.41
|
|
||||||
typing_extensions==4.13.2
|
|
||||||
virtualenv==20.31.2
|
|
||||||
Werkzeug==3.1.3
|
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
EXPOSE 5050
|
||||||
|
|
||||||
|
CMD ["python", "app.py"]
|
||||||
|
|
@ -7,13 +7,11 @@ RUN apt-get update && apt-get install -y \
|
||||||
build-essential \
|
build-essential \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY requirements.txt /app
|
COPY .. /app
|
||||||
|
|
||||||
RUN pip install --upgrade pip
|
RUN pip install --upgrade pip
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
RUN python -m spacy download en_core_web_sm
|
RUN python -m spacy download en_core_web_sm
|
||||||
|
|
||||||
COPY .. /app
|
|
||||||
|
|
||||||
CMD ["python3.12", "app.py"]
|
CMD ["python3.12", "app.py"]
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -1,44 +0,0 @@
|
||||||
services:
|
|
||||||
frontend:
|
|
||||||
build:
|
|
||||||
context: frontend
|
|
||||||
ports:
|
|
||||||
- 8080:80
|
|
||||||
db:
|
|
||||||
image: postgres:17-alpine
|
|
||||||
env_file:
|
|
||||||
- .env
|
|
||||||
# ports:
|
|
||||||
# - "5432:5432"
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "pg_isready -U admin"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 5
|
|
||||||
|
|
||||||
coordinator:
|
|
||||||
build:
|
|
||||||
context: backend/coordinator
|
|
||||||
dockerfile: ../../Dockerfile
|
|
||||||
env_file:
|
|
||||||
- .env
|
|
||||||
depends_on:
|
|
||||||
- db
|
|
||||||
healthcheck:
|
|
||||||
test: wget --spider --no-verbose http://127.0.0.1:5000/health || exit 1
|
|
||||||
interval: 10s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 5
|
|
||||||
ports:
|
|
||||||
- 5000:5000
|
|
||||||
|
|
||||||
spacy:
|
|
||||||
build:
|
|
||||||
context: backend/spacy-service
|
|
||||||
|
|
||||||
exxeta:
|
|
||||||
build:
|
|
||||||
context: backend/exxetaGPT
|
|
||||||
dockerfile: ../../Dockerfile
|
|
||||||
env_file:
|
|
||||||
- .env
|
|
||||||
Binary file not shown.
|
|
@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
|
||||||
fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb")
|
fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb')
|
||||||
rsrcmgr = PDFResourceManager()
|
rsrcmgr = PDFResourceManager()
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||||
|
|
@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
pages = PDFPage.get_pages(fp)
|
pages = PDFPage.get_pages(fp)
|
||||||
|
|
||||||
for page in pages:
|
for page in pages:
|
||||||
print("Processing next page...")
|
print('Processing next page...')
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
layout = device.get_result()
|
layout = device.get_result()
|
||||||
for lobj in layout:
|
for lobj in layout:
|
||||||
if isinstance(lobj, LTTextBox):
|
if isinstance(lobj, LTTextBox):
|
||||||
x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
|
x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
|
||||||
print("At %r is text: %s" % ((x, y), text))
|
print('At %r is text: %s' % ((x, y), text))
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
#########################################################
|
#########################################################
|
||||||
# Run: in Terminal -> streamlit run PyMuPdf_st.py
|
#Run: in Terminal -> streamlit run PyMuPdf_st.py
|
||||||
#########################################################
|
#########################################################
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
@ -28,14 +28,18 @@ if uploaded_file and suchwort:
|
||||||
rects = page.search_for(suchwort)
|
rects = page.search_for(suchwort)
|
||||||
|
|
||||||
for rect in rects:
|
for rect in rects:
|
||||||
fundstellen.append({"seite": page_num, "rect": rect})
|
fundstellen.append({
|
||||||
|
"seite": page_num,
|
||||||
|
"rect": rect
|
||||||
|
})
|
||||||
|
|
||||||
if fundstellen:
|
if fundstellen:
|
||||||
st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")
|
st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")
|
||||||
|
|
||||||
# Auswahl der Fundstelle
|
# Auswahl der Fundstelle
|
||||||
auswahl = st.selectbox(
|
auswahl = st.selectbox(
|
||||||
"Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen]
|
"Fundstelle auswählen:",
|
||||||
|
[f"Seite {f['seite'] + 1}" for f in fundstellen]
|
||||||
)
|
)
|
||||||
|
|
||||||
index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)
|
index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)
|
||||||
|
|
|
||||||
|
|
@ -38,9 +38,7 @@ for eintrag in kennzahlen:
|
||||||
highlight = page.add_highlight_annot(rect)
|
highlight = page.add_highlight_annot(rect)
|
||||||
highlight.update()
|
highlight.update()
|
||||||
else:
|
else:
|
||||||
st.warning(
|
st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
|
||||||
f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f" Fehler bei Eintrag {eintrag}: {e}")
|
st.error(f" Fehler bei Eintrag {eintrag}: {e}")
|
||||||
|
|
||||||
|
|
@ -70,13 +68,13 @@ aktuelle_seite = int(query_params.get("seite", 1))
|
||||||
# PDF anzeigen mit Scroll zu aktueller Seite
|
# PDF anzeigen mit Scroll zu aktueller Seite
|
||||||
st.subheader(f"Vorschau")
|
st.subheader(f"Vorschau")
|
||||||
with open(highlighted_path, "rb") as f:
|
with open(highlighted_path, "rb") as f:
|
||||||
base64_pdf = base64.b64encode(f.read()).decode("utf-8")
|
base64_pdf = base64.b64encode(f.read()).decode('utf-8')
|
||||||
|
|
||||||
# Seite direkt ansteuern
|
# Seite direkt ansteuern
|
||||||
pdf_display = f"""
|
pdf_display = f'''
|
||||||
<iframe
|
<iframe
|
||||||
src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
|
src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
|
||||||
width="100%" height="800px" type="application/pdf">
|
width="100%" height="800px" type="application/pdf">
|
||||||
</iframe>
|
</iframe>
|
||||||
"""
|
'''
|
||||||
st.markdown(pdf_display, unsafe_allow_html=True)
|
st.markdown(pdf_display, unsafe_allow_html=True)
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,9 @@ class Server:
|
||||||
server_params = StdioServerParameters(
|
server_params = StdioServerParameters(
|
||||||
command=command,
|
command=command,
|
||||||
args=self.config["args"],
|
args=self.config["args"],
|
||||||
env=(
|
env={**os.environ, **self.config["env"]}
|
||||||
{**os.environ, **self.config["env"]} if self.config.get("env") else None
|
if self.config.get("env")
|
||||||
),
|
else None,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
stdio_transport = await self.exit_stack.enter_async_context(
|
stdio_transport = await self.exit_stack.enter_async_context(
|
||||||
|
|
@ -244,10 +244,15 @@ class LLMClient:
|
||||||
formatted_messages = []
|
formatted_messages = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
# print(msg)
|
# print(msg)
|
||||||
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
|
formatted_messages.append({
|
||||||
|
"role": msg["role"],
|
||||||
|
"content": msg["content"]
|
||||||
|
})
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
|
api_key=self.api_key,
|
||||||
|
api_version="2023-07-01-preview",
|
||||||
|
base_url=url
|
||||||
)
|
)
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=formatted_messages,
|
messages=formatted_messages,
|
||||||
|
|
@ -407,16 +412,12 @@ class ChatSession:
|
||||||
"4. Use appropriate context from the user's question\n"
|
"4. Use appropriate context from the user's question\n"
|
||||||
"5. Avoid simply repeating the raw data\n\n"
|
"5. Avoid simply repeating the raw data\n\n"
|
||||||
"Please use only the tools that are explicitly defined above."
|
"Please use only the tools that are explicitly defined above."
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
messages = [{"role": "system", "content": system_message}]
|
messages = [{"role": "system", "content": system_message}]
|
||||||
messages.append(
|
messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting."
|
||||||
{
|
"For each value there is only one correct answer, try to find it with the tools provided."})
|
||||||
"role": "assistant",
|
|
||||||
"content": "You have to extract data from pdf files and have different tools for extracting."
|
|
||||||
"For each value there is only one correct answer, try to find it with the tools provided.",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
|
|
@ -454,6 +455,7 @@ class ChatSession:
|
||||||
# messages.append({"role": "assistant", "content": llm_response})
|
# messages.append({"role": "assistant", "content": llm_response})
|
||||||
# logging.info("\nFinal response: %s", llm_response)
|
# logging.info("\nFinal response: %s", llm_response)
|
||||||
|
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logging.info("\nExiting...")
|
logging.info("\nExiting...")
|
||||||
break
|
break
|
||||||
|
|
@ -474,6 +476,5 @@ async def main() -> None:
|
||||||
chat_session = ChatSession(servers, llm_client)
|
chat_session = ChatSession(servers, llm_client)
|
||||||
await chat_session.start()
|
await chat_session.start()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|
|
||||||
|
|
@ -8,86 +8,54 @@ mcp = FastMCP("Demo")
|
||||||
risikoProfile = ["Core/Core+, Core", "Value Add"]
|
risikoProfile = ["Core/Core+, Core", "Value Add"]
|
||||||
risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]
|
risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]
|
||||||
|
|
||||||
|
|
||||||
# Add an addition tool
|
# Add an addition tool
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def add(a: int, b: int) -> int:
|
def add(a: int, b: int) -> int:
|
||||||
"""Add two numbers"""
|
"""Add two numbers"""
|
||||||
return a + b
|
return a + b
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def getFromSpaCy() -> list:
|
def getFromSpaCy() -> list:
|
||||||
"""Get data from SpaCy"""
|
"""Get data from SpaCy"""
|
||||||
return [
|
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"},
|
||||||
{
|
{"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}]
|
||||||
"page": random.randint(1, 35),
|
|
||||||
"value": random.choice(risikoProfileSpacy),
|
|
||||||
"key": "Risiko",
|
|
||||||
},
|
|
||||||
{"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def getFromChatGPT() -> list:
|
def getFromChatGPT() -> list:
|
||||||
"""Get data from ChatGPT"""
|
"""Get data from ChatGPT"""
|
||||||
return [
|
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"},
|
||||||
{
|
{"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}]
|
||||||
"page": random.randint(1, 35),
|
|
||||||
"value": random.choice(risikoProfile),
|
|
||||||
"key": "Risiko",
|
|
||||||
},
|
|
||||||
{"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def checkSpacyResult() -> dict:
|
def checkSpacyResult() -> dict:
|
||||||
"""This tool checks the result of SpaCy, ensuring it meets certain criteria."""
|
"""This tool checks the result of SpaCy, ensuring it meets certain criteria."""
|
||||||
return {
|
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}
|
||||||
"page": random.randint(1, 35),
|
|
||||||
"value": random.choice(risikoProfile),
|
|
||||||
"key": "Risiko",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def getFromChatGPTSingle(value: str) -> dict:
|
def getFromChatGPTSingle(value: str) -> dict:
|
||||||
"""This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
|
"""This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
|
||||||
return {
|
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value}
|
||||||
"page": random.randint(1, 35),
|
|
||||||
"value": random.choice(risikoProfile),
|
|
||||||
"key": value,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
context = ""
|
context = ""
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def getContext() -> str:
|
def getContext() -> str:
|
||||||
"""This tool gets context information."""
|
"""This tool gets context information."""
|
||||||
return context
|
return context
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def setContext(value: str) -> None:
|
def setContext(value: str) -> None:
|
||||||
"""This tool sets context information."""
|
"""This tool sets context information."""
|
||||||
global context
|
global context
|
||||||
context = value
|
context = value
|
||||||
|
|
||||||
|
|
||||||
# Add a dynamic greeting resource
|
# Add a dynamic greeting resource
|
||||||
@mcp.resource("greeting://{name}")
|
@mcp.resource("greeting://{name}")
|
||||||
def get_greeting(name: str) -> str:
|
def get_greeting(name: str) -> str:
|
||||||
"""Get a personalized greeting"""
|
"""Get a personalized greeting"""
|
||||||
return f"Hello, {name}!"
|
return f"Hello, {name}!"
|
||||||
|
|
||||||
|
|
||||||
""" Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """
|
""" Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def validate_entity(entity: str, label: str) -> dict:
|
def validate_entity(entity: str, label: str) -> dict:
|
||||||
"""Returns if the entity is valid based on hardcoded rules."""
|
"""Returns if the entity is valid based on hardcoded rules."""
|
||||||
|
|
@ -98,18 +66,11 @@ def validate_entity(entity: str, label: str) -> dict:
|
||||||
return {"status": "valid", "entity": entity}
|
return {"status": "valid", "entity": entity}
|
||||||
return {"status": "invalid", "entity": entity}
|
return {"status": "invalid", "entity": entity}
|
||||||
|
|
||||||
|
|
||||||
""" Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """
|
""" Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def merge_spacy_exxeta(
|
def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]:
|
||||||
spacy_result: list[dict], exxeta_result: list[dict]
|
|
||||||
) -> list[dict]:
|
|
||||||
"""Merge two results, mark as validated if label/entity/page match."""
|
"""Merge two results, mark as validated if label/entity/page match."""
|
||||||
|
def norm(e): return e["entity"].lower().replace(" ", "")
|
||||||
def norm(e):
|
|
||||||
return e["entity"].lower().replace(" ", "")
|
|
||||||
|
|
||||||
merged = []
|
merged = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -117,16 +78,7 @@ def merge_spacy_exxeta(
|
||||||
for s in spacy_result:
|
for s in spacy_result:
|
||||||
s_norm = norm(s)
|
s_norm = norm(s)
|
||||||
s_page = s["page"]
|
s_page = s["page"]
|
||||||
match = next(
|
match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None)
|
||||||
(
|
|
||||||
e
|
|
||||||
for e in exxeta_result
|
|
||||||
if e["label"] == s["label"]
|
|
||||||
and norm(e) == s_norm
|
|
||||||
and e["page"] == s_page
|
|
||||||
),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
if match:
|
if match:
|
||||||
merged.append({**s, "status": "validated"})
|
merged.append({**s, "status": "validated"})
|
||||||
seen.add((match["entity"], match["page"]))
|
seen.add((match["entity"], match["page"]))
|
||||||
|
|
|
||||||
|
|
@ -12,12 +12,10 @@ app = Flask(__name__)
|
||||||
UPLOAD_FOLDER = Path("pitchbooks")
|
UPLOAD_FOLDER = Path("pitchbooks")
|
||||||
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
def home():
|
def home():
|
||||||
return "Backend is running!"
|
return "Backend is running!"
|
||||||
|
|
||||||
|
|
||||||
@app.route("/upload", methods=["POST"])
|
@app.route("/upload", methods=["POST"])
|
||||||
def upload():
|
def upload():
|
||||||
file = request.files.get("file")
|
file = request.files.get("file")
|
||||||
|
|
@ -46,6 +44,5 @@ def upload():
|
||||||
|
|
||||||
return "status: complete\n"
|
return "status: complete\n"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(debug=True)
|
app.run(debug=True)
|
||||||
|
|
@ -7,7 +7,6 @@ MODEL = "gpt-35-turbo"
|
||||||
OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
|
OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
|
||||||
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def extract_with_exxeta(pages_json):
|
def extract_with_exxeta(pages_json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -20,15 +19,15 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
||||||
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
'- "Core, Core+" → entity: "Core, Core+"\n'
|
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
||||||
'- "Core/Core+" → entity: "Core/Core+"\n'
|
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
||||||
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
|
|
@ -36,19 +35,16 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
||||||
"role": "system",
|
{"role": "user", "content": prompt}
|
||||||
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
|
||||||
},
|
|
||||||
{"role": "user", "content": prompt},
|
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,13 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else ""
|
return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""
|
||||||
|
|
||||||
|
|
||||||
def load_json(path: Path):
|
def load_json(path: Path):
|
||||||
with path.open("r", encoding="utf-8") as f:
|
with path.open("r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(filter_label=None):
|
def merge_and_validate_entities(filter_label=None):
|
||||||
base = Path(__file__).resolve().parent.parent
|
base = Path(__file__).resolve().parent.parent
|
||||||
spacy_path = base / "spacy_service/output/spacy-results.json"
|
spacy_path = base / "spacy_service/output/spacy-results.json"
|
||||||
|
|
@ -28,14 +25,11 @@ def merge_and_validate_entities(filter_label=None):
|
||||||
s_page = s["page"]
|
s_page = s["page"]
|
||||||
|
|
||||||
match = next(
|
match = next(
|
||||||
(
|
(e for e in exxeta_data
|
||||||
e
|
if e["label"] == s["label"] and
|
||||||
for e in exxeta_data
|
normalize_entity(e["entity"]) == s_norm and
|
||||||
if e["label"] == s["label"]
|
e["page"] == s_page),
|
||||||
and normalize_entity(e["entity"]) == s_norm
|
None
|
||||||
and e["page"] == s_page
|
|
||||||
),
|
|
||||||
None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ BASE_DIR = Path(__file__).resolve().parent
|
||||||
OUTPUT_FOLDER = BASE_DIR / "output"
|
OUTPUT_FOLDER = BASE_DIR / "output"
|
||||||
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def run_ocr_and_extract(pdf_path: str):
|
def run_ocr_and_extract(pdf_path: str):
|
||||||
pdf_path = Path(pdf_path)
|
pdf_path = Path(pdf_path)
|
||||||
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
||||||
|
|
@ -17,12 +16,10 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type",
|
"--output-type", "pdfa",
|
||||||
"pdfa",
|
"--language", "deu+eng",
|
||||||
"--language",
|
|
||||||
"deu+eng",
|
|
||||||
str(pdf_path),
|
str(pdf_path),
|
||||||
str(output_pdf),
|
str(output_pdf)
|
||||||
]
|
]
|
||||||
|
|
||||||
result = subprocess.run(cmd, capture_output=True)
|
result = subprocess.run(cmd, capture_output=True)
|
||||||
|
|
@ -31,12 +28,12 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
||||||
|
|
||||||
with pdfplumber.open(output_pdf) as pdf:
|
with pdfplumber.open(output_pdf) as pdf:
|
||||||
pages = [
|
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
||||||
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
|
||||||
for i, page in enumerate(pdf.pages)
|
|
||||||
]
|
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)}
|
return {
|
||||||
|
"ocr_pdf": str(output_pdf),
|
||||||
|
"json_path": str(json_path)
|
||||||
|
}
|
||||||
|
|
@ -9,13 +9,7 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
||||||
nlp = spacy.load(model_path)
|
nlp = spacy.load(model_path)
|
||||||
input_pdf_path = (
|
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
|
||||||
Path(__file__).resolve().parent
|
|
||||||
/ ".."
|
|
||||||
/ "ocr_pdf_service"
|
|
||||||
/ "output"
|
|
||||||
/ "pitchbook-OCR.pdf"
|
|
||||||
)
|
|
||||||
input_pdf = Path(input_pdf_path)
|
input_pdf = Path(input_pdf_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -31,7 +25,11 @@ def extract_with_spacy(pages_json):
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
results.append({"label": ent.label_, "entity": ent.text, "page": page_num})
|
results.append({
|
||||||
|
"label": ent.label_,
|
||||||
|
"entity": ent.text,
|
||||||
|
"page": page_num
|
||||||
|
})
|
||||||
|
|
||||||
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
api_key=API_KEY,
|
||||||
)
|
api_version="2023-07-01-preview",
|
||||||
|
base_url=BASE_URL
|
||||||
|
)
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,7 +40,10 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": "Always respond with a valid JSON object"},
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "Always respond with a valid JSON object"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -68,12 +71,11 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:"""
|
Here ist the text:""" + pdf_text
|
||||||
+ pdf_text,
|
}
|
||||||
},
|
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"},
|
response_format={"type": "json_object"}
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -81,7 +83,8 @@ response = client.chat.completions.create(
|
||||||
# max_tokens=800,
|
# max_tokens=800,
|
||||||
# stop="",
|
# stop="",
|
||||||
# stream=False
|
# stream=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
api_key=API_KEY,
|
||||||
)
|
api_version="2023-07-01-preview",
|
||||||
|
base_url=BASE_URL
|
||||||
|
)
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,7 +40,10 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": "Always respond with a valid JSON object"},
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "Always respond with a valid JSON object"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -68,12 +71,11 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:"""
|
Here ist the text:""" + pdf_text
|
||||||
+ pdf_text,
|
}
|
||||||
},
|
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"},
|
response_format={"type": "json_object"}
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -81,7 +83,8 @@ response = client.chat.completions.create(
|
||||||
# max_tokens=800,
|
# max_tokens=800,
|
||||||
# stop="",
|
# stop="",
|
||||||
# stream=False
|
# stream=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}
|
|
||||||
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
import streamlit as st
|
|
||||||
import json
|
|
||||||
|
|
||||||
st.title("Neue Kennzahl annotieren")
|
|
||||||
|
|
||||||
text = st.text_area("Text", "Das geplante Projektvolumen beträgt 120 Mio. €.")
|
|
||||||
start = st.number_input("Start-Position", min_value=0, max_value=len(text), value=28)
|
|
||||||
end = st.number_input("End-Position", min_value=0, max_value=len(text), value=44)
|
|
||||||
label = st.text_input("Label (z. B. KENNZAHL)", "KENNZAHL")
|
|
||||||
|
|
||||||
if st.button("Speichern"):
|
|
||||||
example = {"text": text, "entities": [[start, end, label]]}
|
|
||||||
|
|
||||||
with open("annotated_data.json", "a", encoding="utf-8") as f:
|
|
||||||
f.write(json.dumps(example, ensure_ascii=False) + "\n")
|
|
||||||
|
|
||||||
st.success("✅ Annotation gespeichert!")
|
|
||||||
|
|
@ -15,9 +15,11 @@ for page_number in range(len(doc)):
|
||||||
text = page.get_text()
|
text = page.get_text()
|
||||||
spacy_doc = nlp(text)
|
spacy_doc = nlp(text)
|
||||||
for ent in spacy_doc.ents:
|
for ent in spacy_doc.ents:
|
||||||
results.append(
|
results.append({
|
||||||
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1}
|
"label": ent.label_,
|
||||||
)
|
"entity": ent.text.strip(),
|
||||||
|
"page": page_number + 1
|
||||||
|
})
|
||||||
|
|
||||||
with open("entities_output.json", "w", encoding="utf-8") as f:
|
with open("entities_output.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
|
||||||
|
|
@ -226,5 +226,9 @@ TRAINING_DATA = [
|
||||||
(
|
(
|
||||||
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
||||||
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
||||||
),
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,14 +22,10 @@ for text, annot in tqdm(TRAINING_DATA):
|
||||||
for start, end, label in annot["entities"]:
|
for start, end, label in annot["entities"]:
|
||||||
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
||||||
if span is None:
|
if span is None:
|
||||||
print(
|
print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
||||||
f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
ents.append(span)
|
ents.append(span)
|
||||||
print(
|
print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
||||||
f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
|
||||||
)
|
|
||||||
# label the text with the ents
|
# label the text with the ents
|
||||||
doc.ents = ents
|
doc.ents = ents
|
||||||
db.add(doc)
|
db.add(doc)
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,9 @@ class Server:
|
||||||
server_params = StdioServerParameters(
|
server_params = StdioServerParameters(
|
||||||
command=command,
|
command=command,
|
||||||
args=self.config["args"],
|
args=self.config["args"],
|
||||||
env=(
|
env={**os.environ, **self.config["env"]}
|
||||||
{**os.environ, **self.config["env"]} if self.config.get("env") else None
|
if self.config.get("env")
|
||||||
),
|
else None,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
stdio_transport = await self.exit_stack.enter_async_context(
|
stdio_transport = await self.exit_stack.enter_async_context(
|
||||||
|
|
@ -244,10 +244,15 @@ class LLMClient:
|
||||||
formatted_messages = []
|
formatted_messages = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
print(msg)
|
print(msg)
|
||||||
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
|
formatted_messages.append({
|
||||||
|
"role": msg["role"],
|
||||||
|
"content": msg["content"]
|
||||||
|
})
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
|
api_key=self.api_key,
|
||||||
|
api_version="2023-07-01-preview",
|
||||||
|
base_url=url
|
||||||
)
|
)
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=formatted_messages,
|
messages=formatted_messages,
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
# server.py
|
# server.py
|
||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
# Create an MCP server
|
# Create an MCP server
|
||||||
mcp = FastMCP("Demo")
|
mcp = FastMCP("Demo")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,59 +9,51 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities():
|
def load_spacy_entities():
|
||||||
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def get_page_text(pages, page_number):
|
def get_page_text(pages, page_number):
|
||||||
for page in pages:
|
for page in pages:
|
||||||
if page.get("page") == page_number:
|
if page.get("page") == page_number:
|
||||||
return page.get("text", "")
|
return page.get("text", "")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity):
|
def normalize_entity(entity):
|
||||||
return " ".join(entity.replace("\n", " ").split())
|
return ' '.join(entity.replace('\n', ' ').split())
|
||||||
|
|
||||||
|
|
||||||
def validate_entity_with_exxeta(entity, page_num, text):
|
def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
prompt = (
|
prompt = (
|
||||||
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
||||||
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
||||||
f"Ziel-Formulierung:\n"
|
f"Ziel-Formulierung:\n"
|
||||||
f'"{entity}"\n\n'
|
f"\"{entity}\"\n\n"
|
||||||
f"Validierungsregeln:\n"
|
f"Validierungsregeln:\n"
|
||||||
f"- Groß- und Kleinschreibung ignorieren.\n"
|
f"- Groß- und Kleinschreibung ignorieren.\n"
|
||||||
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
||||||
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
||||||
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
||||||
f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n'
|
f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
|
||||||
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
||||||
f"OCR-Text auf Seite {page_num}:\n{text}"
|
f"OCR-Text auf Seite {page_num}:\n{text}"
|
||||||
)
|
)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
|
||||||
"role": "system",
|
{"role": "user", "content": prompt}
|
||||||
"content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.",
|
|
||||||
},
|
|
||||||
{"role": "user", "content": prompt},
|
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
@ -75,7 +67,6 @@ def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities()
|
spacy_entities = load_spacy_entities()
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -90,20 +81,17 @@ def run():
|
||||||
page_text = get_page_text(pitchbook_pages, page)
|
page_text = get_page_text(pitchbook_pages, page)
|
||||||
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
||||||
|
|
||||||
validated_results.append(
|
validated_results.append({
|
||||||
{
|
|
||||||
"label": entity_data.get("label"),
|
"label": entity_data.get("label"),
|
||||||
"entity": raw_entity,
|
"entity": raw_entity,
|
||||||
"page": page,
|
"page": page,
|
||||||
"validated": is_valid,
|
"validated": is_valid
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
||||||
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -10,23 +10,19 @@ KPI_SERVICE_MAP = {
|
||||||
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_exxeta_entities(path):
|
def load_exxeta_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def normalize(text):
|
def normalize(text):
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
return text.strip().lower().replace(" ", "").replace("/", "/")
|
return text.strip().lower().replace(" ", "").replace("/", "/")
|
||||||
|
|
||||||
|
|
||||||
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -54,47 +50,39 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if se_entity == ee_entity:
|
if se_entity == ee_entity:
|
||||||
results.append(
|
results.append({
|
||||||
{
|
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "validated",
|
"validation_status": "validated"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
matched = True
|
matched = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not matched:
|
if not matched:
|
||||||
results.append(
|
results.append({
|
||||||
{
|
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "spacy-only",
|
"validation_status": "spacy-only"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
||||||
results.append(
|
results.append({
|
||||||
{
|
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": ee["entity"],
|
"entity": ee["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "exxeta-only",
|
"validation_status": "exxeta-only"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def save_results(results, filename):
|
def save_results(results, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
||||||
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
||||||
|
|
@ -108,6 +96,5 @@ def run():
|
||||||
save_results(all_results, "mcp_validated_result.json")
|
save_results(all_results, "mcp_validated_result.json")
|
||||||
print("✅ Validation complete! Output: mcp_validated_result.json")
|
print("✅ Validation complete! Output: mcp_validated_result.json")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ import json
|
||||||
|
|
||||||
MODEL = "gpt-35-turbo"
|
MODEL = "gpt-35-turbo"
|
||||||
|
|
||||||
|
|
||||||
def extract_risikoprofil_from_exxeta(pages_json):
|
def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -17,35 +16,33 @@ def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
||||||
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
'- "Core, Core+" → entity: "Core, Core+"\n'
|
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
||||||
'- "Core/Core+" → entity: "Core/Core+"\n'
|
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
||||||
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
||||||
"role": "system",
|
{"role": "user", "content": prompt}
|
||||||
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
|
||||||
},
|
|
||||||
{"role": "user", "content": prompt},
|
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,10 @@
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
if not entity_str:
|
if not entity_str:
|
||||||
return ""
|
return ""
|
||||||
normalized = entity_str.replace("\n", " ")
|
normalized = entity_str.replace('\n', ' ')
|
||||||
normalized = "".join(normalized.lower().split())
|
normalized = ''.join(normalized.lower().split())
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(spacy_data, exxeta_data):
|
def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
merged = []
|
merged = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -22,47 +21,39 @@ def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
e_page = e["page"]
|
e_page = e["page"]
|
||||||
|
|
||||||
# Match if normalized entity and page match
|
# Match if normalized entity and page match
|
||||||
if (
|
if (s["label"] == e["label"] and
|
||||||
s["label"] == e["label"]
|
s_entity_norm == e_entity_norm and
|
||||||
and s_entity_norm == e_entity_norm
|
s_page == e_page):
|
||||||
and s_page == e_page
|
|
||||||
):
|
|
||||||
|
|
||||||
merged.append(
|
merged.append({
|
||||||
{
|
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "validated",
|
"status": "validated"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
seen.add((e["entity"], e_page))
|
seen.add((e["entity"], e_page))
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no match found, add as single-source
|
# If no match found, add as single-source
|
||||||
if not found:
|
if not found:
|
||||||
merged.append(
|
merged.append({
|
||||||
{
|
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "spacy",
|
"source": "spacy"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
# Add remaining Exxeta entities not already processed
|
# Add remaining Exxeta entities not already processed
|
||||||
for e in exxeta_data:
|
for e in exxeta_data:
|
||||||
if (e["entity"], e["page"]) not in seen:
|
if (e["entity"], e["page"]) not in seen:
|
||||||
merged.append(
|
merged.append({
|
||||||
{
|
|
||||||
"label": e["label"],
|
"label": e["label"],
|
||||||
"entity": e["entity"],
|
"entity": e["entity"],
|
||||||
"page": e["page"],
|
"page": e["page"],
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "exxeta",
|
"source": "exxeta"
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
@ -7,22 +7,18 @@ from merge_logic import merge_and_validate_entities
|
||||||
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
path = Path(PITCHBOOK_PATH)
|
path = Path(PITCHBOOK_PATH)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def save_json(data, filename):
|
def save_json(data, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def sort_by_page_number(entities):
|
def sort_by_page_number(entities):
|
||||||
return sorted(entities, key=lambda x: x.get("page", 0))
|
return sorted(entities, key=lambda x: x.get("page", 0))
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_PATH)
|
spacy_entities = load_spacy_entities(SPACY_PATH)
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -37,6 +33,5 @@ def run():
|
||||||
print("- merged_result.json")
|
print("- merged_result.json")
|
||||||
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -11,20 +11,15 @@ log_folder = Path("logs")
|
||||||
for folder in [output_folder, log_folder]:
|
for folder in [output_folder, log_folder]:
|
||||||
folder.mkdir(parents=True, exist_ok=True)
|
folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_json(pdf_path: Path):
|
def extract_text_to_json(pdf_path: Path):
|
||||||
json_path = output_folder / f"{pdf_path.stem}.json"
|
json_path = output_folder / f"{pdf_path.stem}.json"
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
pages = [
|
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
||||||
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
|
||||||
for i, page in enumerate(pdf.pages)
|
|
||||||
]
|
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
print(f"📄 Text JSON saved: {json_path.name}")
|
print(f"📄 Text JSON saved: {json_path.name}")
|
||||||
|
|
||||||
|
|
||||||
def ocr_pdf(input_file: Path):
|
def ocr_pdf(input_file: Path):
|
||||||
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
||||||
log_file = log_folder / f"{input_file.stem}.log"
|
log_file = log_folder / f"{input_file.stem}.log"
|
||||||
|
|
@ -33,14 +28,11 @@ def ocr_pdf(input_file: Path):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type",
|
"--output-type", "pdfa",
|
||||||
"pdfa",
|
"--language", "deu+eng",
|
||||||
"--language",
|
"--sidecar", str(sidecar_txt),
|
||||||
"deu+eng",
|
|
||||||
"--sidecar",
|
|
||||||
str(sidecar_txt),
|
|
||||||
str(input_file),
|
str(input_file),
|
||||||
str(output_file),
|
str(output_file)
|
||||||
]
|
]
|
||||||
|
|
||||||
with open(log_file, "w") as log:
|
with open(log_file, "w") as log:
|
||||||
|
|
@ -52,7 +44,6 @@ def ocr_pdf(input_file: Path):
|
||||||
else:
|
else:
|
||||||
print(f"❌ OCR failed. See log: {log_file}")
|
print(f"❌ OCR failed. See log: {log_file}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if not input_folder.exists():
|
if not input_folder.exists():
|
||||||
print("Input folder does not exist!")
|
print("Input folder does not exist!")
|
||||||
|
|
|
||||||
|
|
@ -34,14 +34,14 @@ for ent in doc_ner.ents:
|
||||||
break
|
break
|
||||||
|
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
ner_text_results.append(
|
ner_text_results.append({
|
||||||
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number}
|
"label": ent.label_,
|
||||||
)
|
"entity": ent.text.strip(),
|
||||||
|
"page": page_number
|
||||||
|
})
|
||||||
|
|
||||||
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
||||||
(output_dir / "ner_text.json").write_text(
|
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
|
||||||
json.dumps(ner_text_results, indent=2, ensure_ascii=False)
|
|
||||||
)
|
|
||||||
|
|
||||||
# 2. NER on table cells
|
# 2. NER on table cells
|
||||||
table_ner_results = []
|
table_ner_results = []
|
||||||
|
|
@ -62,18 +62,14 @@ for i, table in enumerate(doc._.tables, 1):
|
||||||
doc_cell = nlp(cell)
|
doc_cell = nlp(cell)
|
||||||
for ent in doc_cell.ents:
|
for ent in doc_cell.ents:
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
table_ner_results.append(
|
table_ner_results.append({
|
||||||
{
|
|
||||||
"label": ent.label_,
|
"label": ent.label_,
|
||||||
"entity": ent.text.strip(),
|
"entity": ent.text.strip(),
|
||||||
"page": page_number,
|
"page": page_number,
|
||||||
"table": i,
|
"table": i
|
||||||
}
|
})
|
||||||
)
|
|
||||||
|
|
||||||
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
||||||
(output_dir / "ner_tables.json").write_text(
|
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
|
||||||
json.dumps(table_ner_results, indent=2, ensure_ascii=False)
|
|
||||||
)
|
|
||||||
|
|
||||||
print("✅ Done! Extracted data saved to /output")
|
print("✅ Done! Extracted data saved to /output")
|
||||||
Loading…
Reference in New Issue