Merge pull request 'backend/flask-setup' (#38) from backend/flask-setup into main
Reviewed-on: #38
commit cc321fea4a
@ -0,0 +1,13 @@
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3
        files: ^project/backend/

  - repo: https://github.com/pycqa/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
        files: ^project/backend/
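These hooks only target files under `project/backend/`. As a quick sanity check they can be registered and run once over the whole tree (a minimal sketch, assuming `pre-commit` from requirements.txt is installed):

```bash
pre-commit install          # register the git hook defined by this config
pre-commit run --all-files  # run black and flake8 once without committing
```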
@ -0,0 +1,14 @@
services:
  backend:
    build: ./project/backend
    container_name: fundfuechse-backend
    ports:
      - "5000:5000"
    restart: always

  # frontend:
  #   build: ./project/frontend
  #   container_name: fundfuechse-frontend
  #   ports:
  #     - "3000:80"
  #   restart: always
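Together with the `/health` route added in `app.py` below, the compose service allows a quick smoke test (a sketch, assuming Docker Compose v2 and that it is run from the repository root):

```bash
docker compose up -d backend        # build and start only the backend service
curl http://localhost:5000/health   # should answer "OK"
```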
@ -0,0 +1,19 @@
# 1. Use a Python base image
FROM python:3.11-alpine

# 2. Set the working directory inside the container
WORKDIR /app

# 3. Copy requirements.txt and install packages
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 4. Copy the source code (e.g. app.py)
COPY . .

# 5. Start the Flask app
# production-style server with gunicorn
RUN pip install gunicorn

CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
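The image serves the app through gunicorn instead of Flask's development server. The same entry point can be tried outside Docker (sketch, assuming gunicorn is installed in the local environment):

```bash
pip install gunicorn
gunicorn --bind 0.0.0.0:5000 app:app   # identical to the Dockerfile CMD
```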
@ -0,0 +1,85 @@
## Setup

### Prerequisites

- Python 3.11+
- pip
- Docker (Desktop)
- Optional: `pre-commit`

### Install dependencies

```bash
pip install -r requirements.txt

# code quality (check locally)
black app.py
flake8 app.py
```

## Running the application

### Option 1: Locally

1. Install dependencies:

```bash
pip install -r requirements.txt
```

2. Start the Flask app:

```bash
python app.py
```

3. Open in the browser:

```
http://localhost:5000/
```

---

### Option 2: With Docker

1. Build the image:

```bash
docker build -t fundfuechse-backend .
```

2. Start the container:

```bash
docker run -p 5000:5000 fundfuechse-backend
```

The API is then available at:

```
http://localhost:5000/
```

---

### Option 3: With docker-compose

```bash
docker-compose up --build
```

The service is then reachable at:

```
http://localhost:5000/
```

---

### Test call via curl (upload a PDF)

```bash
curl.exe -X POST -F "file=@Pitchbook 1.pdf" http://localhost:5000/upload
```
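Based on the `/upload` handler in `app.py` below, a successful upload answers with HTTP 200 and a JSON confirmation, roughly (illustrative, not captured output):

```
{"message": "Datei Pitchbook 1.pdf erfolgreich gespeichert!"}
```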
@ -0,0 +1,66 @@
from flask import Flask, jsonify
from flask import request
import os

app = Flask(__name__)


@app.route("/health")
def health_check():
    return "OK"


# returns an example config of the metrics (for the UI)
@app.route("/config", methods=["GET"])
def get_config():
    config = [
        {"name": "Fondname", "format": "Text", "required": True},
        {"name": "IRR", "format": "Prozent", "required": False},
    ]
    return jsonify(config)


# returns example extraction results
@app.route("/extraction_results", methods=["GET"])
def get_extraction_results():
    results = [
        {"label": "Fondname", "entity": "ABC Fonds", "page": 1, "status": "validated"},
        {
            "label": "IRR",
            "entity": "6,0%",
            "page": 3,
            "status": "single-source",
            "source": "spaCy",
        },
    ]
    return jsonify(results)


# create the upload folder if it does not exist
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)


# accepts a PDF file via POST and stores it
@app.route("/upload", methods=["POST"])
def upload_pdf():
    if "file" not in request.files:
        return {"error": "Keine Datei hochgeladen."}, 400

    file = request.files["file"]

    if file.filename == "":
        return {"error": "Dateiname fehlt."}, 400

    if not file.filename.endswith(".pdf"):
        return {"error": "Nur PDF-Dateien erlaubt."}, 400

    file_path = os.path.join(UPLOAD_FOLDER, file.filename)
    file.save(file_path)

    return {"message": f"Datei {file.filename} erfolgreich gespeichert!"}, 200


# important for Docker: host='0.0.0.0'
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0")
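The routes above can also be exercised without a running server via Flask's test client. A minimal sketch (the module name `app` matches this file; the test file name and the dummy PDF bytes are illustrative):

```python
# test_app_sketch.py - hypothetical quick check, not part of this PR
import io

from app import app

client = app.test_client()

# health and config endpoints
assert client.get("/health").data == b"OK"
assert client.get("/config").get_json()[0]["name"] == "Fondname"

# upload a tiny fake "PDF"; the handler only checks the .pdf extension
resp = client.post(
    "/upload",
    data={"file": (io.BytesIO(b"%PDF-1.4 dummy"), "test.pdf")},
    content_type="multipart/form-data",
)
assert resp.status_code == 200
```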
@ -0,0 +1,4 @@
Flask
black
flake8
pre-commit
@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator

-fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb')
+fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb")
 rsrcmgr = PDFResourceManager()
 laparams = LAParams()
 device = PDFPageAggregator(rsrcmgr, laparams=laparams)

@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device)
 pages = PDFPage.get_pages(fp)

 for page in pages:
-    print('Processing next page...')
+    print("Processing next page...")
     interpreter.process_page(page)
     layout = device.get_result()
     for lobj in layout:
         if isinstance(lobj, LTTextBox):
             x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
-            print('At %r is text: %s' % ((x, y), text))
+            print("At %r is text: %s" % ((x, y), text))
@ -28,18 +28,14 @@ if uploaded_file and suchwort:
         rects = page.search_for(suchwort)

         for rect in rects:
-            fundstellen.append({
-                "seite": page_num,
-                "rect": rect
-            })
+            fundstellen.append({"seite": page_num, "rect": rect})

     if fundstellen:
         st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")

         # Auswahl der Fundstelle
         auswahl = st.selectbox(
-            "Fundstelle auswählen:",
-            [f"Seite {f['seite'] + 1}" for f in fundstellen]
+            "Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen]
         )

         index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)
@ -38,7 +38,9 @@ for eintrag in kennzahlen:
                 highlight = page.add_highlight_annot(rect)
                 highlight.update()
             else:
-                st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
+                st.warning(
+                    f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)"
+                )
     except Exception as e:
         st.error(f" Fehler bei Eintrag {eintrag}: {e}")
@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1))
 # PDF anzeigen mit Scroll zu aktueller Seite
 st.subheader(f"Vorschau")
 with open(highlighted_path, "rb") as f:
-    base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+    base64_pdf = base64.b64encode(f.read()).decode("utf-8")

 # Seite direkt ansteuern
-pdf_display = f'''
+pdf_display = f"""
 <iframe
     src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
     width="100%" height="800px" type="application/pdf">
 </iframe>
-'''
+"""
 st.markdown(pdf_display, unsafe_allow_html=True)
@ -87,9 +87,9 @@ class Server:
         server_params = StdioServerParameters(
             command=command,
             args=self.config["args"],
-            env={**os.environ, **self.config["env"]}
-            if self.config.get("env")
-            else None,
+            env=(
+                {**os.environ, **self.config["env"]} if self.config.get("env") else None
+            ),
         )
         try:
             stdio_transport = await self.exit_stack.enter_async_context(
@ -244,15 +244,10 @@ class LLMClient:
         formatted_messages = []
         for msg in messages:
             # print(msg)
-            formatted_messages.append({
-                "role": msg["role"],
-                "content": msg["content"]
-            })
+            formatted_messages.append({"role": msg["role"], "content": msg["content"]})

         client = AzureOpenAI(
-            api_key=self.api_key,
-            api_version="2023-07-01-preview",
-            base_url=url
+            api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
         )
         response = client.chat.completions.create(
             messages=formatted_messages,
@ -412,12 +407,16 @@ class ChatSession:
                 "4. Use appropriate context from the user's question\n"
                 "5. Avoid simply repeating the raw data\n\n"
                 "Please use only the tools that are explicitly defined above."
             )

             messages = [{"role": "system", "content": system_message}]
-            messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting."
-                "For each value there is only one correct answer, try to find it with the tools provided."})
+            messages.append(
+                {
+                    "role": "assistant",
+                    "content": "You have to extract data from pdf files and have different tools for extracting."
+                    "For each value there is only one correct answer, try to find it with the tools provided.",
+                }
+            )

             while True:
                 try:
@ -455,7 +454,6 @@ class ChatSession:
                 # messages.append({"role": "assistant", "content": llm_response})
                 # logging.info("\nFinal response: %s", llm_response)

-
             except KeyboardInterrupt:
                 logging.info("\nExiting...")
                 break
@ -476,5 +474,6 @@ async def main() -> None:
     chat_session = ChatSession(servers, llm_client)
     await chat_session.start()

+
 if __name__ == "__main__":
     asyncio.run(main())
@ -8,54 +8,86 @@ mcp = FastMCP("Demo")
 risikoProfile = ["Core/Core+, Core", "Value Add"]
 risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]


 # Add an addition tool
 @mcp.tool()
 def add(a: int, b: int) -> int:
     """Add two numbers"""
     return a + b


 @mcp.tool()
 def getFromSpaCy() -> list:
     """Get data from SpaCy"""
-    return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"},
-            {"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}]
+    return [
+        {
+            "page": random.randint(1, 35),
+            "value": random.choice(risikoProfileSpacy),
+            "key": "Risiko",
+        },
+        {"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"},
+    ]


 @mcp.tool()
 def getFromChatGPT() -> list:
     """Get data from ChatGPT"""
-    return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"},
-            {"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}]
+    return [
+        {
+            "page": random.randint(1, 35),
+            "value": random.choice(risikoProfile),
+            "key": "Risiko",
+        },
+        {"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"},
+    ]


 @mcp.tool()
 def checkSpacyResult() -> dict:
     """This tool checks the result of SpaCy, ensuring it meets certain criteria."""
-    return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}
+    return {
+        "page": random.randint(1, 35),
+        "value": random.choice(risikoProfile),
+        "key": "Risiko",
+    }


 @mcp.tool()
 def getFromChatGPTSingle(value: str) -> dict:
     """This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
-    return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value}
+    return {
+        "page": random.randint(1, 35),
+        "value": random.choice(risikoProfile),
+        "key": value,
+    }


 context = ""


 @mcp.tool()
 def getContext() -> str:
     """This tool gets context information."""
     return context


 @mcp.tool()
 def setContext(value: str) -> None:
     """This tool sets context information."""
     global context
     context = value


 # Add a dynamic greeting resource
 @mcp.resource("greeting://{name}")
 def get_greeting(name: str) -> str:
     """Get a personalized greeting"""
     return f"Hello, {name}!"


 """ Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """


 @mcp.tool()
 def validate_entity(entity: str, label: str) -> dict:
     """Returns if the entity is valid based on hardcoded rules."""
@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict:
         return {"status": "valid", "entity": entity}
     return {"status": "invalid", "entity": entity}


 """ Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """


 @mcp.tool()
-def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]:
+def merge_spacy_exxeta(
+    spacy_result: list[dict], exxeta_result: list[dict]
+) -> list[dict]:
     """Merge two results, mark as validated if label/entity/page match."""
-    def norm(e): return e["entity"].lower().replace(" ", "")
+
+    def norm(e):
+        return e["entity"].lower().replace(" ", "")

     merged = []
     seen = set()
@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
     for s in spacy_result:
         s_norm = norm(s)
         s_page = s["page"]
-        match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None)
+        match = next(
+            (
+                e
+                for e in exxeta_result
+                if e["label"] == s["label"]
+                and norm(e) == s_norm
+                and e["page"] == s_page
+            ),
+            None,
+        )
         if match:
             merged.append({**s, "status": "validated"})
             seen.add((match["entity"], match["page"]))
@ -12,10 +12,12 @@ app = Flask(__name__)
 UPLOAD_FOLDER = Path("pitchbooks")
 UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

+
 @app.route("/")
 def home():
     return "Backend is running!"

+
 @app.route("/upload", methods=["POST"])
 def upload():
     file = request.files.get("file")

@ -44,5 +46,6 @@ def upload():

     return "status: complete\n"

+
 if __name__ == "__main__":
     app.run(debug=True)
@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo"
 OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
 OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

+
 def extract_with_exxeta(pages_json):
     results = []
@ -19,15 +20,15 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
||||||
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
'- "Core, Core+" → entity: "Core, Core+"\n'
|
||||||
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
'- "Core/Core+" → entity: "Core/Core+"\n'
|
||||||
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
|
|
@ -35,16 +36,19 @@ def extract_with_exxeta(pages_json):
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,16 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""
|
return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else ""
|
||||||
|
|
||||||
|
|
||||||
def load_json(path: Path):
|
def load_json(path: Path):
|
||||||
with path.open("r", encoding="utf-8") as f:
|
with path.open("r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(filter_label=None):
|
def merge_and_validate_entities(filter_label=None):
|
||||||
base = Path(__file__).resolve().parent.parent
|
base = Path(__file__).resolve().parent.parent
|
||||||
spacy_path = base / "spacy_service/output/spacy-results.json"
|
spacy_path = base / "spacy_service/output/spacy-results.json"
|
||||||
|
|
@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None):
|
||||||
s_page = s["page"]
|
s_page = s["page"]
|
||||||
|
|
||||||
match = next(
|
match = next(
|
||||||
(e for e in exxeta_data
|
(
|
||||||
if e["label"] == s["label"] and
|
e
|
||||||
normalize_entity(e["entity"]) == s_norm and
|
for e in exxeta_data
|
||||||
e["page"] == s_page),
|
if e["label"] == s["label"]
|
||||||
None
|
and normalize_entity(e["entity"]) == s_norm
|
||||||
|
and e["page"] == s_page
|
||||||
|
),
|
||||||
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent
|
||||||
OUTPUT_FOLDER = BASE_DIR / "output"
|
OUTPUT_FOLDER = BASE_DIR / "output"
|
||||||
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def run_ocr_and_extract(pdf_path: str):
|
def run_ocr_and_extract(pdf_path: str):
|
||||||
pdf_path = Path(pdf_path)
|
pdf_path = Path(pdf_path)
|
||||||
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
|
||||||
|
|
@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type", "pdfa",
|
"--output-type",
|
||||||
"--language", "deu+eng",
|
"pdfa",
|
||||||
|
"--language",
|
||||||
|
"deu+eng",
|
||||||
str(pdf_path),
|
str(pdf_path),
|
||||||
str(output_pdf)
|
str(output_pdf),
|
||||||
]
|
]
|
||||||
|
|
||||||
result = subprocess.run(cmd, capture_output=True)
|
result = subprocess.run(cmd, capture_output=True)
|
||||||
|
|
@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str):
|
||||||
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
|
||||||
|
|
||||||
with pdfplumber.open(output_pdf) as pdf:
|
with pdfplumber.open(output_pdf) as pdf:
|
||||||
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
pages = [
|
||||||
|
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
||||||
|
for i, page in enumerate(pdf.pages)
|
||||||
|
]
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
return {
|
return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)}
|
||||||
"ocr_pdf": str(output_pdf),
|
|
||||||
"json_path": str(json_path)
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
|
||||||
nlp = spacy.load(model_path)
|
nlp = spacy.load(model_path)
|
||||||
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
|
input_pdf_path = (
|
||||||
|
Path(__file__).resolve().parent
|
||||||
|
/ ".."
|
||||||
|
/ "ocr_pdf_service"
|
||||||
|
/ "output"
|
||||||
|
/ "pitchbook-OCR.pdf"
|
||||||
|
)
|
||||||
input_pdf = Path(input_pdf_path)
|
input_pdf = Path(input_pdf_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,11 +31,7 @@ def extract_with_spacy(pages_json):
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
results.append({
|
results.append({"label": ent.label_, "entity": ent.text, "page": page_num})
|
||||||
"label": ent.label_,
|
|
||||||
"entity": ent.text,
|
|
||||||
"page": page_num
|
|
||||||
})
|
|
||||||
|
|
||||||
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
output_path = OUTPUT_FOLDER / f"spacy-results.json"
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY,
|
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=BASE_URL
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,10 +40,7 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{"role": "system", "content": "Always respond with a valid JSON object"},
|
||||||
"role": "system",
|
|
||||||
"content": "Always respond with a valid JSON object"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -71,11 +68,12 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:""" + pdf_text
|
Here ist the text:"""
|
||||||
}
|
+ pdf_text,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"}
|
response_format={"type": "json_object"},
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -86,5 +84,4 @@ response = client.chat.completions.create(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
API_KEY = os.getenv("API_KEY")
|
API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=API_KEY,
|
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=BASE_URL
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
|
def extract_text_from_pdf(file_path):
|
||||||
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
|
||||||
all_text = ""
|
all_text = ""
|
||||||
|
|
@ -40,10 +40,7 @@ pdf_text = extract_text_from_pdf(file_path)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{"role": "system", "content": "Always respond with a valid JSON object"},
|
||||||
"role": "system",
|
|
||||||
"content": "Always respond with a valid JSON object"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": """extract the values from the text. let not found values empty:
|
"content": """extract the values from the text. let not found values empty:
|
||||||
|
|
@ -71,11 +68,12 @@ response = client.chat.completions.create(
|
||||||
- the page where this value was found
|
- the page where this value was found
|
||||||
- a confidence score, how confident the model is about the value (low, medium, high)
|
- a confidence score, how confident the model is about the value (low, medium, high)
|
||||||
|
|
||||||
Here ist the text:""" + pdf_text
|
Here ist the text:"""
|
||||||
}
|
+ pdf_text,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
model="gpt-4o-mini",
|
model="gpt-4o-mini",
|
||||||
response_format={"type": "json_object"}
|
response_format={"type": "json_object"},
|
||||||
# temperature=0.7,
|
# temperature=0.7,
|
||||||
# top_p=0.95,
|
# top_p=0.95,
|
||||||
# frequency_penalty=0,
|
# frequency_penalty=0,
|
||||||
|
|
@ -86,5 +84,4 @@ response = client.chat.completions.create(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,2 @@
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}

@ -0,0 +1,17 @@
import streamlit as st
import json

st.title("Neue Kennzahl annotieren")

text = st.text_area("Text", "Das geplante Projektvolumen beträgt 120 Mio. €.")
start = st.number_input("Start-Position", min_value=0, max_value=len(text), value=28)
end = st.number_input("End-Position", min_value=0, max_value=len(text), value=44)
label = st.text_input("Label (z. B. KENNZAHL)", "KENNZAHL")

if st.button("Speichern"):
    example = {"text": text, "entities": [[start, end, label]]}

    with open("annotated_data.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")

    st.success("✅ Annotation gespeichert!")
@ -15,11 +15,9 @@ for page_number in range(len(doc)):
|
||||||
text = page.get_text()
|
text = page.get_text()
|
||||||
spacy_doc = nlp(text)
|
spacy_doc = nlp(text)
|
||||||
for ent in spacy_doc.ents:
|
for ent in spacy_doc.ents:
|
||||||
results.append({
|
results.append(
|
||||||
"label": ent.label_,
|
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1}
|
||||||
"entity": ent.text.strip(),
|
)
|
||||||
"page": page_number + 1
|
|
||||||
})
|
|
||||||
|
|
||||||
with open("entities_output.json", "w", encoding="utf-8") as f:
|
with open("entities_output.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
|
||||||
|
|
@ -226,9 +226,5 @@ TRAINING_DATA = [
|
||||||
(
|
(
|
||||||
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
|
||||||
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
|
||||||
)
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA):
|
||||||
for start, end, label in annot["entities"]:
|
for start, end, label in annot["entities"]:
|
||||||
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
||||||
if span is None:
|
if span is None:
|
||||||
print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
print(
|
||||||
|
f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
ents.append(span)
|
ents.append(span)
|
||||||
print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
|
print(
|
||||||
|
f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
|
||||||
|
)
|
||||||
# label the text with the ents
|
# label the text with the ents
|
||||||
doc.ents = ents
|
doc.ents = ents
|
||||||
db.add(doc)
|
db.add(doc)
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,9 @@ class Server:
|
||||||
server_params = StdioServerParameters(
|
server_params = StdioServerParameters(
|
||||||
command=command,
|
command=command,
|
||||||
args=self.config["args"],
|
args=self.config["args"],
|
||||||
env={**os.environ, **self.config["env"]}
|
env=(
|
||||||
if self.config.get("env")
|
{**os.environ, **self.config["env"]} if self.config.get("env") else None
|
||||||
else None,
|
),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
stdio_transport = await self.exit_stack.enter_async_context(
|
stdio_transport = await self.exit_stack.enter_async_context(
|
||||||
|
|
@ -244,15 +244,10 @@ class LLMClient:
|
||||||
formatted_messages = []
|
formatted_messages = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
print(msg)
|
print(msg)
|
||||||
formatted_messages.append({
|
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
|
||||||
"role": msg["role"],
|
|
||||||
"content": msg["content"]
|
|
||||||
})
|
|
||||||
|
|
||||||
client = AzureOpenAI(
|
client = AzureOpenAI(
|
||||||
api_key=self.api_key,
|
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
|
||||||
api_version="2023-07-01-preview",
|
|
||||||
base_url=url
|
|
||||||
)
|
)
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
messages=formatted_messages,
|
messages=formatted_messages,
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
# server.py
|
# server.py
|
||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
# Create an MCP server
|
# Create an MCP server
|
||||||
mcp = FastMCP("Demo")
|
mcp = FastMCP("Demo")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
OUTPUT_PATH = "mcp_spacy_validated_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities():
|
def load_spacy_entities():
|
||||||
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def get_page_text(pages, page_number):
|
def get_page_text(pages, page_number):
|
||||||
for page in pages:
|
for page in pages:
|
||||||
if page.get("page") == page_number:
|
if page.get("page") == page_number:
|
||||||
return page.get("text", "")
|
return page.get("text", "")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def normalize_entity(entity):
|
def normalize_entity(entity):
|
||||||
return ' '.join(entity.replace('\n', ' ').split())
|
return " ".join(entity.replace("\n", " ").split())
|
||||||
|
|
||||||
|
|
||||||
def validate_entity_with_exxeta(entity, page_num, text):
|
def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
prompt = (
|
prompt = (
|
||||||
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
|
||||||
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
|
||||||
f"Ziel-Formulierung:\n"
|
f"Ziel-Formulierung:\n"
|
||||||
f"\"{entity}\"\n\n"
|
f'"{entity}"\n\n'
|
||||||
f"Validierungsregeln:\n"
|
f"Validierungsregeln:\n"
|
||||||
f"- Groß- und Kleinschreibung ignorieren.\n"
|
f"- Groß- und Kleinschreibung ignorieren.\n"
|
||||||
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
|
||||||
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
|
||||||
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
|
||||||
f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
|
f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n'
|
||||||
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
|
||||||
f"OCR-Text auf Seite {page_num}:\n{text}"
|
f"OCR-Text auf Seite {page_num}:\n{text}"
|
||||||
)
|
)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text):
|
||||||
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities()
|
spacy_entities = load_spacy_entities()
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -81,17 +90,20 @@ def run():
|
||||||
page_text = get_page_text(pitchbook_pages, page)
|
page_text = get_page_text(pitchbook_pages, page)
|
||||||
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
is_valid = validate_entity_with_exxeta(entity, page, page_text)
|
||||||
|
|
||||||
validated_results.append({
|
validated_results.append(
|
||||||
|
{
|
||||||
"label": entity_data.get("label"),
|
"label": entity_data.get("label"),
|
||||||
"entity": raw_entity,
|
"entity": raw_entity,
|
||||||
"page": page,
|
"page": page,
|
||||||
"validated": is_valid
|
"validated": is_valid,
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
||||||
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
json.dump(validated_results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -10,19 +10,23 @@ KPI_SERVICE_MAP = {
|
||||||
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def load_exxeta_entities(path):
|
def load_exxeta_entities(path):
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def normalize(text):
|
def normalize(text):
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
return text.strip().lower().replace(" ", "").replace("/", "/")
|
return text.strip().lower().replace(" ", "").replace("/", "/")
|
||||||
|
|
||||||
|
|
||||||
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities):
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if se_entity == ee_entity:
|
if se_entity == ee_entity:
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "validated"
|
"validation_status": "validated",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
matched = True
|
matched = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not matched:
|
if not matched:
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": se["entity"],
|
"entity": se["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "spacy-only"
|
"validation_status": "spacy-only",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
for ee in exxeta_entries:
|
for ee in exxeta_entries:
|
||||||
ee_entity = normalize(ee["entity"])
|
ee_entity = normalize(ee["entity"])
|
||||||
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
|
||||||
results.append({
|
results.append(
|
||||||
|
{
|
||||||
"kpi": kpi,
|
"kpi": kpi,
|
||||||
"entity": ee["entity"],
|
"entity": ee["entity"],
|
||||||
"page": page,
|
"page": page,
|
||||||
"validation_status": "exxeta-only"
|
"validation_status": "exxeta-only",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def save_results(results, filename):
|
def save_results(results, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
|
||||||
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
|
||||||
|
|
@ -96,5 +108,6 @@ def run():
|
||||||
save_results(all_results, "mcp_validated_result.json")
|
save_results(all_results, "mcp_validated_result.json")
|
||||||
print("✅ Validation complete! Output: mcp_validated_result.json")
|
print("✅ Validation complete! Output: mcp_validated_result.json")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import json
|
||||||
|
|
||||||
MODEL = "gpt-35-turbo"
|
MODEL = "gpt-35-turbo"
|
||||||
|
|
||||||
|
|
||||||
def extract_risikoprofil_from_exxeta(pages_json):
|
def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
@ -16,33 +17,35 @@ def extract_risikoprofil_from_exxeta(pages_json):
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
|
||||||
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
|
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
|
||||||
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
|
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
|
||||||
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
|
||||||
"Beispiele:\n"
|
"Beispiele:\n"
|
||||||
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
|
'- "Core, Core+" → entity: "Core, Core+"\n'
|
||||||
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
|
'- "Core/Core+" → entity: "Core/Core+"\n'
|
||||||
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
|
'- "Core and Core+" → entity: "Core and Core+"\n\n'
|
||||||
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
|
||||||
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
|
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
|
||||||
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
|
||||||
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
|
||||||
"TEXT:\n" + text
|
"TEXT:\n" + text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {EXXETA_API_KEY}"
|
"Authorization": f"Bearer {EXXETA_API_KEY}",
|
||||||
}
|
}
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
{
|
||||||
{"role": "user", "content": prompt}
|
"role": "system",
|
||||||
|
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
def normalize_entity(entity_str):
|
def normalize_entity(entity_str):
|
||||||
if not entity_str:
|
if not entity_str:
|
||||||
return ""
|
return ""
|
||||||
normalized = entity_str.replace('\n', ' ')
|
normalized = entity_str.replace("\n", " ")
|
||||||
normalized = ''.join(normalized.lower().split())
|
normalized = "".join(normalized.lower().split())
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def merge_and_validate_entities(spacy_data, exxeta_data):
|
def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
merged = []
|
merged = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data):
|
||||||
e_page = e["page"]
|
e_page = e["page"]
|
||||||
|
|
||||||
# Match if normalized entity and page match
|
# Match if normalized entity and page match
|
||||||
if (s["label"] == e["label"] and
|
if (
|
||||||
s_entity_norm == e_entity_norm and
|
s["label"] == e["label"]
|
||||||
s_page == e_page):
|
and s_entity_norm == e_entity_norm
|
||||||
|
and s_page == e_page
|
||||||
|
):
|
||||||
|
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "validated"
|
"status": "validated",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
seen.add((e["entity"], e_page))
|
seen.add((e["entity"], e_page))
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no match found, add as single-source
|
# If no match found, add as single-source
|
||||||
if not found:
|
if not found:
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": s["label"],
|
"label": s["label"],
|
||||||
"entity": s["entity"],
|
"entity": s["entity"],
|
||||||
"page": s_page,
|
"page": s_page,
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "spacy"
|
"source": "spacy",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Add remaining Exxeta entities not already processed
|
# Add remaining Exxeta entities not already processed
|
||||||
for e in exxeta_data:
|
for e in exxeta_data:
|
||||||
if (e["entity"], e["page"]) not in seen:
|
if (e["entity"], e["page"]) not in seen:
|
||||||
merged.append({
|
merged.append(
|
||||||
|
{
|
||||||
"label": e["label"],
|
"label": e["label"],
|
||||||
"entity": e["entity"],
|
"entity": e["entity"],
|
||||||
"page": e["page"],
|
"page": e["page"],
|
||||||
"status": "single-source",
|
"status": "single-source",
|
||||||
"source": "exxeta"
|
"source": "exxeta",
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities
|
||||||
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
|
||||||
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
|
||||||
|
|
||||||
|
|
||||||
def load_pitchbook_pages():
|
def load_pitchbook_pages():
|
||||||
path = Path(PITCHBOOK_PATH)
|
path = Path(PITCHBOOK_PATH)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def save_json(data, filename):
|
def save_json(data, filename):
|
||||||
with open(filename, "w", encoding="utf-8") as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def sort_by_page_number(entities):
|
def sort_by_page_number(entities):
|
||||||
return sorted(entities, key=lambda x: x.get("page", 0))
|
return sorted(entities, key=lambda x: x.get("page", 0))
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
spacy_entities = load_spacy_entities(SPACY_PATH)
|
spacy_entities = load_spacy_entities(SPACY_PATH)
|
||||||
pitchbook_pages = load_pitchbook_pages()
|
pitchbook_pages = load_pitchbook_pages()
|
||||||
|
|
@ -33,5 +37,6 @@ def run():
|
||||||
print("- merged_result.json")
|
print("- merged_result.json")
|
||||||
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
print(f"- Total entities in merged result: {len(merged_sorted)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run()
|
run()
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def load_spacy_entities(path):
|
def load_spacy_entities(path):
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -11,15 +11,20 @@ log_folder = Path("logs")
|
||||||
for folder in [output_folder, log_folder]:
|
for folder in [output_folder, log_folder]:
|
||||||
folder.mkdir(parents=True, exist_ok=True)
|
folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_json(pdf_path: Path):
|
def extract_text_to_json(pdf_path: Path):
|
||||||
json_path = output_folder / f"{pdf_path.stem}.json"
|
json_path = output_folder / f"{pdf_path.stem}.json"
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
|
pages = [
|
||||||
|
{"page": i + 1, "text": (page.extract_text() or "").strip()}
|
||||||
|
for i, page in enumerate(pdf.pages)
|
||||||
|
]
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(pages, f, indent=2, ensure_ascii=False)
|
json.dump(pages, f, indent=2, ensure_ascii=False)
|
||||||
print(f"📄 Text JSON saved: {json_path.name}")
|
print(f"📄 Text JSON saved: {json_path.name}")
|
||||||
|
|
||||||
|
|
||||||
def ocr_pdf(input_file: Path):
|
def ocr_pdf(input_file: Path):
|
||||||
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
|
||||||
log_file = log_folder / f"{input_file.stem}.log"
|
log_file = log_folder / f"{input_file.stem}.log"
|
||||||
|
|
@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path):
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
"--output-type", "pdfa",
|
"--output-type",
|
||||||
"--language", "deu+eng",
|
"pdfa",
|
||||||
"--sidecar", str(sidecar_txt),
|
"--language",
|
||||||
|
"deu+eng",
|
||||||
|
"--sidecar",
|
||||||
|
str(sidecar_txt),
|
||||||
str(input_file),
|
str(input_file),
|
||||||
str(output_file)
|
str(output_file),
|
||||||
]
|
]
|
||||||
|
|
||||||
with open(log_file, "w") as log:
|
with open(log_file, "w") as log:
|
||||||
|
|
@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path):
|
||||||
else:
|
else:
|
||||||
print(f"❌ OCR failed. See log: {log_file}")
|
print(f"❌ OCR failed. See log: {log_file}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if not input_folder.exists():
|
if not input_folder.exists():
|
||||||
print("Input folder does not exist!")
|
print("Input folder does not exist!")
|
||||||
|
|
|
||||||
|
|
@ -34,14 +34,14 @@ for ent in doc_ner.ents:
|
||||||
break
|
break
|
||||||
|
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
ner_text_results.append({
|
ner_text_results.append(
|
||||||
"label": ent.label_,
|
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number}
|
||||||
"entity": ent.text.strip(),
|
)
|
||||||
"page": page_number
|
|
||||||
})
|
|
||||||
|
|
||||||
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
||||||
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
|
(output_dir / "ner_text.json").write_text(
|
||||||
|
json.dumps(ner_text_results, indent=2, ensure_ascii=False)
|
||||||
|
)
|
||||||
|
|
||||||
# 2. NER on table cells
|
# 2. NER on table cells
|
||||||
table_ner_results = []
|
table_ner_results = []
|
||||||
|
|
@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1):
|
||||||
doc_cell = nlp(cell)
|
doc_cell = nlp(cell)
|
||||||
for ent in doc_cell.ents:
|
for ent in doc_cell.ents:
|
||||||
if ent.text.strip():
|
if ent.text.strip():
|
||||||
table_ner_results.append({
|
table_ner_results.append(
|
||||||
|
{
|
||||||
"label": ent.label_,
|
"label": ent.label_,
|
||||||
"entity": ent.text.strip(),
|
"entity": ent.text.strip(),
|
||||||
"page": page_number,
|
"page": page_number,
|
||||||
"table": i
|
"table": i,
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
||||||
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
|
(output_dir / "ner_tables.json").write_text(
|
||||||
|
json.dumps(table_ner_results, indent=2, ensure_ascii=False)
|
||||||
|
)
|
||||||
|
|
||||||
print("✅ Done! Extracted data saved to /output")
|
print("✅ Done! Extracted data saved to /output")
|
||||||