Merge branch 'main' of gitMannheim:PSE2_FF/pse2_ff

pull/40/head
Jaronim Pracht 2025-05-26 19:15:26 +02:00
commit c5f3224c68
39 changed files with 637 additions and 307 deletions

.flake8 100644

@ -0,0 +1,3 @@
# .flake8
[flake8]
max-line-length = 88

.pre-commit-config.yaml 100644

@ -0,0 +1,13 @@
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3
        files: ^project/backend/
  - repo: https://github.com/pycqa/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
        files: ^project/backend/

docker-compose.yml 100644

@ -0,0 +1,14 @@
services:
  backend:
    build: ./project/backend
    container_name: fundfuechse-backend
    ports:
      - "5000:5000"
    restart: always
  # frontend:
  #   build: ./project/frontend
  #   container_name: fundfuechse-frontend
  #   ports:
  #     - "3000:80"
  #   restart: always

Dockerfile 100644

@ -0,0 +1,19 @@
# 1. Use a Python base image
FROM python:3.11-alpine

# 2. Set the working directory inside the container
WORKDIR /app

# 3. Copy requirements.txt and install the packages
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 4. Copy the source code (e.g. app.py)
COPY . .

# 5. Start the Flask app
# production-style server with gunicorn
RUN pip install gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]

README.md 100644

@ -0,0 +1,85 @@
## Setup

### Prerequisites
- Python 3.11+
- pip
- Docker (Desktop)
- Optional: `pre-commit`

### Install dependencies
```bash
pip install -r requirements.txt

# Code quality (check locally)
black app.py
flake8 app.py
```

## Running the application

### Option 1: Locally
1. Install the dependencies:
```bash
pip install -r requirements.txt
```
2. Start the Flask app:
```bash
python app.py
```
3. Open it in the browser:
```
http://localhost:5000/
```
---
### Option 2: With Docker
1. Build the image:
```bash
docker build -t fundfuechse-backend .
```
2. Start the container:
```bash
docker run -p 5000:5000 fundfuechse-backend
```
The API then runs at:
```
http://localhost:5000/
```
---
### Option 3: With docker-compose
```bash
docker-compose up --build
```
The service is then reachable at:
```
http://localhost:5000/
```
---
### Test call via curl (upload a PDF)
```bash
curl.exe -X POST -F "file=@Pitchbook 1.pdf" http://localhost:5000/upload
```
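
For reference, the same upload can also be scripted from Python. This is a minimal sketch (not part of the repository), assuming the backend is running locally on port 5000 and the PDF path is adjusted to a real file:

```python
# Hypothetical helper script; requires the `requests` package
import requests

with open("Pitchbook 1.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:5000/upload",
        files={"file": ("Pitchbook 1.pdf", f, "application/pdf")},
    )

print(response.status_code, response.json())
```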

app.py 100644

@ -0,0 +1,66 @@
from flask import Flask, jsonify
from flask import request
import os

app = Flask(__name__)


@app.route("/health")
def health_check():
    return "OK"


# returns an example configuration of the KPIs (for the UI)
@app.route("/config", methods=["GET"])
def get_config():
    config = [
        {"name": "Fondname", "format": "Text", "required": True},
        {"name": "IRR", "format": "Prozent", "required": False},
    ]
    return jsonify(config)


# returns example extraction results
@app.route("/extraction_results", methods=["GET"])
def get_extraction_results():
    results = [
        {"label": "Fondname", "entity": "ABC Fonds", "page": 1, "status": "validated"},
        {
            "label": "IRR",
            "entity": "6,0%",
            "page": 3,
            "status": "single-source",
            "source": "spaCy",
        },
    ]
    return jsonify(results)


# create the upload folder if it does not exist yet
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)


# accepts a PDF file via POST and saves it
@app.route("/upload", methods=["POST"])
def upload_pdf():
    if "file" not in request.files:
        return {"error": "Keine Datei hochgeladen."}, 400

    file = request.files["file"]
    if file.filename == "":
        return {"error": "Dateiname fehlt."}, 400

    if not file.filename.endswith(".pdf"):
        return {"error": "Nur PDF-Dateien erlaubt."}, 400

    file_path = os.path.join(UPLOAD_FOLDER, file.filename)
    file.save(file_path)

    return {"message": f"Datei {file.filename} erfolgreich gespeichert!"}, 200


# important for Docker: host='0.0.0.0'
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0")
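
A quick way to exercise these routes without starting a server is Flask's built-in test client. A minimal sketch (not part of this commit, file name is hypothetical), assuming it sits next to app.py:

```python
# smoke_test.py — illustrative only; assumes app.py is importable from the same directory
from io import BytesIO

from app import app


def main():
    client = app.test_client()

    # /health answers with plain "OK"
    assert client.get("/health").data == b"OK"

    # /config returns the example KPI configuration as JSON
    print(client.get("/config").get_json())

    # a non-PDF upload is rejected with status 400
    resp = client.post("/upload", data={"file": (BytesIO(b"dummy"), "note.txt")})
    assert resp.status_code == 400


if __name__ == "__main__":
    main()
```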

requirements.txt 100644

@ -0,0 +1,4 @@
Flask
black
flake8
pre-commit


@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb')
fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb")
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(fp)
for page in pages:
print('Processing next page...')
print("Processing next page...")
interpreter.process_page(page)
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextBox):
x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
print('At %r is text: %s' % ((x, y), text))
print("At %r is text: %s" % ((x, y), text))


@ -1,5 +1,5 @@
#########################################################
#Run: in Terminal -> streamlit run PyMuPdf_st.py
# Run: in Terminal -> streamlit run PyMuPdf_st.py
#########################################################
import streamlit as st
@ -28,18 +28,14 @@ if uploaded_file and suchwort:
rects = page.search_for(suchwort)
for rect in rects:
fundstellen.append({
"seite": page_num,
"rect": rect
})
fundstellen.append({"seite": page_num, "rect": rect})
if fundstellen:
st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")
# Select which hit to show
auswahl = st.selectbox(
"Fundstelle auswählen:",
[f"Seite {f['seite'] + 1}" for f in fundstellen]
"Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen]
)
index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)


@ -38,7 +38,9 @@ for eintrag in kennzahlen:
highlight = page.add_highlight_annot(rect)
highlight.update()
else:
st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
st.warning(
f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)"
)
except Exception as e:
st.error(f" Fehler bei Eintrag {eintrag}: {e}")
@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1))
# Display the PDF, scrolled to the current page
st.subheader(f"Vorschau")
with open(highlighted_path, "rb") as f:
base64_pdf = base64.b64encode(f.read()).decode('utf-8')
base64_pdf = base64.b64encode(f.read()).decode("utf-8")
# Jump directly to the page
pdf_display = f'''
pdf_display = f"""
<iframe
src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
width="100%" height="800px" type="application/pdf">
</iframe>
'''
"""
st.markdown(pdf_display, unsafe_allow_html=True)


@ -87,9 +87,9 @@ class Server:
server_params = StdioServerParameters(
command=command,
args=self.config["args"],
env={**os.environ, **self.config["env"]}
if self.config.get("env")
else None,
env=(
{**os.environ, **self.config["env"]} if self.config.get("env") else None
),
)
try:
stdio_transport = await self.exit_stack.enter_async_context(
@ -244,28 +244,23 @@ class LLMClient:
formatted_messages = []
for msg in messages:
# print(msg)
formatted_messages.append({
"role": msg["role"],
"content": msg["content"]
})
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
client = AzureOpenAI(
api_key=self.api_key,
api_version="2023-07-01-preview",
base_url=url
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
)
response = client.chat.completions.create(
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
if response.choices[0].message.content:
# print("response: " + response.choices[0].message.content)
return response.choices[0].message.content
@ -412,12 +407,16 @@ class ChatSession:
"4. Use appropriate context from the user's question\n"
"5. Avoid simply repeating the raw data\n\n"
"Please use only the tools that are explicitly defined above."
)
messages = [{"role": "system", "content": system_message}]
messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting."
"For each value there is only one correct answer, try to find it with the tools provided."})
messages.append(
{
"role": "assistant",
"content": "You have to extract data from pdf files and have different tools for extracting."
"For each value there is only one correct answer, try to find it with the tools provided.",
}
)
while True:
try:
@ -455,7 +454,6 @@ class ChatSession:
# messages.append({"role": "assistant", "content": llm_response})
# logging.info("\nFinal response: %s", llm_response)
except KeyboardInterrupt:
logging.info("\nExiting...")
break
@ -476,5 +474,6 @@ async def main() -> None:
chat_session = ChatSession(servers, llm_client)
await chat_session.start()
if __name__ == "__main__":
asyncio.run(main())


@ -8,54 +8,86 @@ mcp = FastMCP("Demo")
risikoProfile = ["Core/Core+, Core", "Value Add"]
risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]
# Add an addition tool
@mcp.tool()
def add(a: int, b: int) -> int:
"""Add two numbers"""
return a + b
@mcp.tool()
def getFromSpaCy() -> list:
"""Get data from SpaCy"""
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"},
{"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}]
return [
{
"page": random.randint(1, 35),
"value": random.choice(risikoProfileSpacy),
"key": "Risiko",
},
{"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"},
]
@mcp.tool()
def getFromChatGPT() -> list:
"""Get data from ChatGPT"""
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"},
{"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}]
return [
{
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": "Risiko",
},
{"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"},
]
@mcp.tool()
def checkSpacyResult() -> dict:
"""This tool checks the result of SpaCy, ensuring it meets certain criteria."""
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}
return {
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": "Risiko",
}
@mcp.tool()
def getFromChatGPTSingle(value: str) -> dict:
"""This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value}
return {
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": value,
}
context = ""
@mcp.tool()
def getContext() -> str:
"""This tool gets context information."""
return context
@mcp.tool()
def setContext(value: str) -> None:
"""This tool sets context information."""
global context
context = value
# Add a dynamic greeting resource
@mcp.resource("greeting://{name}")
def get_greeting(name: str) -> str:
"""Get a personalized greeting"""
return f"Hello, {name}!"
""" Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """
@mcp.tool()
def validate_entity(entity: str, label: str) -> dict:
"""Returns if the entity is valid based on hardcoded rules."""
@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict:
return {"status": "valid", "entity": entity}
return {"status": "invalid", "entity": entity}
""" Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """
@mcp.tool()
def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]:
def merge_spacy_exxeta(
spacy_result: list[dict], exxeta_result: list[dict]
) -> list[dict]:
"""Merge two results, mark as validated if label/entity/page match."""
def norm(e): return e["entity"].lower().replace(" ", "")
def norm(e):
return e["entity"].lower().replace(" ", "")
merged = []
seen = set()
@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
for s in spacy_result:
s_norm = norm(s)
s_page = s["page"]
match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None)
match = next(
(
e
for e in exxeta_result
if e["label"] == s["label"]
and norm(e) == s_norm
and e["page"] == s_page
),
None,
)
if match:
merged.append({**s, "status": "validated"})
seen.add((match["entity"], match["page"]))
@ -88,4 +136,4 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
for e in exxeta_result:
if (e["entity"], e["page"]) not in seen:
merged.append({**e, "status": "exxeta_only"})
return merged
return merged
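
For illustration, a hand-written call with invented data (assuming the decorated function can still be called directly in-process) shows the merge semantics: entries that agree on label, normalized entity, and page are marked "validated", and leftover Exxeta entries are appended as "exxeta_only":

```python
# Hypothetical inputs, not taken from any pitchbook
spacy_result = [
    {"label": "RISIKOPROFIL", "entity": "Core/Core+", "page": 3},
    {"label": "RISIKOPROFIL", "entity": "Value Add", "page": 7},
]
exxeta_result = [
    {"label": "RISIKOPROFIL", "entity": "Core / Core+", "page": 3},  # same entity after norm()
    {"label": "RISIKOPROFIL", "entity": "Opportunistisch", "page": 9},
]

merged = merge_spacy_exxeta(spacy_result, exxeta_result)
# "Core/Core+" matches on label, normalized entity and page -> status "validated";
# the Exxeta-only entry from page 9 is appended with status "exxeta_only".
print(merged)
```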


@ -12,10 +12,12 @@ app = Flask(__name__)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@app.route("/")
def home():
return "Backend is running!"
@app.route("/upload", methods=["POST"])
def upload():
file = request.files.get("file")
@ -44,5 +46,6 @@ def upload():
return "status: complete\n"
if __name__ == "__main__":
app.run(debug=True)
app.run(debug=True)


@ -1,2 +1,2 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"


@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo"
OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
def extract_with_exxeta(pages_json):
results = []
@ -18,33 +19,36 @@ def extract_with_exxeta(pages_json):
continue
prompt = (
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
'- "Core, Core+" → entity: "Core, Core+"\n'
'- "Core/Core+" → entity: "Core/Core+"\n'
'- "Core and Core+" → entity: "Core and Core+"\n\n'
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}
payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
@ -77,4 +81,4 @@ def extract_with_exxeta(pages_json):
with open(out_path, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
return results
return results


@ -1,13 +1,16 @@
from pathlib import Path
import json
def normalize_entity(entity_str):
return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""
return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else ""
def load_json(path: Path):
with path.open("r", encoding="utf-8") as f:
return json.load(f)
def merge_and_validate_entities(filter_label=None):
base = Path(__file__).resolve().parent.parent
spacy_path = base / "spacy_service/output/spacy-results.json"
@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None):
s_page = s["page"]
match = next(
(e for e in exxeta_data
if e["label"] == s["label"] and
normalize_entity(e["entity"]) == s_norm and
e["page"] == s_page),
None
(
e
for e in exxeta_data
if e["label"] == s["label"]
and normalize_entity(e["entity"]) == s_norm
and e["page"] == s_page
),
None,
)
if match:


@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent
OUTPUT_FOLDER = BASE_DIR / "output"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
def run_ocr_and_extract(pdf_path: str):
pdf_path = Path(pdf_path)
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"
@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str):
cmd = [
"ocrmypdf",
"--force-ocr",
"--output-type", "pdfa",
"--language", "deu+eng",
"--output-type",
"pdfa",
"--language",
"deu+eng",
str(pdf_path),
str(output_pdf)
str(output_pdf),
]
result = subprocess.run(cmd, capture_output=True)
@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str):
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")
with pdfplumber.open(output_pdf) as pdf:
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
pages = [
{"page": i + 1, "text": (page.extract_text() or "").strip()}
for i, page in enumerate(pdf.pages)
]
with open(json_path, "w", encoding="utf-8") as f:
json.dump(pages, f, indent=2, ensure_ascii=False)
return {
"ocr_pdf": str(output_pdf),
"json_path": str(json_path)
}
return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)}


@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
nlp = spacy.load(model_path)
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
input_pdf_path = (
Path(__file__).resolve().parent
/ ".."
/ "ocr_pdf_service"
/ "output"
/ "pitchbook-OCR.pdf"
)
input_pdf = Path(input_pdf_path)
@ -25,14 +31,10 @@ def extract_with_spacy(pages_json):
doc = nlp(text)
for ent in doc.ents:
results.append({
"label": ent.label_,
"entity": ent.text,
"page": page_num
})
results.append({"label": ent.label_, "entity": ent.text, "page": page_num})
output_path = OUTPUT_FOLDER / f"spacy-results.json"
with open(output_path, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
return results
return results


@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
API_KEY = os.getenv("API_KEY")
client = AzureOpenAI(
api_key=API_KEY,
api_version="2023-07-01-preview",
base_url=BASE_URL
)
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
)
def extract_text_from_pdf(file_path):
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
all_text = ""
@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)
response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "Always respond with a valid JSON object"
},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
messages=[
{"role": "system", "content": "Always respond with a valid JSON object"},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
-Fondsname
-Fondsmanager
-Name Kapitalverwaltungsgesellschaft
@ -71,20 +68,20 @@ response = client.chat.completions.create(
- the page where this value was found
- a confidence score, how confident the model is about the value (low, medium, high)
Here ist the text:""" + pdf_text
}
],
model="gpt-4o-mini",
response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
Here ist the text:"""
+ pdf_text,
},
],
model="gpt-4o-mini",
response_format={"type": "json_object"},
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
print(response.choices[0].message.content)


@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
API_KEY = os.getenv("API_KEY")
client = AzureOpenAI(
api_key=API_KEY,
api_version="2023-07-01-preview",
base_url=BASE_URL
)
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
)
def extract_text_from_pdf(file_path):
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
all_text = ""
@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)
response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "Always respond with a valid JSON object"
},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
messages=[
{"role": "system", "content": "Always respond with a valid JSON object"},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
-Fondsname
-Fondsmanager
-Name Kapitalverwaltungsgesellschaft
@ -71,20 +68,20 @@ response = client.chat.completions.create(
- the page where this value was found
- a confidence score, how confident the model is about the value (low, medium, high)
Here ist the text:""" + pdf_text
}
],
model="gpt-4o-mini",
response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
Here ist the text:"""
+ pdf_text,
},
],
model="gpt-4o-mini",
response_format={"type": "json_object"},
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
print(response.choices[0].message.content)


@ -0,0 +1,2 @@
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}
{"text": "Das geplante Projektvolumen beträgt 120 Mio. €.", "entities": [[28, 44, "KENNZAHL"]]}


@ -2,7 +2,7 @@ import spacy
from spacy.tokens import DocBin
from training_data import TRAINING_DATA
nlp = spacy.blank("de")
nlp = spacy.blank("de")
doc_bin = DocBin()
for text, annotations in TRAINING_DATA:
@ -17,4 +17,4 @@ for text, annotations in TRAINING_DATA:
doc.ents = ents
doc_bin.add(doc)
doc_bin.to_disk("data/train.spacy")
doc_bin.to_disk("data/train.spacy")


@ -0,0 +1,17 @@
import streamlit as st
import json
st.title("Neue Kennzahl annotieren")
text = st.text_area("Text", "Das geplante Projektvolumen beträgt 120 Mio. €.")
start = st.number_input("Start-Position", min_value=0, max_value=len(text), value=28)
end = st.number_input("End-Position", min_value=0, max_value=len(text), value=44)
label = st.text_input("Label (z.B. KENNZAHL)", "KENNZAHL")
if st.button("Speichern"):
example = {"text": text, "entities": [[start, end, label]]}
with open("annotated_data.json", "a", encoding="utf-8") as f:
f.write(json.dumps(example, ensure_ascii=False) + "\n")
st.success("✅ Annotation gespeichert!")
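
Each click appends one JSON object per line (JSONL). A small sketch, not part of this commit, for reading those lines back into the (text, annotations) tuples used by the spaCy training scripts below:

```python
# load_annotations.py — hypothetical helper; assumes annotated_data.json is JSON Lines
import json


def load_annotated_examples(path="annotated_data.json"):
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            # (text, {"entities": [[start, end, label], ...]}) — the TRAINING_DATA format
            examples.append((record["text"], {"entities": record["entities"]}))
    return examples


if __name__ == "__main__":
    for text, annotations in load_annotated_examples():
        print(text, annotations["entities"])
```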


@ -15,13 +15,11 @@ for page_number in range(len(doc)):
text = page.get_text()
spacy_doc = nlp(text)
for ent in spacy_doc.ents:
results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number + 1
})
results.append(
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1}
)
with open("entities_output.json", "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print("✅ Extraction completed. Results saved to 'entities_output.json'")
print("✅ Extraction completed. Results saved to 'entities_output.json'")


@ -71,33 +71,33 @@ TRAINING_DATA = [
"core, core+, value-added",
{"entities": [[0, 24, "RISIKOPROFIL"]]},
),
(
"Manage to Core: max 20%",
{"entities": [[10, 14, "RISIKOPROFIL"]]},
),
(
"Benefits of the core/ core+ segment",
{"entities": [[16, 27, "RISIKOPROFIL"]]},
),
(
"Drawbacks of the core/ core+ segment",
{"entities": [[17, 28, "RISIKOPROFIL"]]},
),
(
"Why a Core / Core + investment program?",
{"entities": [[6, 19, "RISIKOPROFIL"]]},
),
(
"Different risk profile (core, core+, value-added)",
{"entities": [[24, 48, "RISIKOPROFIL"]]},
),
(
(
"Manage to Core: max 20%",
{"entities": [[10, 14, "RISIKOPROFIL"]]},
),
(
"Benefits of the core/ core+ segment",
{"entities": [[16, 27, "RISIKOPROFIL"]]},
),
(
"Drawbacks of the core/ core+ segment",
{"entities": [[17, 28, "RISIKOPROFIL"]]},
),
(
"Why a Core / Core + investment program?",
{"entities": [[6, 19, "RISIKOPROFIL"]]},
),
(
"Different risk profile (core, core+, value-added)",
{"entities": [[24, 48, "RISIKOPROFIL"]]},
),
(
"INK MGallery Hotel Area: Amsterdam Core Tenant: Closed in 2018",
{"entities": [[35, 39, "RISIKOPROFIL"]]},
),
(
"A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.",
{"entities": [[34, 48, "RISIKOPROFIL"]]},
),
(
"A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.",
{"entities": [[34, 48, "RISIKOPROFIL"]]},
),
(
"Navigate the diversity of the Core/Core+ investment opportunities in European Prime Cities",
@ -226,9 +226,5 @@ TRAINING_DATA = [
(
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
)
),
]


@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA):
for start, end, label in annot["entities"]:
span = doc.char_span(start, end, label=label, alignment_mode="contract")
if span is None:
print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
print(
f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
)
else:
ents.append(span)
print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
print(
f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
)
# label the text with the ents
doc.ents = ents
db.add(doc)


@ -87,9 +87,9 @@ class Server:
server_params = StdioServerParameters(
command=command,
args=self.config["args"],
env={**os.environ, **self.config["env"]}
if self.config.get("env")
else None,
env=(
{**os.environ, **self.config["env"]} if self.config.get("env") else None
),
)
try:
stdio_transport = await self.exit_stack.enter_async_context(
@ -244,28 +244,23 @@ class LLMClient:
formatted_messages = []
for msg in messages:
print(msg)
formatted_messages.append({
"role": msg["role"],
"content": msg["content"]
})
formatted_messages.append({"role": msg["role"], "content": msg["content"]})
client = AzureOpenAI(
api_key=self.api_key,
api_version="2023-07-01-preview",
base_url=url
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
)
response = client.chat.completions.create(
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
if response.choices[0].message.content:
print("response: " + response.choices[0].message.content)
return response.choices[0].message.content


@ -1,5 +1,6 @@
# server.py
from mcp.server.fastmcp import FastMCP
# Create an MCP server
mcp = FastMCP("Demo")


@ -1,3 +1,3 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
MODEL_ID = "gpt-35-turbo"
MODEL_ID = "gpt-35-turbo"


@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
OUTPUT_PATH = "mcp_spacy_validated_result.json"
def load_spacy_entities():
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
return json.load(f)
def load_pitchbook_pages():
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
return json.load(f)
def get_page_text(pages, page_number):
for page in pages:
if page.get("page") == page_number:
return page.get("text", "")
return ""
def normalize_entity(entity):
return ' '.join(entity.replace('\n', ' ').split())
return " ".join(entity.replace("\n", " ").split())
def validate_entity_with_exxeta(entity, page_num, text):
prompt = (
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
f"Ziel-Formulierung:\n"
f"\"{entity}\"\n\n"
f'"{entity}"\n\n'
f"Validierungsregeln:\n"
f"- Groß- und Kleinschreibung ignorieren.\n"
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n'
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
f"OCR-Text auf Seite {page_num}:\n{text}"
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}
payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text):
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
return False
def run():
spacy_entities = load_spacy_entities()
pitchbook_pages = load_pitchbook_pages()
@ -81,17 +90,20 @@ def run():
page_text = get_page_text(pitchbook_pages, page)
is_valid = validate_entity_with_exxeta(entity, page, page_text)
validated_results.append({
"label": entity_data.get("label"),
"entity": raw_entity,
"page": page,
"validated": is_valid
})
validated_results.append(
{
"label": entity_data.get("label"),
"entity": raw_entity,
"page": page,
"validated": is_valid,
}
)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
json.dump(validated_results, f, indent=2, ensure_ascii=False)
print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
if __name__ == "__main__":
run()
run()


@ -10,19 +10,23 @@ KPI_SERVICE_MAP = {
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"
def load_spacy_entities(path):
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def load_exxeta_entities(path):
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def normalize(text):
if not text:
return ""
return text.strip().lower().replace(" ", "").replace("/", "/")
def validate_kpi(kpi, spacy_entities, exxeta_entities):
results = []
@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities):
for ee in exxeta_entries:
ee_entity = normalize(ee["entity"])
if se_entity == ee_entity:
results.append({
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "validated"
})
results.append(
{
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "validated",
}
)
matched = True
break
if not matched:
results.append({
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "spacy-only"
})
results.append(
{
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "spacy-only",
}
)
for ee in exxeta_entries:
ee_entity = normalize(ee["entity"])
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
results.append({
"kpi": kpi,
"entity": ee["entity"],
"page": page,
"validation_status": "exxeta-only"
})
results.append(
{
"kpi": kpi,
"entity": ee["entity"],
"page": page,
"validation_status": "exxeta-only",
}
)
return results
def save_results(results, filename):
with open(filename, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
def run():
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)
@ -96,5 +108,6 @@ def run():
save_results(all_results, "mcp_validated_result.json")
print("✅ Validation complete! Output: mcp_validated_result.json")
if __name__ == "__main__":
run()


@ -1,3 +1,3 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
MODEL_ID = "gpt-35-turbo"
MODEL_ID = "gpt-35-turbo"


@ -4,6 +4,7 @@ import json
MODEL = "gpt-35-turbo"
def extract_risikoprofil_from_exxeta(pages_json):
results = []
@ -15,34 +16,36 @@ def extract_risikoprofil_from_exxeta(pages_json):
continue
prompt = (
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
'- "Core, Core+" → entity: "Core, Core+"\n'
'- "Core/Core+" → entity: "Core/Core+"\n'
'- "Core and Core+" → entity: "Core and Core+"\n\n'
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}
payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}
url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"
@ -71,4 +74,4 @@ def extract_risikoprofil_from_exxeta(pages_json):
except Exception as e:
print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}")
return results
return results


@ -1,10 +1,11 @@
def normalize_entity(entity_str):
if not entity_str:
return ""
normalized = entity_str.replace('\n', ' ')
normalized = ''.join(normalized.lower().split())
normalized = entity_str.replace("\n", " ")
normalized = "".join(normalized.lower().split())
return normalized
def merge_and_validate_entities(spacy_data, exxeta_data):
merged = []
seen = set()
@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data):
e_page = e["page"]
# Match if normalized entity and page match
if (s["label"] == e["label"] and
s_entity_norm == e_entity_norm and
s_page == e_page):
if (
s["label"] == e["label"]
and s_entity_norm == e_entity_norm
and s_page == e_page
):
merged.append({
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "validated"
})
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "validated",
}
)
seen.add((e["entity"], e_page))
found = True
break
# If no match found, add as single-source
if not found:
merged.append({
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "single-source",
"source": "spacy"
})
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "single-source",
"source": "spacy",
}
)
# Add remaining Exxeta entities not already processed
for e in exxeta_data:
if (e["entity"], e["page"]) not in seen:
merged.append({
"label": e["label"],
"entity": e["entity"],
"page": e["page"],
"status": "single-source",
"source": "exxeta"
})
merged.append(
{
"label": e["label"],
"entity": e["entity"],
"page": e["page"],
"status": "single-source",
"source": "exxeta",
}
)
return merged
return merged


@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
def load_pitchbook_pages():
path = Path(PITCHBOOK_PATH)
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def save_json(data, filename):
with open(filename, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
def sort_by_page_number(entities):
return sorted(entities, key=lambda x: x.get("page", 0))
def run():
spacy_entities = load_spacy_entities(SPACY_PATH)
pitchbook_pages = load_pitchbook_pages()
@ -33,5 +37,6 @@ def run():
print("- merged_result.json")
print(f"- Total entities in merged result: {len(merged_sorted)}")
if __name__ == "__main__":
run()
run()


@ -1,7 +1,8 @@
import json
from pathlib import Path
def load_spacy_entities(path):
path = Path(path)
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
return json.load(f)


@ -11,15 +11,20 @@ log_folder = Path("logs")
for folder in [output_folder, log_folder]:
folder.mkdir(parents=True, exist_ok=True)
def extract_text_to_json(pdf_path: Path):
json_path = output_folder / f"{pdf_path.stem}.json"
with pdfplumber.open(pdf_path) as pdf:
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
pages = [
{"page": i + 1, "text": (page.extract_text() or "").strip()}
for i, page in enumerate(pdf.pages)
]
with open(json_path, "w", encoding="utf-8") as f:
json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"📄 Text JSON saved: {json_path.name}")
def ocr_pdf(input_file: Path):
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
log_file = log_folder / f"{input_file.stem}.log"
@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path):
cmd = [
"ocrmypdf",
"--force-ocr",
"--output-type", "pdfa",
"--language", "deu+eng",
"--sidecar", str(sidecar_txt),
"--output-type",
"pdfa",
"--language",
"deu+eng",
"--sidecar",
str(sidecar_txt),
str(input_file),
str(output_file)
str(output_file),
]
with open(log_file, "w") as log:
@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path):
else:
print(f"❌ OCR failed. See log: {log_file}")
if __name__ == "__main__":
if not input_folder.exists():
print("Input folder does not exist!")
@ -54,4 +63,4 @@ if __name__ == "__main__":
else:
for pdf in pdfs:
print(f"Processing: {pdf.name}")
ocr_pdf(pdf)
ocr_pdf(pdf)


@ -1,4 +1,4 @@
import pdfplumber
import pdfplumber
pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf"
@ -10,7 +10,7 @@ pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf"
# # Print the extracted text with preserved structure
# print(f"Page {page.page_number}:\n{page_text}\n")
with pdfplumber.open(pdf_path) as pdf:
with pdfplumber.open(pdf_path) as pdf:
for i, page in enumerate(pdf.pages):
tables = page.extract_tables()


@ -1,6 +1,6 @@
# https://github.com/explosion/spacy-layout
### Run with: python extract_pitchbooks.py
import spacy
import spacy
from spacy_layout import spaCyLayout
from pathlib import Path
import pandas as pd
@ -34,14 +34,14 @@ for ent in doc_ner.ents:
break
if ent.text.strip():
ner_text_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number
})
ner_text_results.append(
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number}
)
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
(output_dir / "ner_text.json").write_text(
json.dumps(ner_text_results, indent=2, ensure_ascii=False)
)
# 2. NER on table cells
table_ner_results = []
@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1):
doc_cell = nlp(cell)
for ent in doc_cell.ents:
if ent.text.strip():
table_ner_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number,
"table": i
})
table_ner_results.append(
{
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number,
"table": i,
}
)
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
(output_dir / "ner_tables.json").write_text(
json.dumps(table_ner_results, indent=2, ensure_ascii=False)
)
print("✅ Done! Extracted data saved to /output")
print("✅ Done! Extracted data saved to /output")