Backend Flask aufsetzen (Ticket #4)
parent ed7e01a395
commit cd6c66a1fd

@@ -4,8 +4,10 @@ repos:
hooks:
- id: black
language_version: python3
files: ^project/backend/

- repo: https://github.com/pycqa/flake8
rev: 6.1.0
hooks:
- id: flake8
files: ^project/backend/

@@ -4,7 +4,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator

fp = open('Teaser_5_OCR-MY-PDF.pdf', 'rb')
fp = open("Teaser_5_OCR-MY-PDF.pdf", "rb")
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

@@ -12,10 +12,10 @@ interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(fp)

for page in pages:
print('Processing next page...')
print("Processing next page...")
interpreter.process_page(page)
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextBox):
x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
print('At %r is text: %s' % ((x, y), text))
print("At %r is text: %s" % ((x, y), text))

@@ -1,5 +1,5 @@
#########################################################
#Run: in Terminal -> streamlit run PyMuPdf_st.py
# Run: in Terminal -> streamlit run PyMuPdf_st.py
#########################################################

import streamlit as st

@@ -28,18 +28,14 @@ if uploaded_file and suchwort:
rects = page.search_for(suchwort)

for rect in rects:
fundstellen.append({
"seite": page_num,
"rect": rect
})
fundstellen.append({"seite": page_num, "rect": rect})

if fundstellen:
st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")

# Auswahl der Fundstelle
auswahl = st.selectbox(
"Fundstelle auswählen:",
[f"Seite {f['seite'] + 1}" for f in fundstellen]
"Fundstelle auswählen:", [f"Seite {f['seite'] + 1}" for f in fundstellen]
)

index = [f"Seite {f['seite'] + 1}" for f in fundstellen].index(auswahl)

@@ -38,7 +38,9 @@ for eintrag in kennzahlen:
highlight = page.add_highlight_annot(rect)
highlight.update()
else:
st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
st.warning(
f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)"
)
except Exception as e:
st.error(f" Fehler bei Eintrag {eintrag}: {e}")

@@ -68,13 +70,13 @@ aktuelle_seite = int(query_params.get("seite", 1))
# PDF anzeigen mit Scroll zu aktueller Seite
st.subheader(f"Vorschau")
with open(highlighted_path, "rb") as f:
base64_pdf = base64.b64encode(f.read()).decode('utf-8')
base64_pdf = base64.b64encode(f.read()).decode("utf-8")

# Seite direkt ansteuern
pdf_display = f'''
pdf_display = f"""
<iframe
src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
width="100%" height="800px" type="application/pdf">
</iframe>
'''
"""
st.markdown(pdf_display, unsafe_allow_html=True)

@@ -87,9 +87,9 @@ class Server:
server_params = StdioServerParameters(
command=command,
args=self.config["args"],
env={**os.environ, **self.config["env"]}
if self.config.get("env")
else None,
env=(
{**os.environ, **self.config["env"]} if self.config.get("env") else None
),
)
try:
stdio_transport = await self.exit_stack.enter_async_context(

@@ -244,28 +244,23 @@ class LLMClient:
formatted_messages = []
for msg in messages:
# print(msg)
formatted_messages.append({
"role": msg["role"],
"content": msg["content"]
})
formatted_messages.append({"role": msg["role"], "content": msg["content"]})

client = AzureOpenAI(
api_key=self.api_key,
api_version="2023-07-01-preview",
base_url=url
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
)
response = client.chat.completions.create(
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
if response.choices[0].message.content:
# print("response: " + response.choices[0].message.content)
return response.choices[0].message.content

@@ -412,12 +407,16 @@ class ChatSession:
"4. Use appropriate context from the user's question\n"
"5. Avoid simply repeating the raw data\n\n"
"Please use only the tools that are explicitly defined above."

)

messages = [{"role": "system", "content": system_message}]
messages.append({"role": "assistant", "content": "You have to extract data from pdf files and have different tools for extracting."
"For each value there is only one correct answer, try to find it with the tools provided."})
messages.append(
{
"role": "assistant",
"content": "You have to extract data from pdf files and have different tools for extracting."
"For each value there is only one correct answer, try to find it with the tools provided.",
}
)

while True:
try:

@@ -455,7 +454,6 @@ class ChatSession:
# messages.append({"role": "assistant", "content": llm_response})
# logging.info("\nFinal response: %s", llm_response)

except KeyboardInterrupt:
logging.info("\nExiting...")
break

@@ -476,5 +474,6 @@ async def main() -> None:
chat_session = ChatSession(servers, llm_client)
await chat_session.start()

if __name__ == "__main__":
asyncio.run(main())

@@ -8,54 +8,86 @@ mcp = FastMCP("Demo")
risikoProfile = ["Core/Core+, Core", "Value Add"]
risikoProfileSpacy = ["Core/Core+, Core", "Value Add", "3.2", "e au uae"]

# Add an addition tool
@mcp.tool()
def add(a: int, b: int) -> int:
"""Add two numbers"""
return a + b

@mcp.tool()
def getFromSpaCy() -> list:
"""Get data from SpaCy"""
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfileSpacy), "key": "Risiko"},
{"page":random.randint(1, 35), "value": "Real Estate", "key": "FondName"}]
return [
{
"page": random.randint(1, 35),
"value": random.choice(risikoProfileSpacy),
"key": "Risiko",
},
{"page": random.randint(1, 35), "value": "Real Estate", "key": "FondName"},
]

@mcp.tool()
def getFromChatGPT() -> list:
"""Get data from ChatGPT"""
return [{"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"},
{"page":random.randint(1, 35), "value": "Real False Name", "key": "FondName"}]
return [
{
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": "Risiko",
},
{"page": random.randint(1, 35), "value": "Real False Name", "key": "FondName"},
]

@mcp.tool()
def checkSpacyResult() -> dict:
"""This tool checks the result of SpaCy, ensuring it meets certain criteria."""
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": "Risiko"}
return {
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": "Risiko",
}

@mcp.tool()
def getFromChatGPTSingle(value: str) -> dict:
"""This tool get a single value from ChatGPT. You can use the value to specify for which key the value should calculated"""
return {"page":random.randint(1, 35), "value": random.choice(risikoProfile), "key": value}
return {
"page": random.randint(1, 35),
"value": random.choice(risikoProfile),
"key": value,
}

context = ""

@mcp.tool()
def getContext() -> str:
"""This tool gets context information."""
return context

@mcp.tool()
def setContext(value: str) -> None:
"""This tool sets context information."""
global context
context = value

# Add a dynamic greeting resource
@mcp.resource("greeting://{name}")
def get_greeting(name: str) -> str:
"""Get a personalized greeting"""
return f"Hello, {name}!"

""" Example prompt: Get data from spacy and exxeta and merge them. Validate if Core+ is a valid RISIKOPROFIL. """

@mcp.tool()
def validate_entity(entity: str, label: str) -> dict:
"""Returns if the entity is valid based on hardcoded rules."""

@@ -66,11 +98,18 @@ def validate_entity(entity: str, label: str) -> dict:
return {"status": "valid", "entity": entity}
return {"status": "invalid", "entity": entity}

""" Example prompt: Get spacy and exxeta results and merge them. Then validate if "Core/Core+" is a valid Risikoprofil. """

@mcp.tool()
def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> list[dict]:
def merge_spacy_exxeta(
spacy_result: list[dict], exxeta_result: list[dict]
) -> list[dict]:
"""Merge two results, mark as validated if label/entity/page match."""
def norm(e): return e["entity"].lower().replace(" ", "")

def norm(e):
return e["entity"].lower().replace(" ", "")

merged = []
seen = set()

@@ -78,7 +117,16 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
for s in spacy_result:
s_norm = norm(s)
s_page = s["page"]
match = next((e for e in exxeta_result if e["label"] == s["label"] and norm(e) == s_norm and e["page"] == s_page), None)
match = next(
(
e
for e in exxeta_result
if e["label"] == s["label"]
and norm(e) == s_norm
and e["page"] == s_page
),
None,
)
if match:
merged.append({**s, "status": "validated"})
seen.add((match["entity"], match["page"]))

@@ -88,4 +136,4 @@ def merge_spacy_exxeta(spacy_result: list[dict], exxeta_result: list[dict]) -> l
for e in exxeta_result:
if (e["entity"], e["page"]) not in seen:
merged.append({**e, "status": "exxeta_only"})
return merged
return merged

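Editor's note: a minimal, hypothetical sketch (not part of the commit) of how the merged output of the tool above looks with toy inputs; the field names label/entity/page/status come from the diff, all sample values are invented.

# Hypothetical illustration only - sample data is made up.
spacy_result = [{"label": "RISIKOPROFIL", "entity": "Core/Core+", "page": 3}]
exxeta_result = [
    {"label": "RISIKOPROFIL", "entity": "core / core+", "page": 3},
    {"label": "RISIKOPROFIL", "entity": "Value Add", "page": 7},
]
print(merge_spacy_exxeta(spacy_result, exxeta_result))
# The first entry is marked "validated" (same label and page, entities equal after norm()),
# the unmatched Exxeta entry is appended with status "exxeta_only".
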
@@ -12,10 +12,12 @@ app = Flask(__name__)
UPLOAD_FOLDER = Path("pitchbooks")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

@app.route("/")
def home():
return "Backend is running!"

@app.route("/upload", methods=["POST"])
def upload():
file = request.files.get("file")

@@ -44,5 +46,6 @@ def upload():

return "status: complete\n"

if __name__ == "__main__":
app.run(debug=True)
app.run(debug=True)

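Editor's note: the /upload route accepts a multipart form field named "file". A minimal client-side sketch, assuming the requests package and the default Flask development server on port 5000 (neither is specified in the hunk):

# Hypothetical usage sketch, not part of the commit.
import requests

with open("Pitchbook 1.pdf", "rb") as fh:
    resp = requests.post("http://127.0.0.1:5000/upload", files={"file": fh})
print(resp.text)  # the handler ends with "status: complete\n"
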
@@ -1,2 +1,2 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"

@@ -7,6 +7,7 @@ MODEL = "gpt-35-turbo"
OUTPUT_FOLDER = Path(__file__).resolve().parent / "output"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

def extract_with_exxeta(pages_json):
results = []

@@ -18,33 +19,36 @@ def extract_with_exxeta(pages_json):
continue

prompt = (
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
'- "Core, Core+" → entity: "Core, Core+"\n'
'- "Core/Core+" → entity: "Core/Core+"\n'
'- "Core and Core+" → entity: "Core and Core+"\n\n'
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
)

headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}

payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}

url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

@@ -77,4 +81,4 @@ def extract_with_exxeta(pages_json):
with open(out_path, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

return results
return results

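Editor's note: the hunk builds headers, payload and url, but the HTTP call itself lies outside the shown context. A sketch of how such a payload is typically posted, assuming the requests library (hypothetical here, not confirmed by the diff):

# Hypothetical sketch - the actual request code is not visible in this hunk.
import requests

resp = requests.post(url, headers=headers, json=payload, timeout=60)
resp.raise_for_status()
content = resp.json()["choices"][0]["message"]["content"]
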
@@ -1,13 +1,16 @@
from pathlib import Path
import json

def normalize_entity(entity_str):
return ''.join(entity_str.replace('\n', ' ').lower().split()) if entity_str else ""
return "".join(entity_str.replace("\n", " ").lower().split()) if entity_str else ""

def load_json(path: Path):
with path.open("r", encoding="utf-8") as f:
return json.load(f)

def merge_and_validate_entities(filter_label=None):
base = Path(__file__).resolve().parent.parent
spacy_path = base / "spacy_service/output/spacy-results.json"

@@ -25,11 +28,14 @@ def merge_and_validate_entities(filter_label=None):
s_page = s["page"]

match = next(
(e for e in exxeta_data
if e["label"] == s["label"] and
normalize_entity(e["entity"]) == s_norm and
e["page"] == s_page),
None
(
e
for e in exxeta_data
if e["label"] == s["label"]
and normalize_entity(e["entity"]) == s_norm
and e["page"] == s_page
),
None,
)

if match:

@@ -7,6 +7,7 @@ BASE_DIR = Path(__file__).resolve().parent
OUTPUT_FOLDER = BASE_DIR / "output"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

def run_ocr_and_extract(pdf_path: str):
pdf_path = Path(pdf_path)
output_pdf = OUTPUT_FOLDER / f"pitchbook-OCR.pdf"

@@ -16,10 +17,12 @@ def run_ocr_and_extract(pdf_path: str):
cmd = [
"ocrmypdf",
"--force-ocr",
"--output-type", "pdfa",
"--language", "deu+eng",
"--output-type",
"pdfa",
"--language",
"deu+eng",
str(pdf_path),
str(output_pdf)
str(output_pdf),
]

result = subprocess.run(cmd, capture_output=True)

@@ -28,12 +31,12 @@ def run_ocr_and_extract(pdf_path: str):
raise RuntimeError(f"OCR failed:\n{result.stderr.decode()}")

with pdfplumber.open(output_pdf) as pdf:
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
pages = [
{"page": i + 1, "text": (page.extract_text() or "").strip()}
for i, page in enumerate(pdf.pages)
]

with open(json_path, "w", encoding="utf-8") as f:
json.dump(pages, f, indent=2, ensure_ascii=False)

return {
"ocr_pdf": str(output_pdf),
"json_path": str(json_path)
}
return {"ocr_pdf": str(output_pdf), "json_path": str(json_path)}

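Editor's note: run_ocr_and_extract now returns a compact dict; a minimal usage sketch (the sample path is invented):

# Hypothetical usage sketch.
paths = run_ocr_and_extract("pitchbooks/Pitchbook 1.pdf")
print(paths["ocr_pdf"])    # OCRed PDF/A written to the service's output folder
print(paths["json_path"])  # page-wise text dump produced with pdfplumber
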
@@ -9,7 +9,13 @@ OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

model_path = os.path.join(os.path.dirname(__file__), "models", "model-last")
nlp = spacy.load(model_path)
input_pdf_path = Path(__file__).resolve().parent / ".." / "ocr_pdf_service" / "output" / "pitchbook-OCR.pdf"
input_pdf_path = (
Path(__file__).resolve().parent
/ ".."
/ "ocr_pdf_service"
/ "output"
/ "pitchbook-OCR.pdf"
)
input_pdf = Path(input_pdf_path)
doc = fitz.open(input_pdf)

@@ -26,14 +32,10 @@ def extract_with_spacy(pages_json):

doc = nlp(text)
for ent in doc.ents:
results.append({
"label": ent.label_,
"entity": ent.text,
"page": page_num
})
results.append({"label": ent.label_, "entity": ent.text, "page": page_num})

output_path = OUTPUT_FOLDER / f"spacy-results.json"
with open(output_path, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

return results
return results

@@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
API_KEY = os.getenv("API_KEY")

client = AzureOpenAI(
api_key=API_KEY,
api_version="2023-07-01-preview",
base_url=BASE_URL
)
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
)

def extract_text_from_pdf(file_path):
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
all_text = ""

@@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)

response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "Always respond with a valid JSON object"
},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
messages=[
{"role": "system", "content": "Always respond with a valid JSON object"},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
-Fondsname
-Fondsmanager
-Name Kapitalverwaltungsgesellschaft

@@ -71,20 +68,20 @@ response = client.chat.completions.create(
- the page where this value was found
- a confidence score, how confident the model is about the value (low, medium, high)

Here ist the text:""" + pdf_text
}
],
model="gpt-4o-mini",
response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)

Here ist the text:"""
+ pdf_text,
},
],
model="gpt-4o-mini",
response_format={"type": "json_object"},
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)

print(response.choices[0].message.content)

@@ -9,10 +9,10 @@ BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
API_KEY = os.getenv("API_KEY")

client = AzureOpenAI(
api_key=API_KEY,
api_version="2023-07-01-preview",
base_url=BASE_URL
)
api_key=API_KEY, api_version="2023-07-01-preview", base_url=BASE_URL
)

def extract_text_from_pdf(file_path):
"""Extract text content from a PDF file using PyMuPDF (fitz)."""
all_text = ""

@@ -39,14 +39,11 @@ file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)

response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "Always respond with a valid JSON object"
},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
messages=[
{"role": "system", "content": "Always respond with a valid JSON object"},
{
"role": "user",
"content": """extract the values from the text. let not found values empty:
-Fondsname
-Fondsmanager
-Name Kapitalverwaltungsgesellschaft

@@ -71,20 +68,20 @@ response = client.chat.completions.create(
- the page where this value was found
- a confidence score, how confident the model is about the value (low, medium, high)

Here ist the text:""" + pdf_text
}
],
model="gpt-4o-mini",
response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)

Here ist the text:"""
+ pdf_text,
},
],
model="gpt-4o-mini",
response_format={"type": "json_object"},
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)

print(response.choices[0].message.content)

@@ -2,7 +2,7 @@ import spacy
from spacy.tokens import DocBin
from training_data import TRAINING_DATA

nlp = spacy.blank("de")
nlp = spacy.blank("de")
doc_bin = DocBin()

for text, annotations in TRAINING_DATA:

@@ -17,4 +17,4 @@ for text, annotations in TRAINING_DATA:
doc.ents = ents
doc_bin.add(doc)

doc_bin.to_disk("data/train.spacy")
doc_bin.to_disk("data/train.spacy")

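Editor's note: the training corpus is serialized with DocBin; a short sketch of reading data/train.spacy back for inspection, using the standard spaCy API (not part of the commit):

# Hypothetical inspection sketch using the standard spaCy API.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("de")
docs = list(DocBin().from_disk("data/train.spacy").get_docs(nlp.vocab))
print(len(docs), [(ent.text, ent.label_) for ent in docs[0].ents])
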
@@ -15,13 +15,11 @@ for page_number in range(len(doc)):
text = page.get_text()
spacy_doc = nlp(text)
for ent in spacy_doc.ents:
results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number + 1
})
results.append(
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number + 1}
)

with open("entities_output.json", "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

print("✅ Extraction completed. Results saved to 'entities_output.json'")
print("✅ Extraction completed. Results saved to 'entities_output.json'")

@@ -71,33 +71,33 @@ TRAINING_DATA = [
"core, core+, value-added",
{"entities": [[0, 24, "RISIKOPROFIL"]]},
),
(
"Manage to Core: max 20%",
{"entities": [[10, 14, "RISIKOPROFIL"]]},
),
(
"Benefits of the core/ core+ segment",
{"entities": [[16, 27, "RISIKOPROFIL"]]},
),
(
"Drawbacks of the core/ core+ segment",
{"entities": [[17, 28, "RISIKOPROFIL"]]},
),
(
"Why a Core / Core + investment program?",
{"entities": [[6, 19, "RISIKOPROFIL"]]},
),
(
"Different risk profile (core, core+, value-added)",
{"entities": [[24, 48, "RISIKOPROFIL"]]},
),
(
(
"Manage to Core: max 20%",
{"entities": [[10, 14, "RISIKOPROFIL"]]},
),
(
"Benefits of the core/ core+ segment",
{"entities": [[16, 27, "RISIKOPROFIL"]]},
),
(
"Drawbacks of the core/ core+ segment",
{"entities": [[17, 28, "RISIKOPROFIL"]]},
),
(
"Why a Core / Core + investment program?",
{"entities": [[6, 19, "RISIKOPROFIL"]]},
),
(
"Different risk profile (core, core+, value-added)",
{"entities": [[24, 48, "RISIKOPROFIL"]]},
),
(
"INK MGallery Hotel Area: Amsterdam Core Tenant: Closed in 2018",
{"entities": [[35, 39, "RISIKOPROFIL"]]},
),
(
"A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.",
{"entities": [[34, 48, "RISIKOPROFIL"]]},
),
(
"A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.",
{"entities": [[34, 48, "RISIKOPROFIL"]]},
),
(
"Navigate the diversity of the Core/Core+ investment opportunities in European Prime Cities",

@@ -226,9 +226,5 @@ TRAINING_DATA = [
(
"Zielrendite 5,00-5,25 % Ausschüttungsrendite 1) Ankauf von Objekten an Tag eins mit 100% Eigenkapital. Die Strategie unterstellt die Aufnahme von Fremdkapital, sobald sich die Zins- und Finanzierungskonditionen nachhaltig stabilisieren. Strategie - Übersicht Risikoprofil Core+",
{"entities": [[12, 23, "Ausschüttungsrendite"], [272, 277, "Risikoprofil"]]},
)
),
]

@@ -22,10 +22,14 @@ for text, annot in tqdm(TRAINING_DATA):
for start, end, label in annot["entities"]:
span = doc.char_span(start, end, label=label, alignment_mode="contract")
if span is None:
print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
print(
f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
)
else:
ents.append(span)
print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
print(
f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}"
)
# label the text with the ents
doc.ents = ents
db.add(doc)

@@ -87,9 +87,9 @@ class Server:
server_params = StdioServerParameters(
command=command,
args=self.config["args"],
env={**os.environ, **self.config["env"]}
if self.config.get("env")
else None,
env=(
{**os.environ, **self.config["env"]} if self.config.get("env") else None
),
)
try:
stdio_transport = await self.exit_stack.enter_async_context(

@@ -244,28 +244,23 @@ class LLMClient:
formatted_messages = []
for msg in messages:
print(msg)
formatted_messages.append({
"role": msg["role"],
"content": msg["content"]
})
formatted_messages.append({"role": msg["role"], "content": msg["content"]})

client = AzureOpenAI(
api_key=self.api_key,
api_version="2023-07-01-preview",
base_url=url
api_key=self.api_key, api_version="2023-07-01-preview", base_url=url
)
response = client.chat.completions.create(
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
messages=formatted_messages,
model="gpt-4o-mini",
# response_format={"type": "json_object"}
# temperature=0.7,
# top_p=0.95,
# frequency_penalty=0,
# presence_penalty=0,
# max_tokens=800,
# stop="",
# stream=False
)
if response.choices[0].message.content:
print("response: " + response.choices[0].message.content)
return response.choices[0].message.content

@@ -1,5 +1,6 @@
# server.py
from mcp.server.fastmcp import FastMCP

# Create an MCP server
mcp = FastMCP("Demo")

@@ -1,3 +1,3 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
MODEL_ID = "gpt-35-turbo"
MODEL_ID = "gpt-35-turbo"

@@ -9,51 +9,59 @@ SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"
OUTPUT_PATH = "mcp_spacy_validated_result.json"

def load_spacy_entities():
with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
return json.load(f)

def load_pitchbook_pages():
with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
return json.load(f)

def get_page_text(pages, page_number):
for page in pages:
if page.get("page") == page_number:
return page.get("text", "")
return ""

def normalize_entity(entity):
return ' '.join(entity.replace('\n', ' ').split())
return " ".join(entity.replace("\n", " ").split())

def validate_entity_with_exxeta(entity, page_num, text):
prompt = (
f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
f"Ziel-Formulierung:\n"
f"\"{entity}\"\n\n"
f'"{entity}"\n\n'
f"Validierungsregeln:\n"
f"- Groß- und Kleinschreibung ignorieren.\n"
f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
f'- Antworte **ausschließlich** mit "true" (Treffer) oder "false" (kein Treffer).\n'
f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
f"OCR-Text auf Seite {page_num}:\n{text}"
)

headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}

payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}

url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

@@ -67,6 +75,7 @@ def validate_entity_with_exxeta(entity, page_num, text):
print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
return False

def run():
spacy_entities = load_spacy_entities()
pitchbook_pages = load_pitchbook_pages()

@@ -81,17 +90,20 @@ def run():
page_text = get_page_text(pitchbook_pages, page)
is_valid = validate_entity_with_exxeta(entity, page, page_text)

validated_results.append({
"label": entity_data.get("label"),
"entity": raw_entity,
"page": page,
"validated": is_valid
})
validated_results.append(
{
"label": entity_data.get("label"),
"entity": raw_entity,
"page": page,
"validated": is_valid,
}
)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
json.dump(validated_results, f, indent=2, ensure_ascii=False)

print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")

if __name__ == "__main__":
run()
run()

@@ -10,19 +10,23 @@ KPI_SERVICE_MAP = {
SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json"
EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json"

def load_spacy_entities(path):
with open(path, "r", encoding="utf-8") as f:
return json.load(f)

def load_exxeta_entities(path):
with open(path, "r", encoding="utf-8") as f:
return json.load(f)

def normalize(text):
if not text:
return ""
return text.strip().lower().replace(" ", "").replace("/", "/")

def validate_kpi(kpi, spacy_entities, exxeta_entities):
results = []

@@ -50,39 +54,47 @@ def validate_kpi(kpi, spacy_entities, exxeta_entities):
for ee in exxeta_entries:
ee_entity = normalize(ee["entity"])
if se_entity == ee_entity:
results.append({
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "validated"
})
results.append(
{
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "validated",
}
)
matched = True
break

if not matched:
results.append({
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "spacy-only"
})
results.append(
{
"kpi": kpi,
"entity": se["entity"],
"page": page,
"validation_status": "spacy-only",
}
)

for ee in exxeta_entries:
ee_entity = normalize(ee["entity"])
if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries):
results.append({
"kpi": kpi,
"entity": ee["entity"],
"page": page,
"validation_status": "exxeta-only"
})
results.append(
{
"kpi": kpi,
"entity": ee["entity"],
"page": page,
"validation_status": "exxeta-only",
}
)

return results

def save_results(results, filename):
with open(filename, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

def run():
spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH)
exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH)

@@ -96,5 +108,6 @@ def run():
save_results(all_results, "mcp_validated_result.json")
print("✅ Validation complete! Output: mcp_validated_result.json")

if __name__ == "__main__":
run()

@@ -1,3 +1,3 @@
EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0"
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
MODEL_ID = "gpt-35-turbo"
MODEL_ID = "gpt-35-turbo"

@@ -4,6 +4,7 @@ import json

MODEL = "gpt-35-turbo"

def extract_risikoprofil_from_exxeta(pages_json):
results = []

@@ -15,34 +16,36 @@ def extract_risikoprofil_from_exxeta(pages_json):
continue

prompt = (
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
"Nur Begriffe wie \"Core\", \"Core+\", \"Core/Core+\", \"Value-added\" oder \"Opportunistisch\" sowie sehr ähnliche Varianten extrahieren.\n\n"
"Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (\",\"), Schrägstriche (\"/\") oder Wörter wie \"und\" verbunden, "
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
"- \"Core, Core+\" → entity: \"Core, Core+\"\n"
"- \"Core/Core+\" → entity: \"Core/Core+\"\n"
"- \"Core and Core+\" → entity: \"Core and Core+\"\n\n"
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f"[{{\"label\": \"RISIKOPROFIL\", \"entity\": \"Core, Core+\", \"page\": {page_num}}}]\n\n"
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
"Bitte extrahiere alle relevanten Nennungen von Risikoprofilen eines Fonds aus folgendem Pitchbook-Text.\n"
'Nur Begriffe wie "Core", "Core+", "Core/Core+", "Value-added" oder "Opportunistisch" sowie sehr ähnliche Varianten extrahieren.\n\n'
'Wenn mehrere dieser Begriffe direkt zusammen genannt werden, egal ob durch Kommas (","), Schrägstriche ("/") oder Wörter wie "und" verbunden, '
"bitte sie als **eine gemeinsame Entität** extrahieren, im Originalformat.\n\n"
"Beispiele:\n"
'- "Core, Core+" → entity: "Core, Core+"\n'
'- "Core/Core+" → entity: "Core/Core+"\n'
'- "Core and Core+" → entity: "Core and Core+"\n\n'
"Gib die Ergebnisse als reines JSON-Array im folgenden Format aus:\n"
f'[{{"label": "RISIKOPROFIL", "entity": "Core, Core+", "page": {page_num}}}]\n\n'
"Falls keine Risikoprofile vorhanden sind, gib ein leeres Array [] zurück.\n\n"
"Nur JSON-Antwort ohne Kommentare oder zusätzlichen Text.\n\n"
"TEXT:\n" + text
)

headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {EXXETA_API_KEY}"
"Authorization": f"Bearer {EXXETA_API_KEY}",
}

payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays.",
},
{"role": "user", "content": prompt},
],
"temperature": 0.0
"temperature": 0.0,
}

url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

@@ -71,4 +74,4 @@ def extract_risikoprofil_from_exxeta(pages_json):
except Exception as e:
print(f"⚠️ Failed on page {page_num} (attempt {attempt+1}): {e}")

return results
return results

@@ -1,10 +1,11 @@
def normalize_entity(entity_str):
if not entity_str:
return ""
normalized = entity_str.replace('\n', ' ')
normalized = ''.join(normalized.lower().split())
normalized = entity_str.replace("\n", " ")
normalized = "".join(normalized.lower().split())
return normalized

def merge_and_validate_entities(spacy_data, exxeta_data):
merged = []
seen = set()

@@ -21,39 +22,47 @@ def merge_and_validate_entities(spacy_data, exxeta_data):
e_page = e["page"]

# Match if normalized entity and page match
if (s["label"] == e["label"] and
s_entity_norm == e_entity_norm and
s_page == e_page):
if (
s["label"] == e["label"]
and s_entity_norm == e_entity_norm
and s_page == e_page
):

merged.append({
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "validated"
})
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "validated",
}
)
seen.add((e["entity"], e_page))
found = True
break

# If no match found, add as single-source
if not found:
merged.append({
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "single-source",
"source": "spacy"
})
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "single-source",
"source": "spacy",
}
)

# Add remaining Exxeta entities not already processed
for e in exxeta_data:
if (e["entity"], e["page"]) not in seen:
merged.append({
"label": e["label"],
"entity": e["entity"],
"page": e["page"],
"status": "single-source",
"source": "exxeta"
})
merged.append(
{
"label": e["label"],
"entity": e["entity"],
"page": e["page"],
"status": "single-source",
"source": "exxeta",
}
)

return merged
return merged

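Editor's note: normalize_entity strips newlines, whitespace and case before comparison; a quick sketch of the effect with illustrative values (not part of the commit):

# Illustrative only.
print(normalize_entity("Core /\nCore+"))   # -> "core/core+"
print(normalize_entity("core/core+"))      # -> "core/core+"
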
@@ -7,18 +7,22 @@ from merge_logic import merge_and_validate_entities
SPACY_PATH = "../fine_tuning_spaCy/entities_output.json"
PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json"

def load_pitchbook_pages():
path = Path(PITCHBOOK_PATH)
with open(path, "r", encoding="utf-8") as f:
return json.load(f)

def save_json(data, filename):
with open(filename, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)

def sort_by_page_number(entities):
return sorted(entities, key=lambda x: x.get("page", 0))

def run():
spacy_entities = load_spacy_entities(SPACY_PATH)
pitchbook_pages = load_pitchbook_pages()

@@ -33,5 +37,6 @@ def run():
print("- merged_result.json")
print(f"- Total entities in merged result: {len(merged_sorted)}")

if __name__ == "__main__":
run()
run()

@@ -1,7 +1,8 @@
import json
from pathlib import Path

def load_spacy_entities(path):
path = Path(path)
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
return json.load(f)

@@ -11,15 +11,20 @@ log_folder = Path("logs")
for folder in [output_folder, log_folder]:
folder.mkdir(parents=True, exist_ok=True)

def extract_text_to_json(pdf_path: Path):
json_path = output_folder / f"{pdf_path.stem}.json"
with pdfplumber.open(pdf_path) as pdf:
pages = [{"page": i + 1, "text": (page.extract_text() or "").strip()} for i, page in enumerate(pdf.pages)]
pages = [
{"page": i + 1, "text": (page.extract_text() or "").strip()}
for i, page in enumerate(pdf.pages)
]

with open(json_path, "w", encoding="utf-8") as f:
json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"📄 Text JSON saved: {json_path.name}")

def ocr_pdf(input_file: Path):
output_file = output_folder / f"{input_file.stem}-OCR.pdf"
log_file = log_folder / f"{input_file.stem}.log"

@@ -28,11 +33,14 @@ def ocr_pdf(input_file: Path):
cmd = [
"ocrmypdf",
"--force-ocr",
"--output-type", "pdfa",
"--language", "deu+eng",
"--sidecar", str(sidecar_txt),
"--output-type",
"pdfa",
"--language",
"deu+eng",
"--sidecar",
str(sidecar_txt),
str(input_file),
str(output_file)
str(output_file),
]

with open(log_file, "w") as log:

@@ -44,6 +52,7 @@ def ocr_pdf(input_file: Path):
else:
print(f"❌ OCR failed. See log: {log_file}")

if __name__ == "__main__":
if not input_folder.exists():
print("Input folder does not exist!")

@@ -54,4 +63,4 @@ if __name__ == "__main__":
else:
for pdf in pdfs:
print(f"Processing: {pdf.name}")
ocr_pdf(pdf)
ocr_pdf(pdf)

@@ -1,4 +1,4 @@
import pdfplumber
import pdfplumber

pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf"

@@ -10,7 +10,7 @@ pdf_path = "\pse2_ff\pitch-books\Teaser 2 FINAL.pdf"

# # Print the extracted text with preserved structure
# print(f"Page {page.page_number}:\n{page_text}\n")
with pdfplumber.open(pdf_path) as pdf:
with pdfplumber.open(pdf_path) as pdf:

for i, page in enumerate(pdf.pages):
tables = page.extract_tables()

@@ -1,6 +1,6 @@
# https://github.com/explosion/spacy-layout
### Run with: python extract_pitchbooks.py
import spacy
import spacy
from spacy_layout import spaCyLayout
from pathlib import Path
import pandas as pd

@@ -34,14 +34,14 @@ for ent in doc_ner.ents:
break

if ent.text.strip():
ner_text_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number
})
ner_text_results.append(
{"label": ent.label_, "entity": ent.text.strip(), "page": page_number}
)

print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
(output_dir / "ner_text.json").write_text(
json.dumps(ner_text_results, indent=2, ensure_ascii=False)
)

# 2. NER on table cells
table_ner_results = []

@@ -62,14 +62,18 @@ for i, table in enumerate(doc._.tables, 1):
doc_cell = nlp(cell)
for ent in doc_cell.ents:
if ent.text.strip():
table_ner_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number,
"table": i
})
table_ner_results.append(
{
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number,
"table": i,
}
)

print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
(output_dir / "ner_tables.json").write_text(
json.dumps(table_ner_results, indent=2, ensure_ascii=False)
)

print("✅ Done! Extracted data saved to /output")
print("✅ Done! Extracted data saved to /output")