"""Prototype: extract fund key figures from a pitch-book PDF via Azure OpenAI.

The script reads the PDF text page by page with PyMuPDF, tags every page with
a ``[Page N]`` marker, and asks a chat model to return the requested fields
as a JSON object.
"""

import os

import pymupdf
from dotenv import load_dotenv
from openai import AzureOpenAI

# Load variables from a local .env file so API_KEY can be read from the
# environment below.
load_dotenv()

BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
API_KEY = os.getenv("API_KEY")

client = AzureOpenAI(
    api_key=API_KEY,
    api_version="2023-07-01-preview",
    base_url=BASE_URL,
)


def extract_text_from_pdf(file_path) -> str:
    """Extract the text content of a PDF file using PyMuPDF.

    The page count and the raw text of every page are printed to stdout
    while the document is read.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order. Each page is
        emitted as ``"[Page N]\\n<page text>\\n\\n"`` with ``N`` starting at 1,
        so later consumers can tell which page a value came from.
    """
    page_blocks = []

    # The context manager closes the document even if text extraction fails;
    # the previous version never closed it.
    with pymupdf.open(file_path) as doc:
        print(f"PDF has {len(doc)} pages")

        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()

            # Echo the raw page text (debug output of the prototype).
            print(text)

            page_blocks.append(f"[Page {page_number}]\n{text}\n\n")

    # Join once instead of growing a string with += on every page.
    return "".join(page_blocks)


file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)

# The prompt below is runtime text sent to the model; the field names are the
# German labels used in the pitch books and are kept as-is.
# NOTE(review): "-Ziel" is listed twice in the field list - confirm whether the
# second occurrence is meant to belong to "LTV (Loan-to-Value)".
response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "Always respond with a valid JSON object",
        },
        {
            "role": "user",
            "content": """extract the values from the text. let not found values empty:
            -Fondsname
            -Fondsmanager
            -Name Kapitalverwaltungsgesellschaft
            -Datum
            -Risikoprofil
            -Artikel gem. SFDR
            -Ziel
            -Zielrendite über die Fondslaufzeit
            -Rendite seit Auflage
            -Zielausschüttungsrendite über die Fondslaufzeit
            -Ausschüttungsrendite seit Auflage
            -Laufzeit
            -LTV (Loan-to-Value)
            -Soll/Ist
            -Ziel
            -Managementgebühren Bezogen auf NAV (Net Asset Value)
            -Sektorenallokation
            -Länderallokation
            for each value return:
            - the Key
            - the Value
            - the page where this value was found
            - a confidence score, how confident the model is about the value (low, medium, high)

            Here ist the text:""" + pdf_text,
        },
    ],
    model="gpt-4o-mini",
    # JSON mode: the response content is constrained to a JSON object.
    response_format={"type": "json_object"},
    # Sampling options (temperature, top_p, penalties, max_tokens, stop,
    # stream) are intentionally not set, so the service defaults apply.
)

print(response.choices[0].message.content)