Test exxeta gpt
parent
5945122fb0
commit
7b6a19bbc3
|
|
@ -0,0 +1,90 @@
|
|||
from openai import AzureOpenAI
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import pymupdf
|
||||
|
||||
# Load variables from a local .env file into the process environment
# before any of them are read below.
load_dotenv()

# Endpoint the client sends its requests to.
BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
# Credential taken from the environment; None when API_KEY is not set.
API_KEY = os.environ.get("API_KEY")

# Module-level client used for the chat-completion request further down.
client = AzureOpenAI(
    base_url=BASE_URL,
    api_version="2023-07-01-preview",
    api_key=API_KEY,
)
|
||||
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file using PyMuPDF.

    While extracting, the page count and the raw text of each page are
    printed to stdout.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A single string holding all pages in order. Each page contributes
        a "[Page N]" header line (N is 1-based), the page text, and a
        trailing blank line. A document without pages yields "".
    """
    # Open the document in a context manager so it is closed even when
    # text extraction raises; previously the handle was never released.
    with pymupdf.open(file_path) as doc:
        print(f"PDF has {len(doc)} pages")

        sections = []
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()

            # Echo the page content for manual inspection.
            print(text)

            # The page marker lets the model report where a value was found.
            sections.append(f"[Page {page_number}]\n{text}\n\n")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(sections)
|
||||
|
||||
|
||||
# PDF to analyse; a relative path, so it is resolved against the current
# working directory of the process.
file_path = "../../pitch-books/Pitchbook 1.pdf"
pdf_text = extract_text_from_pdf(file_path)

# Instruction block sent ahead of the document text. It lists the fields
# to extract and the per-field details the answer must contain.
extraction_instructions = """extract the values from the text. let not found values empty:
-Fondsname
-Fondsmanager
-Name Kapitalverwaltungsgesellschaft
-Datum
-Risikoprofil
-Artikel gem. SFDR
-Ziel
-Zielrendite über die Fondslaufzeit
-Rendite seit Auflage
-Zielausschüttungsrendite über die Fondslaufzeit
-Ausschüttungsrendite seit Auflage
-Laufzeit
-LTV (Loan-to-Value)
-Soll/Ist
-Ziel
-Managementgebühren Bezogen auf NAV (Net Asset Value)
-Sektorenallokation
-Länderallokation
for each value return:
- the Key
- the Value
- the page where this value was found
- a confidence score, how confident the model is about the value (low, medium, high)

Here ist the text:"""

system_message = {
    "role": "system",
    "content": "Always respond with a valid JSON object",
}
user_message = {
    "role": "user",
    # The extracted PDF text is appended directly after the instructions.
    "content": extraction_instructions + pdf_text,
}

# Only model, messages and response_format are passed; sampling options
# (temperature, top_p, penalties, max_tokens, stop, stream) are not set.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[system_message, user_message],
    response_format={"type": "json_object"},
)

# Print the content of the first returned choice.
print(response.choices[0].message.content)
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
acres==0.3.0
|
||||
annotated-types==0.7.0
|
||||
anyio==4.9.0
|
||||
certifi==2025.1.31
|
||||
charset-normalizer==3.4.1
|
||||
ci-info==0.3.0
|
||||
click==8.1.8
|
||||
configobj==5.0.9
|
||||
configparser==7.2.0
|
||||
distro==1.9.0
|
||||
etelemetry==0.3.1
|
||||
filelock==3.18.0
|
||||
h11==0.14.0
|
||||
httpcore==1.0.8
|
||||
httplib2==0.22.0
|
||||
httpx==0.28.1
|
||||
idna==3.10
|
||||
isodate==0.6.1
|
||||
jiter==0.9.0
|
||||
looseversion==1.3.0
|
||||
lxml==5.4.0
|
||||
networkx==3.4.2
|
||||
nibabel==5.3.2
|
||||
nipype==1.10.0
|
||||
numpy==2.2.5
|
||||
openai==1.75.0
|
||||
packaging==25.0
|
||||
pandas==2.2.3
|
||||
pathlib==1.0.1
|
||||
prov==2.0.1
|
||||
puremagic==1.28
|
||||
pydantic==2.11.3
|
||||
pydantic_core==2.33.1
|
||||
pydot==3.0.4
|
||||
PyMuPDF==1.25.5
|
||||
pyparsing==3.2.3
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.1.0
|
||||
pytz==2025.2
|
||||
pyxnat==1.6.3
|
||||
rdflib==6.3.2
|
||||
requests==2.32.3
|
||||
scipy==1.15.2
|
||||
simplejson==3.20.1
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
tqdm==4.67.1
|
||||
traits==7.0.2
|
||||
typing-inspection==0.4.0
|
||||
typing_extensions==4.13.2
|
||||
tzdata==2025.2
|
||||
urllib3==2.4.0
|
||||
Loading…
Reference in New Issue