Fixed json format

pull/51/head
s8613 2025-06-03 21:17:30 +02:00
parent 5f8580d1da
commit 1165bbbf08
6 changed files with 16 additions and 18 deletions

View File

@ -25,7 +25,7 @@ def extract_text_from_ocr_json():
} }
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
return jsonify({"Sent to validate-service"}), 200 return jsonify("Sent to validate-service"), 200
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -3,11 +3,10 @@ import json
import os import os
import time import time
import logging import logging
from dotenv import load_dotenv from dotenv import load_dotenv
MODEL = "gpt-4o-mini" MODEL = "gpt-4o-mini"
EXXETA_BASE_URL= "https://ai.exxeta.com/api/v2/azure/openai" EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
load_dotenv() load_dotenv()
EXXETA_API_KEY = os.getenv("API_KEY") EXXETA_API_KEY = os.getenv("API_KEY")
@ -19,13 +18,15 @@ logger = logging.getLogger(__name__)
def extract_with_exxeta(pages_json): def extract_with_exxeta(pages_json):
results = [] results = []
if not EXXETA_API_KEY: if not EXXETA_API_KEY:
logger.warning("EXXETA_API_KEY nicht gesetzt. Rückgabe eines leeren JSON.") logger.warning("EXXETA_API_KEY nicht gesetzt. Rückgabe eines leeren JSON.")
return json.dumps(results, indent=2, ensure_ascii=False) return json.dumps(results, indent=2, ensure_ascii=False)
for page_data in pages_json: for page_data in pages_json:
page_num = page_data.get("page") page_num = page_data.get("page")
text = page_data.get("text", "").strip() page_data.get("page")
text = page_data.get("text", "")
if not text: if not text:
continue continue
@ -105,7 +106,7 @@ def extract_with_exxeta(pages_json):
payload = { payload = {
"model": MODEL, "model": MODEL,
"messages": [ "messages": [
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."}, {"role": "system", "content": "Du bist ein Finanzanalyst. Antworte ausschließlich mit einem validen JSON-Array."},
{"role": "user", "content": prompt} {"role": "user", "content": prompt}
], ],
"temperature": 0.0 "temperature": 0.0
@ -143,5 +144,4 @@ def extract_with_exxeta(pages_json):
if attempt == MAX_RETRIES: if attempt == MAX_RETRIES:
results.extend([]) results.extend([])
json_result = json.dumps(results, indent=2, ensure_ascii=False) return json.dumps(results, indent=2, ensure_ascii=False)
return json_result

View File

@ -16,7 +16,7 @@ def convert_extract_text_from_pdf():
return {"error": "No file"}, 400 return {"error": "No file"}, 400
file = request.files["file"] file = request.files["file"]
pitchbook_id = request.form.get("id") pitchbook_id = request.form.get("pitchbook_id")
if not pitchbook_id: if not pitchbook_id:
return {"error": "No ID"}, 400 return {"error": "No ID"}, 400

View File

@ -25,7 +25,7 @@ def extract_pdf():
} }
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
return jsonify({"Sent to validate-service"}), 200 return jsonify("Sent to validate-service"), 200
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -6,15 +6,14 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "spacy_training/output/model-last") model_path = os.path.join(current_dir, "spacy_training/output/model-last")
nlp = spacy.load(model_path) nlp = spacy.load(model_path)
def extract(pages_json): def extract(pages_json):
results = [] results = []
for page in pages_json: for page in pages_json:
text = page.get("text", "").strip() text = page.get("text", "")
page_num = page.get("page") text = text.strip()
page_num = page.get("page")
if not text: if not text:
continue continue
@ -26,5 +25,4 @@ def extract(pages_json):
"page": page_num "page": page_num
}) })
json_result = json.dumps(results, indent=2, ensure_ascii=False) return json.dumps(results, indent=2, ensure_ascii=False)
return json_result

View File

@ -29,9 +29,9 @@ services:
condition: service_healthy condition: service_healthy
healthcheck: healthcheck:
test: wget --spider --no-verbose http://127.0.0.1:5000/health || exit 1 test: wget --spider --no-verbose http://127.0.0.1:5000/health || exit 1
interval: 10s interval: 20s
timeout: 5s timeout: 10s
retries: 5 retries: 10
ports: ports:
- 5050:5000 - 5050:5000