Fixed json format
parent
5f8580d1da
commit
1165bbbf08
|
|
@ -25,7 +25,7 @@ def extract_text_from_ocr_json():
|
||||||
}
|
}
|
||||||
|
|
||||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||||
return jsonify({"Sent to validate-service"}), 200
|
return jsonify("Sent to validate-service"), 200
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,10 @@ import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
MODEL = "gpt-4o-mini"
|
MODEL = "gpt-4o-mini"
|
||||||
EXXETA_BASE_URL= "https://ai.exxeta.com/api/v2/azure/openai"
|
EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai"
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
EXXETA_API_KEY = os.getenv("API_KEY")
|
EXXETA_API_KEY = os.getenv("API_KEY")
|
||||||
|
|
||||||
|
|
@ -19,13 +18,15 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def extract_with_exxeta(pages_json):
|
def extract_with_exxeta(pages_json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
if not EXXETA_API_KEY:
|
if not EXXETA_API_KEY:
|
||||||
logger.warning("EXXETA_API_KEY nicht gesetzt. Rückgabe eines leeren JSON.")
|
logger.warning("EXXETA_API_KEY nicht gesetzt. Rückgabe eines leeren JSON.")
|
||||||
return json.dumps(results, indent=2, ensure_ascii=False)
|
return json.dumps(results, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
for page_data in pages_json:
|
for page_data in pages_json:
|
||||||
page_num = page_data.get("page")
|
page_num = page_data.get("page")
|
||||||
text = page_data.get("text", "").strip()
|
page_data.get("page")
|
||||||
|
text = page_data.get("text", "")
|
||||||
|
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|
@ -105,7 +106,7 @@ def extract_with_exxeta(pages_json):
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein Finanzanalyst, der Fondsprofile auswertet. Antworte nur mit validen JSON-Arrays."},
|
{"role": "system", "content": "Du bist ein Finanzanalyst. Antworte ausschließlich mit einem validen JSON-Array."},
|
||||||
{"role": "user", "content": prompt}
|
{"role": "user", "content": prompt}
|
||||||
],
|
],
|
||||||
"temperature": 0.0
|
"temperature": 0.0
|
||||||
|
|
@ -143,5 +144,4 @@ def extract_with_exxeta(pages_json):
|
||||||
if attempt == MAX_RETRIES:
|
if attempt == MAX_RETRIES:
|
||||||
results.extend([])
|
results.extend([])
|
||||||
|
|
||||||
json_result = json.dumps(results, indent=2, ensure_ascii=False)
|
return json.dumps(results, indent=2, ensure_ascii=False)
|
||||||
return json_result
|
|
||||||
|
|
@ -16,7 +16,7 @@ def convert_extract_text_from_pdf():
|
||||||
return {"error": "No file"}, 400
|
return {"error": "No file"}, 400
|
||||||
|
|
||||||
file = request.files["file"]
|
file = request.files["file"]
|
||||||
pitchbook_id = request.form.get("id")
|
pitchbook_id = request.form.get("pitchbook_id")
|
||||||
|
|
||||||
if not pitchbook_id:
|
if not pitchbook_id:
|
||||||
return {"error": "No ID"}, 400
|
return {"error": "No ID"}, 400
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ def extract_pdf():
|
||||||
}
|
}
|
||||||
|
|
||||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||||
return jsonify({"Sent to validate-service"}), 200
|
return jsonify("Sent to validate-service"), 200
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -6,15 +6,14 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
model_path = os.path.join(current_dir, "spacy_training/output/model-last")
|
model_path = os.path.join(current_dir, "spacy_training/output/model-last")
|
||||||
nlp = spacy.load(model_path)
|
nlp = spacy.load(model_path)
|
||||||
|
|
||||||
|
|
||||||
def extract(pages_json):
|
def extract(pages_json):
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for page in pages_json:
|
for page in pages_json:
|
||||||
text = page.get("text", "").strip()
|
text = page.get("text", "")
|
||||||
page_num = page.get("page")
|
text = text.strip()
|
||||||
|
|
||||||
|
page_num = page.get("page")
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -26,5 +25,4 @@ def extract(pages_json):
|
||||||
"page": page_num
|
"page": page_num
|
||||||
})
|
})
|
||||||
|
|
||||||
json_result = json.dumps(results, indent=2, ensure_ascii=False)
|
return json.dumps(results, indent=2, ensure_ascii=False)
|
||||||
return json_result
|
|
||||||
|
|
@ -29,9 +29,9 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: wget --spider --no-verbose http://127.0.0.1:5000/health || exit 1
|
test: wget --spider --no-verbose http://127.0.0.1:5000/health || exit 1
|
||||||
interval: 10s
|
interval: 20s
|
||||||
timeout: 5s
|
timeout: 10s
|
||||||
retries: 5
|
retries: 10
|
||||||
ports:
|
ports:
|
||||||
- 5050:5000
|
- 5050:5000
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue