Added return of the OCR'd PDF

pull/51/head
s8613 2025-06-03 22:34:07 +02:00
parent f3bee2b62b
commit af75439270
1 changed file with 20 additions and 10 deletions

View File

@@ -3,6 +3,7 @@ from ocr_runner import pdf_to_json, ocr_pdf
import requests
import os
import tempfile
import base64
from pathlib import Path
app = Flask(__name__)
@@ -25,17 +26,22 @@ def convert_extract_text_from_pdf():
file.seek(0)
temp_file.write(file.read())
temp_path = Path(temp_file.name)
ocr_path = ocr_pdf(temp_path)
if ocr_path and ocr_path.exists():
with open(ocr_path, 'rb') as ocr_file:
result = pdf_to_json(ocr_file)
ocr_path.unlink() # cleanup
else:
file.seek(0)
result = pdf_to_json(file)
temp_path.unlink() # cleanup
if not ocr_path or not ocr_path.exists():
temp_path.unlink() # cleanup
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
with open(ocr_path, 'rb') as ocr_file:
ocr_pdf_data = ocr_file.read()
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
ocr_file.seek(0)
result = pdf_to_json(ocr_file)
ocr_path.unlink()
temp_path.unlink()
payload = {
"id": int(pitchbook_id),
@@ -45,7 +51,11 @@ def convert_extract_text_from_pdf():
requests.post(EXXETA_URL, json=payload, timeout=600)
requests.post(SPACY_URL, json=payload, timeout=600)
return {"status": "sent"}, 200
return {
"status": "sent",
"ocr_pdf": ocr_pdf_base64,
"message": "PDF successfully OCR'd and processed"
}, 200
# Start the Flask server only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # host="0.0.0.0" makes the server listen on all network interfaces
    # (reachable from outside the local machine) on port 5051.
    app.run(host="0.0.0.0", port=5051)