From af75439270f783083b729eb31a04b99c0fd86a46 Mon Sep 17 00:00:00 2001 From: s8613 Date: Tue, 3 Jun 2025 22:34:07 +0200 Subject: [PATCH] added return ocrd pdf --- project/backend/ocr-service/app.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/project/backend/ocr-service/app.py b/project/backend/ocr-service/app.py index ac21052..b590ffe 100644 --- a/project/backend/ocr-service/app.py +++ b/project/backend/ocr-service/app.py @@ -3,6 +3,7 @@ from ocr_runner import pdf_to_json, ocr_pdf import requests import os import tempfile +import base64 from pathlib import Path app = Flask(__name__) @@ -25,17 +26,22 @@ def convert_extract_text_from_pdf(): file.seek(0) temp_file.write(file.read()) temp_path = Path(temp_file.name) - ocr_path = ocr_pdf(temp_path) - if ocr_path and ocr_path.exists(): - with open(ocr_path, 'rb') as ocr_file: - result = pdf_to_json(ocr_file) - ocr_path.unlink() # cleanup - else: - file.seek(0) - result = pdf_to_json(file) - temp_path.unlink() # cleanup + if not ocr_path or not ocr_path.exists(): + temp_path.unlink() # cleanup + return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500 + + + with open(ocr_path, 'rb') as ocr_file: + ocr_pdf_data = ocr_file.read() + ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8') + + ocr_file.seek(0) + result = pdf_to_json(ocr_file) + + ocr_path.unlink() + temp_path.unlink() payload = { "id": int(pitchbook_id), @@ -45,7 +51,11 @@ def convert_extract_text_from_pdf(): requests.post(EXXETA_URL, json=payload, timeout=600) requests.post(SPACY_URL, json=payload, timeout=600) - return {"status": "sent"}, 200 + return { + "status": "sent", + "ocr_pdf": ocr_pdf_base64, + "message": "PDF successfully OCR'd and processed" + }, 200 if __name__ == "__main__": app.run(host="0.0.0.0", port=5051) \ No newline at end of file