From af75439270f783083b729eb31a04b99c0fd86a46 Mon Sep 17 00:00:00 2001
From: s8613 <s86136@bht-berlin.de>
Date: Tue, 3 Jun 2025 22:34:07 +0200
Subject: [PATCH] Return base64-encoded OCR'd PDF in response; return 500 when OCR fails

---
 project/backend/ocr-service/app.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/project/backend/ocr-service/app.py b/project/backend/ocr-service/app.py
index ac21052..b590ffe 100644
--- a/project/backend/ocr-service/app.py
+++ b/project/backend/ocr-service/app.py
@@ -3,6 +3,7 @@ from ocr_runner import pdf_to_json, ocr_pdf
 import requests
 import os
 import tempfile
+import base64
 from pathlib import Path
 
 app = Flask(__name__)
@@ -25,17 +26,22 @@ def convert_extract_text_from_pdf():
         file.seek(0)
         temp_file.write(file.read())
         temp_path = Path(temp_file.name)
-
     ocr_path = ocr_pdf(temp_path)
-    if ocr_path and ocr_path.exists():
-        with open(ocr_path, 'rb') as ocr_file:
-            result = pdf_to_json(ocr_file)
-        ocr_path.unlink()  # cleanup
-    else:
-        file.seek(0)
-        result = pdf_to_json(file)
 
-    temp_path.unlink()  # cleanup
+    if not ocr_path or not ocr_path.exists():
+        temp_path.unlink()  # cleanup
+        return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
+
+
+    with open(ocr_path, 'rb') as ocr_file:
+        ocr_pdf_data = ocr_file.read()
+        ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
+
+        ocr_file.seek(0)
+        result = pdf_to_json(ocr_file)
+
+    ocr_path.unlink()
+    temp_path.unlink()
 
     payload = {
         "id": int(pitchbook_id),
@@ -45,7 +51,11 @@ def convert_extract_text_from_pdf():
     requests.post(EXXETA_URL, json=payload, timeout=600)
     requests.post(SPACY_URL, json=payload, timeout=600)
 
-    return {"status": "sent"}, 200
+    return {
+        "status": "sent",
+        "ocr_pdf": ocr_pdf_base64,
+        "message": "PDF successfully OCR'd and processed"
+    }, 200
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=5051)
\ No newline at end of file