# pse2_ff/project/backend/ocr-service/app.py

from flask import Flask, request
from ocr_runner import pdf_to_json, ocr_pdf
import requests
import os
import tempfile
from pathlib import Path
import logging
import threading


# Logging: configure the root logger at INFO and log through a module logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Endpoints of the collaborating services. Each one is read from its
# environment variable and falls back to a localhost default when unset.
EXXETA_URL = os.environ.get("EXXETA_SERVICE_URL", "http://localhost:5053/extract")
SPACY_URL = os.environ.get("SPACY_SERVICE_URL", "http://localhost:5052/extract")
COORDINATOR_URL = os.environ.get("COORDINATOR_URL", "http://localhost:5050")


def _remove_quietly(path):
    """Delete *path* if it is set; log instead of raising when that fails."""
    if not path:
        return
    try:
        path.unlink()
    except FileNotFoundError:
        # Nothing to clean up: the file was never written or is already gone.
        pass
    except OSError as e:
        logger.warning(f"Could not remove temporary file {path}: {e}")


def convert_pdf_async(temp_path: Path, pitchbook_id):
    """Run OCR on an uploaded PDF and hand the results to the other services.

    Intended to run in a background thread. The steps are:

    1. ``ocr_pdf(temp_path)`` produces the OCR'd PDF.
    2. ``pdf_to_json`` extracts the text; its ``"pages"`` entry becomes the
       payload's ``extracted_text_per_page``.
    3. Progress (35) is reported to the coordinator.
    4. The payload is posted to the EXXETA and SPACY services.
    5. The OCR'd PDF is uploaded to the coordinator.

    Every HTTP call is best-effort: a failure is logged and the remaining
    steps still run. Both temporary files are removed when the function
    exits, whether it succeeded or not.

    Args:
        temp_path: Path of the uploaded PDF on disk. It is deleted on exit.
        pitchbook_id: Identifier of the pitch book. It must be convertible
            with ``int()`` for the extraction payload.

    Returns:
        ``({"error": ...}, 500)`` when OCR produced no file, otherwise
        ``None``. When used as a thread target the value is discarded, so
        the failure is logged as well.
    """
    ocr_path = None
    try:
        logger.info("Starting OCR process...")

        ocr_path = ocr_pdf(temp_path)

        if not ocr_path or not ocr_path.exists():
            logger.error("OCR processing failed - no OCR output was produced")
            return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500

        with open(ocr_path, 'rb') as ocr_file:
            result = pdf_to_json(ocr_file)

        payload = {
            "id": int(pitchbook_id),
            "extracted_text_per_page": result["pages"]
        }

        logger.info("Sending payload to EXXETA and SPACY services")

        # Progress reporting is informational only, so it must neither hang
        # the worker (hence the timeout) nor abort the extraction if it fails.
        try:
            requests.post(
                COORDINATOR_URL + "/api/progress",
                json={"id": pitchbook_id, "progress": 35},
                timeout=30,
            )
        except Exception as e:
            logger.error(f"Error reporting progress to COORDINATOR: {e}")

        for service_name, service_url in (("EXXETA", EXXETA_URL), ("SPACY", SPACY_URL)):
            try:
                response = requests.post(service_url, json=payload, timeout=600)
                logger.info(f"{service_name} response: {response.status_code}")
            except Exception as e:
                logger.error(f"Error calling {service_name}: {e}")

        try:
            # Keep the handle inside a context manager so it is closed before
            # the file is deleted in the ``finally`` clause below.
            with open(ocr_path, 'rb') as ocr_upload:
                files = [
                    ('file', ('', ocr_upload, 'application/pdf'))
                ]
                requests.put(
                    f"{COORDINATOR_URL}/api/pitch_book/{pitchbook_id}",
                    files=files,
                    timeout=600,
                )
            logger.info("COORDINATOR response: Progress + File updated")
        except Exception as e:
            logger.error(f"Error calling COORDINATOR: {e}")
    except Exception as e:
        logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
    finally:
        # Clean up on every exit path so failed jobs do not leave files behind.
        _remove_quietly(ocr_path)
        _remove_quietly(temp_path)


@app.route('/ocr', methods=['POST'])
def convert_extract_text_from_pdf():
    """Accept a PDF upload and start OCR processing in a background thread.

    Expects a multipart form with a ``file`` part and an ``id`` field. The
    upload is copied to a temporary ``.pdf`` file, which is handed to
    ``convert_pdf_async`` together with the id. The response is sent as soon
    as the worker thread has been started; it does not wait for OCR to finish.

    Returns:
        A ``(body, status)`` tuple: 400 with an ``error`` entry when the file
        or id is missing or the id is not an integer, otherwise 200.
    """
    if "file" not in request.files:
        return {"error": "No file"}, 400

    upload = request.files["file"]
    pitchbook_id = request.form.get("id")

    logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")

    if not pitchbook_id:
        return {"error": "No ID"}, 400

    # The worker converts the id with int(); reject bad ids here so the
    # client gets a 400 instead of a job that fails silently after OCR.
    try:
        int(pitchbook_id)
    except ValueError:
        return {"error": "Invalid ID"}, 400

    # delete=False: the file must outlive this request, the worker removes it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        upload.seek(0)
        # Stream the upload in chunks rather than loading it all into memory.
        upload.save(temp_file)
        temp_path = Path(temp_file.name)

    worker = threading.Thread(target=convert_pdf_async, args=(temp_path, pitchbook_id))
    worker.start()

    return {
        "status": "sent",
        "message": "PDF successfully OCR'd and processed"
    }, 200


if __name__ == "__main__":
    # One variable feeds both the log line and app.run so they cannot drift
    # apart (the message used to announce port 5000 while binding 5051).
    port = 5051
    logger.info(f"Starting OCR service on port {port}")
    # NOTE(review): debug=True enables the interactive Werkzeug debugger and
    # host 0.0.0.0 listens on all interfaces - confirm this entry point is
    # used for local development only.
    app.run(host="0.0.0.0", port=port, debug=True)