"""OCR service.

Accepts a PDF upload on ``POST /ocr``, runs OCR on it in a background thread,
extracts the per-page text and forwards it to the EXXETA and SPACY services,
then uploads the OCR'd PDF to the coordinator.
"""

import logging
import os
import tempfile
import threading
from pathlib import Path

import requests
from flask import Flask, request

from ocr_runner import ocr_pdf, pdf_to_json

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

EXXETA_URL = os.getenv("EXXETA_SERVICE_URL", "http://localhost:5053/extract")
SPACY_URL = os.getenv("SPACY_SERVICE_URL", "http://localhost:5052/extract")
COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5050")

# Timeout (seconds) applied to every outgoing HTTP call so that a hung
# downstream service cannot block a worker thread forever.
REQUEST_TIMEOUT_SECONDS = 600

# Port the development server binds to when the module is run directly.
SERVICE_PORT = 5051


def _remove_file(path):
    """Delete *path* if it is set; never raise on failure."""
    if not path:
        return
    try:
        Path(path).unlink()
    except FileNotFoundError:
        # Already gone (or never created) - nothing to clean up.
        pass
    except OSError as e:
        logger.warning("Could not remove temporary file %s: %s", path, e)


def _report_progress(pitchbook_id, progress):
    """Best-effort progress notification to the coordinator."""
    try:
        requests.post(
            COORDINATOR_URL + "/api/progress",
            json={"id": pitchbook_id, "progress": progress},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except Exception as e:
        # A failed progress ping must not abort the actual processing.
        logger.error("Error calling COORDINATOR progress endpoint: %s", e)


def _post_payload(service_name, url, payload):
    """POST the extracted text to one downstream service, logging the outcome."""
    try:
        response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT_SECONDS)
        logger.info("%s response: %s", service_name, response.status_code)
    except Exception as e:
        # Deliberately best-effort: one failing service must not stop the others.
        logger.error("Error calling %s: %s", service_name, e)


def _upload_ocr_pdf(ocr_path, pitchbook_id):
    """Upload the OCR'd PDF to the coordinator, closing the file afterwards."""
    try:
        # The context manager guarantees the handle is closed before the file
        # is deleted by the caller.
        with open(ocr_path, "rb") as ocr_file:
            requests.put(
                f"{COORDINATOR_URL}/api/pitch_book/{pitchbook_id}",
                files=[("file", ("", ocr_file, "application/pdf"))],
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        logger.info("COORDINATOR response: Progress + File updated")
    except Exception as e:
        logger.error("Error calling COORDINATOR: %s", e)


def convert_pdf_async(temp_path, pitchbook_id):
    """OCR the PDF at *temp_path* and distribute the results.

    Intended to run as a thread target. All failures are logged rather than
    raised, and both the uploaded temp file and the OCR output are removed
    when the function exits, whatever the outcome.

    Args:
        temp_path: ``pathlib.Path`` of the uploaded PDF on local disk.
        pitchbook_id: Identifier of the pitch book; must be convertible to
            ``int`` for the extraction payload.

    Returns:
        ``({"error": ...}, 500)`` when OCR produced no output file, otherwise
        ``None``.
    """
    ocr_path = None
    try:
        # Convert up front so an invalid ID fails before the expensive OCR run.
        numeric_id = int(pitchbook_id)

        logger.info("Starting OCR process...")
        ocr_path = ocr_pdf(temp_path)

        if not ocr_path or not ocr_path.exists():
            logger.error(
                "OCR processing failed for pitchbook_id: %s", pitchbook_id
            )
            return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500

        with open(ocr_path, "rb") as ocr_file:
            result = pdf_to_json(ocr_file)

        payload = {"id": numeric_id, "extracted_text_per_page": result["pages"]}

        logger.info("Sending payload to EXXETA and SPACY services")
        _report_progress(pitchbook_id, 35)
        _post_payload("EXXETA", EXXETA_URL, payload)
        _post_payload("SPACY", SPACY_URL, payload)
        _upload_ocr_pdf(ocr_path, pitchbook_id)
    except Exception as e:
        # Top-level boundary of the worker thread: log with traceback.
        logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
    finally:
        # Always clean up, including on the error paths above.
        _remove_file(ocr_path)
        _remove_file(temp_path)


@app.route("/ocr", methods=["POST"])
def convert_extract_text_from_pdf():
    """Accept a PDF upload and start OCR processing in the background.

    Expects a multipart form with a ``file`` part and an ``id`` field.

    Returns:
        ``400`` with an error body when the file or the ID is missing,
        otherwise ``200`` as soon as the background thread has been started
        (processing itself has not finished at that point).
    """
    if "file" not in request.files:
        return {"error": "No file"}, 400

    upload = request.files["file"]
    pitchbook_id = request.form.get("id")
    logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")

    if not pitchbook_id:
        return {"error": "No ID"}, 400

    # delete=False: the worker thread owns the file and removes it when done.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        upload.seek(0)
        temp_file.write(upload.read())
        temp_path = Path(temp_file.name)

    # Start the worker only after the temp file is closed, so its contents
    # are fully flushed to disk before OCR reads them.
    thread = threading.Thread(target=convert_pdf_async, args=(temp_path, pitchbook_id))
    thread.start()

    return {"status": "sent", "message": "PDF successfully OCR'd and processed"}, 200


if __name__ == "__main__":
    logger.info("Starting OCR service on port %s", SERVICE_PORT)
    # NOTE(review): debug=True enables the Werkzeug interactive debugger, and
    # host 0.0.0.0 exposes it on all interfaces - do not run this entry point
    # in production.
    app.run(host="0.0.0.0", port=SERVICE_PORT, debug=True)