"""Flask service exposing a single ``/ocr`` endpoint.

The endpoint takes an uploaded PDF, runs it through ``ocr_pdf``, extracts the
per-page text with ``pdf_to_json`` and posts that text to the two services
configured through ``EXXETA_SERVICE_URL`` and ``SPACY_SERVICE_URL``.
"""

import base64
import os
import tempfile
from pathlib import Path

import requests
from flask import Flask, request, jsonify

from ocr_runner import pdf_to_json, ocr_pdf

app = Flask(__name__)

# Downstream endpoints; overridable through the environment.
EXXETA_URL = os.getenv("EXXETA_SERVICE_URL", "http://localhost:5053/extract")
SPACY_URL = os.getenv("SPACY_SERVICE_URL", "http://localhost:5052/extract")

# Timeout (seconds) applied to each downstream POST.
_DOWNSTREAM_TIMEOUT_SECONDS = 600


def _remove_quietly(path):
    """Delete *path* if it is set; a file that is already gone is not an error."""
    if not path:
        return
    try:
        path.unlink()
    except FileNotFoundError:
        # Already removed (e.g. the OCR output and the upload share one path).
        pass


@app.route('/ocr', methods=['POST'])
def convert_extract_text_from_pdf():
    """OCR an uploaded PDF and forward its per-page text downstream.

    Expects a multipart form with:
      * ``file`` -- the PDF upload.
      * ``id``   -- an integer identifier, forwarded as ``"id"`` in the payload.

    Returns:
        A ``(body, status)`` tuple:
          * 400 with ``{"error": ...}`` when the file or ID is missing, or the
            ID is not an integer.
          * 500 with ``{"error": ...}`` when ``ocr_pdf`` yields no existing file.
          * 200 with the base64-encoded OCR'd PDF under ``"ocr_pdf"``.

    Raises:
        Exceptions from ``ocr_pdf``, ``pdf_to_json`` or ``requests.post`` are
        not caught here and propagate to Flask; the temporary files are still
        removed in that case.
    """
    if "file" not in request.files:
        return {"error": "No file"}, 400

    file = request.files["file"]
    pitchbook_id = request.form.get("id")

    if not pitchbook_id:
        return {"error": "No ID"}, 400

    # Validate the ID before doing any OCR work so a malformed request is
    # rejected as a client error instead of failing after processing.
    try:
        numeric_id = int(pitchbook_id)
    except ValueError:
        return {"error": "ID must be an integer"}, 400

    temp_path = None
    ocr_path = None
    try:
        # delete=False: the file must outlive the `with` so ocr_pdf can open
        # it by path; it is removed in the `finally` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_path = Path(temp_file.name)
            file.seek(0)
            temp_file.write(file.read())

        ocr_path = ocr_pdf(temp_path)

        if not ocr_path or not ocr_path.exists():
            return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500

        with open(ocr_path, 'rb') as ocr_file:
            ocr_pdf_base64 = base64.b64encode(ocr_file.read()).decode('utf-8')

            # Rewind so pdf_to_json reads the document from the start.
            ocr_file.seek(0)
            result = pdf_to_json(ocr_file)
    finally:
        # Runs on success, on the early 500 return and on any exception, so
        # neither the upload copy nor the OCR output is left behind.
        _remove_quietly(ocr_path)
        _remove_quietly(temp_path)

    payload = {
        "id": numeric_id,
        "extracted_text_per_page": result["pages"],
    }

    # NOTE(review): the downstream HTTP status codes are not inspected, so
    # "sent" only means both requests completed without raising -- confirm
    # whether a non-2xx response should be surfaced to the caller.
    requests.post(EXXETA_URL, json=payload, timeout=_DOWNSTREAM_TIMEOUT_SECONDS)
    requests.post(SPACY_URL, json=payload, timeout=_DOWNSTREAM_TIMEOUT_SECONDS)

    return {
        "status": "sent",
        "ocr_pdf": ocr_pdf_base64,
        "message": "PDF successfully OCR'd and processed",
    }, 200


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5051)