87 lines
2.7 KiB
Python
87 lines
2.7 KiB
Python
from flask import Flask, request, jsonify
|
|
from ocr_runner import pdf_to_json, ocr_pdf
|
|
import requests
|
|
import os
|
|
import tempfile
|
|
import base64
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
# Configure logging once at import time and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Downstream extraction endpoints. Each may be overridden through the
# environment; the localhost URLs are the fallbacks.
SPACY_URL = os.environ.get("SPACY_SERVICE_URL", "http://localhost:5052/extract")
EXXETA_URL = os.environ.get("EXXETA_SERVICE_URL", "http://localhost:5053/extract")
|
|
|
|
def _remove_file(path):
    """Delete *path* if it is set and exists; never raise.

    Cleanup must not mask the real outcome of a request, so a missing file
    is ignored and any other OS error is only logged.
    """
    if not path:
        return
    try:
        Path(path).unlink()
    except FileNotFoundError:
        # Already gone (e.g. the OCR step returned or consumed the input path).
        pass
    except OSError as exc:
        logger.warning("Could not remove temporary file %s: %s", path, exc)


def _run_ocr(upload):
    """Copy *upload* to a temporary PDF, OCR it and extract its text.

    Args:
        upload: The uploaded file object; it is rewound and read in full.

    Returns:
        ``None`` when ``ocr_pdf`` yields no existing output file, otherwise a
        tuple ``(ocr_pdf_base64, result)`` where ``ocr_pdf_base64`` is the
        base64 text of the OCR'd PDF and ``result`` is whatever
        ``pdf_to_json`` returned for it.

    Both the temporary upload copy and the OCR output are removed before
    returning, whether the work succeeded or raised.
    """
    temp_path = None
    ocr_path = None
    try:
        # delete=False: the file must outlive this `with` so ocr_pdf can open
        # it by path; the `finally` below is responsible for removing it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_path = Path(temp_file.name)
            upload.seek(0)
            temp_file.write(upload.read())

        logger.info("Starting OCR process...")

        # NOTE(review): ocr_pdf is assumed to return a pathlib.Path (or a
        # falsy value on failure) -- confirm against ocr_runner.
        ocr_path = ocr_pdf(temp_path)
        if not ocr_path or not ocr_path.exists():
            return None

        with open(ocr_path, 'rb') as ocr_file:
            ocr_pdf_base64 = base64.b64encode(ocr_file.read()).decode('utf-8')
            # Rewind so pdf_to_json sees the document from the first byte.
            ocr_file.seek(0)
            result = pdf_to_json(ocr_file)

        return ocr_pdf_base64, result
    finally:
        _remove_file(ocr_path)
        _remove_file(temp_path)


def _post_to_service(name, url, payload, timeout=600):
    """POST *payload* as JSON to a downstream service, best effort.

    Failures are logged and swallowed on purpose: the OCR response is
    returned to the caller regardless of whether the downstream call worked.
    """
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        logger.info(f"{name} response: {response.status_code}")
    except Exception as e:
        logger.error(f"Error calling {name}: {e}")


@app.route('/ocr', methods=['POST'])
def convert_extract_text_from_pdf():
    """OCR an uploaded PDF and forward its extracted text downstream.

    Expects a multipart POST with a ``file`` part and an integer ``id`` form
    field. The OCR'd page text is sent to the EXXETA and SPACY services
    (best effort) and the OCR'd PDF is returned base64-encoded.

    Returns:
        A ``(body, status)`` tuple:
        * 400 when the file or id is missing, or the id is not an integer;
        * 500 when OCR yields no output or any processing step raises;
        * 200 with ``status``, ``ocr_pdf`` and ``message`` otherwise.
    """
    if "file" not in request.files:
        return {"error": "No file"}, 400

    file = request.files["file"]
    pitchbook_id = request.form.get("id")

    if not pitchbook_id:
        return {"error": "No ID"}, 400

    # Validate before the expensive OCR step so a malformed id is reported as
    # a client error instead of surfacing later as a 500.
    try:
        numeric_id = int(pitchbook_id)
    except ValueError:
        return {"error": "Invalid ID"}, 400

    logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")

    try:
        ocr_output = _run_ocr(file)
        if ocr_output is None:
            return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
        ocr_pdf_base64, result = ocr_output

        payload = {
            "id": numeric_id,
            "extracted_text_per_page": result["pages"],
        }

        logger.info("Sending payload to EXXETA and SPACY services")
        _post_to_service("EXXETA", EXXETA_URL, payload)
        _post_to_service("SPACY", SPACY_URL, payload)

        return {
            "status": "sent",
            "ocr_pdf": ocr_pdf_base64,
            "message": "PDF successfully OCR'd and processed",
        }, 200

    except Exception as e:
        # Top-level request boundary: log with traceback and report a 500.
        logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
        return {"error": f"Processing failed: {str(e)}"}, 500
|
|
|
|
if __name__ == "__main__":
    # Debug mode enables the Werkzeug interactive debugger, which allows
    # arbitrary code execution. Because the server binds to all interfaces,
    # it is opt-in via FLASK_DEBUG rather than hard-coded on.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    logger.info("Starting OCR service on port 5000")
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)