pse2_ff/project/backend/ocr-service/app.py

51 lines
1.4 KiB
Python

from flask import Flask, request, jsonify
from ocr_runner import pdf_to_json, ocr_pdf
import requests
import os
import tempfile
from pathlib import Path
# Flask application instance; the route handlers in this module register on it.
app = Flask(__name__)

# Endpoints the OCR result is forwarded to. Each can be overridden through
# its environment variable; the defaults point at services on localhost.
EXXETA_URL = os.environ.get(
    "EXXETA_SERVICE_URL",
    "http://localhost:5053/extract",
)
SPACY_URL = os.environ.get(
    "SPACY_SERVICE_URL",
    "http://localhost:5052/extract",
)
def _remove_quietly(path):
    """Delete *path* if it is set; a file that is already gone is ignored."""
    if not path:
        return
    try:
        path.unlink()
    except FileNotFoundError:
        # Already removed (e.g. the OCR step returned the temp path itself).
        pass


@app.route('/ocr', methods=['POST'])
def convert_extract_text_from_pdf():
    """Run OCR on an uploaded PDF and forward the per-page text downstream.

    Expects a multipart POST with:
      * ``file`` -- the uploaded PDF (read from ``request.files``).
      * ``id``   -- form field that must parse as an integer.

    The upload is copied to a temporary ``.pdf`` file and handed to
    ``ocr_pdf``. If that returns a path that exists, text is extracted from
    it with ``pdf_to_json``; otherwise text is extracted from the original
    upload. The payload ``{"id", "extracted_text_per_page"}`` is then POSTed
    to ``EXXETA_URL`` and ``SPACY_URL``.

    Returns:
        ``({"status": "sent"}, 200)`` once both POSTs have returned, or
        ``({"error": ...}, 400)`` when the file or the ID is missing or the
        ID is not an integer.

    Raises:
        Exceptions from ``ocr_pdf``, ``pdf_to_json`` and ``requests.post``
        are not caught here and propagate to Flask; temporary files are
        removed before they do.
    """
    if "file" not in request.files:
        return {"error": "No file"}, 400
    file = request.files["file"]

    pitchbook_id = request.form.get("id")
    if not pitchbook_id:
        return {"error": "No ID"}, 400
    # Validate before doing any OCR work so a malformed ID is a client error
    # (400) instead of an unhandled ValueError after the expensive step.
    try:
        numeric_id = int(pitchbook_id)
    except ValueError:
        return {"error": "Invalid ID"}, 400

    temp_path = None
    ocr_path = None
    try:
        # delete=False: the file must outlive this ``with`` so ocr_pdf can
        # open it by path; it is removed explicitly in ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_path = Path(temp_file.name)
            file.seek(0)
            temp_file.write(file.read())

        # Called after the ``with`` so the data is flushed and the handle
        # closed before another reader opens the path.
        ocr_path = ocr_pdf(temp_path)
        if ocr_path and ocr_path.exists():
            with open(ocr_path, 'rb') as ocr_file:
                result = pdf_to_json(ocr_file)
        else:
            # No OCR output available: fall back to the original upload.
            file.seek(0)
            result = pdf_to_json(file)
    finally:
        # Always clean up, including when OCR or extraction raises.
        _remove_quietly(ocr_path)
        _remove_quietly(temp_path)

    payload = {
        "id": numeric_id,
        "extracted_text_per_page": result["pages"]
    }
    # NOTE: the responses are not inspected, so a non-2xx reply from either
    # service still yields "sent"; only transport errors/timeouts raise.
    requests.post(EXXETA_URL, json=payload, timeout=30)
    requests.post(SPACY_URL, json=payload, timeout=30)
    return {"status": "sent"}, 200
if __name__ == "__main__":
    # Only when executed directly as a script: start Flask's built-in server,
    # listening on all interfaces on port 5051.
    app.run(
        host="0.0.0.0",
        port=5051,
    )