added return ocrd pdf
parent
f3bee2b62b
commit
af75439270
|
|
@ -3,6 +3,7 @@ from ocr_runner import pdf_to_json, ocr_pdf
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import base64
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
@ -25,17 +26,22 @@ def convert_extract_text_from_pdf():
|
||||||
file.seek(0)
|
file.seek(0)
|
||||||
temp_file.write(file.read())
|
temp_file.write(file.read())
|
||||||
temp_path = Path(temp_file.name)
|
temp_path = Path(temp_file.name)
|
||||||
|
|
||||||
ocr_path = ocr_pdf(temp_path)
|
ocr_path = ocr_pdf(temp_path)
|
||||||
if ocr_path and ocr_path.exists():
|
|
||||||
with open(ocr_path, 'rb') as ocr_file:
|
|
||||||
result = pdf_to_json(ocr_file)
|
|
||||||
ocr_path.unlink() # cleanup
|
|
||||||
else:
|
|
||||||
file.seek(0)
|
|
||||||
result = pdf_to_json(file)
|
|
||||||
|
|
||||||
temp_path.unlink() # cleanup
|
if not ocr_path or not ocr_path.exists():
|
||||||
|
temp_path.unlink() # cleanup
|
||||||
|
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
|
||||||
|
|
||||||
|
|
||||||
|
with open(ocr_path, 'rb') as ocr_file:
|
||||||
|
ocr_pdf_data = ocr_file.read()
|
||||||
|
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
|
||||||
|
|
||||||
|
ocr_file.seek(0)
|
||||||
|
result = pdf_to_json(ocr_file)
|
||||||
|
|
||||||
|
ocr_path.unlink()
|
||||||
|
temp_path.unlink()
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"id": int(pitchbook_id),
|
"id": int(pitchbook_id),
|
||||||
|
|
@ -45,7 +51,11 @@ def convert_extract_text_from_pdf():
|
||||||
requests.post(EXXETA_URL, json=payload, timeout=600)
|
requests.post(EXXETA_URL, json=payload, timeout=600)
|
||||||
requests.post(SPACY_URL, json=payload, timeout=600)
|
requests.post(SPACY_URL, json=payload, timeout=600)
|
||||||
|
|
||||||
return {"status": "sent"}, 200
|
return {
|
||||||
|
"status": "sent",
|
||||||
|
"ocr_pdf": ocr_pdf_base64,
|
||||||
|
"message": "PDF successfully OCR'd and processed"
|
||||||
|
}, 200
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(host="0.0.0.0", port=5051)
|
app.run(host="0.0.0.0", port=5051)
|
||||||
Loading…
Reference in New Issue