Working ocr, exxeta, spacy to validate
parent
af75439270
commit
3992cac54f
|
|
@ -26,4 +26,4 @@ def health_check():
|
|||
|
||||
# für Docker wichtig: host='0.0.0.0'
|
||||
if __name__ == "__main__":
|
||||
socketio.run(app, debug=True, host="0.0.0.0", port=5000)
|
||||
socketio.run(app, debug=True, host="0.0.0.0", port=5050)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from flask import Blueprint, request, jsonify, send_file
|
||||
from flask import Blueprint, request, jsonify, send_file, current_app
|
||||
from model.database import db
|
||||
from model.pitch_book_model import PitchBookModel
|
||||
from io import BytesIO
|
||||
|
|
@ -13,39 +13,46 @@ from controller.socketIO import socketio
|
|||
pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book")
|
||||
OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:5051")
|
||||
|
||||
def process_pdf_async(file_id, file_data, filename):
|
||||
try:
|
||||
socketio.emit("progress", {"id": file_id, "progress": 10})
|
||||
def process_pdf_async(app, file_id, file_data, filename):
|
||||
with app.app_context():
|
||||
try:
|
||||
socketio.emit("progress", {"id": file_id, "progress": 10})
|
||||
|
||||
file_obj = BytesIO(file_data)
|
||||
file_obj.name = filename
|
||||
file_obj = BytesIO(file_data)
|
||||
file_obj.name = filename
|
||||
|
||||
files = {'file': (filename, file_obj, 'application/pdf')}
|
||||
data = {'id': file_id}
|
||||
files = {'file': (filename, file_obj, 'application/pdf')}
|
||||
data = {'id': file_id}
|
||||
|
||||
response = requests.post(
|
||||
f"{OCR_SERVICE_URL}/ocr",
|
||||
files=files,
|
||||
data=data,
|
||||
timeout=600 # 10 minute timeout
|
||||
)
|
||||
response = requests.post(
|
||||
f"{OCR_SERVICE_URL}/ocr",
|
||||
files=files,
|
||||
data=data,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
response_data = response.json()
|
||||
if 'ocr_pdf' in response_data:
|
||||
import base64
|
||||
ocr_pdf_data = base64.b64decode(response_data['ocr_pdf'])
|
||||
if response.status_code == 200:
|
||||
response_data = response.json()
|
||||
if 'ocr_pdf' in response_data:
|
||||
import base64
|
||||
ocr_pdf_data = base64.b64decode(response_data['ocr_pdf'])
|
||||
|
||||
file_record = PitchBookModel.query.get(file_id)
|
||||
if file_record:
|
||||
file_record.file = ocr_pdf_data
|
||||
db.session.commit()
|
||||
socketio.emit("progress", {"id": file_id, "progress": 50})
|
||||
else:
|
||||
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
|
||||
file_record = PitchBookModel.query.get(file_id)
|
||||
if file_record:
|
||||
file_record.file = ocr_pdf_data
|
||||
db.session.commit()
|
||||
|
||||
except Exception as e:
|
||||
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
|
||||
print(f"[DEBUG] PDF updated in database:")
|
||||
print(f"[DEBUG] - Successfully saved to database")
|
||||
|
||||
socketio.emit("progress", {"id": file_id, "progress": 50})
|
||||
else:
|
||||
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
|
||||
|
||||
|
||||
@pitch_book_controller.route("/", methods=["POST"])
|
||||
|
|
@ -73,9 +80,11 @@ def upload_file():
|
|||
db.session.add(new_file)
|
||||
db.session.commit()
|
||||
|
||||
app = current_app._get_current_object()
|
||||
|
||||
processing_thread = threading.Thread(
|
||||
target=process_pdf_async,
|
||||
args=(new_file.id, file_data, fileName),
|
||||
args=(app, new_file.id, file_data, fileName),
|
||||
daemon=True
|
||||
)
|
||||
processing_thread.start()
|
||||
|
|
@ -145,24 +154,4 @@ def delete_file(id):
|
|||
db.session.delete(file)
|
||||
db.session.commit()
|
||||
|
||||
return jsonify({"message": f"File {id} deleted successfully"}), 200
|
||||
|
||||
|
||||
# endpoint to receive final results from validate service
|
||||
@pitch_book_controller.route("/<int:id>/results", methods=["POST"])
|
||||
def receive_results(id):
|
||||
try:
|
||||
results = request.get_json()
|
||||
print(f"Received final results for pitchbook {id}:")
|
||||
print(f"Results: {results}")
|
||||
|
||||
file = PitchBookModel.query.get_or_404(id)
|
||||
file.kpi = str(results)
|
||||
db.session.commit()
|
||||
socketio.emit("progress", {"id": id, "progress": 100})
|
||||
|
||||
return jsonify({"message": "Results received successfully"}), 200
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing results for pitchbook {id}: {e}")
|
||||
return jsonify({"error": str(e)}), 500
|
||||
return jsonify({"message": f"File {id} deleted successfully"}), 200
|
||||
|
|
@ -24,9 +24,19 @@ def extract_text_from_ocr_json():
|
|||
"entities": entities
|
||||
}
|
||||
|
||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
|
||||
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
|
||||
|
||||
try:
|
||||
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||
print(f"[EXXETA] Validate service response: {response.status_code}")
|
||||
if response.status_code != 200:
|
||||
print(f"[EXXETA] Validate service error: {response.text}")
|
||||
except Exception as e:
|
||||
print(f"[EXXETA] Error sending to validate service: {e}")
|
||||
|
||||
return jsonify("Sent to validate-service"), 200
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=5053)
|
||||
app.run(host="0.0.0.0", port=5053, debug=True)
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
FROM python:3.11-alpine
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apk add --no-cache \
|
||||
gcc \
|
||||
musl-dev \
|
||||
libffi-dev \
|
||||
python3-dev \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-data-deu \
|
||||
tesseract-ocr-data-eng \
|
||||
ghostscript \
|
||||
qpdf \
|
||||
pngquant \
|
||||
leptonica \
|
||||
openjpeg \
|
||||
libgomp
|
||||
|
||||
RUN pip install gunicorn
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
CMD ["gunicorn", "--timeout", "600", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]
|
||||
|
|
@ -5,6 +5,11 @@ import os
|
|||
import tempfile
|
||||
import base64
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
|
@ -19,43 +24,64 @@ def convert_extract_text_from_pdf():
|
|||
file = request.files["file"]
|
||||
pitchbook_id = request.form.get("id")
|
||||
|
||||
logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")
|
||||
|
||||
if not pitchbook_id:
|
||||
return {"error": "No ID"}, 400
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
||||
file.seek(0)
|
||||
temp_file.write(file.read())
|
||||
temp_path = Path(temp_file.name)
|
||||
ocr_path = ocr_pdf(temp_path)
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
||||
file.seek(0)
|
||||
temp_file.write(file.read())
|
||||
temp_path = Path(temp_file.name)
|
||||
|
||||
if not ocr_path or not ocr_path.exists():
|
||||
temp_path.unlink() # cleanup
|
||||
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
|
||||
logger.info("Starting OCR process...")
|
||||
|
||||
ocr_path = ocr_pdf(temp_path)
|
||||
|
||||
with open(ocr_path, 'rb') as ocr_file:
|
||||
ocr_pdf_data = ocr_file.read()
|
||||
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
|
||||
if not ocr_path or not ocr_path.exists():
|
||||
temp_path.unlink() # cleanup
|
||||
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
|
||||
|
||||
ocr_file.seek(0)
|
||||
result = pdf_to_json(ocr_file)
|
||||
with open(ocr_path, 'rb') as ocr_file:
|
||||
ocr_pdf_data = ocr_file.read()
|
||||
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
|
||||
|
||||
ocr_path.unlink()
|
||||
temp_path.unlink()
|
||||
ocr_file.seek(0)
|
||||
result = pdf_to_json(ocr_file)
|
||||
|
||||
payload = {
|
||||
"id": int(pitchbook_id),
|
||||
"extracted_text_per_page": result["pages"]
|
||||
}
|
||||
ocr_path.unlink()
|
||||
temp_path.unlink()
|
||||
|
||||
requests.post(EXXETA_URL, json=payload, timeout=600)
|
||||
requests.post(SPACY_URL, json=payload, timeout=600)
|
||||
payload = {
|
||||
"id": int(pitchbook_id),
|
||||
"extracted_text_per_page": result["pages"]
|
||||
}
|
||||
|
||||
return {
|
||||
"status": "sent",
|
||||
"ocr_pdf": ocr_pdf_base64,
|
||||
"message": "PDF successfully OCR'd and processed"
|
||||
}, 200
|
||||
logger.info(f"Sending payload to EXXETA and SPACY services")
|
||||
|
||||
try:
|
||||
exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600)
|
||||
logger.info(f"EXXETA response: {exxeta_response.status_code}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling EXXETA: {e}")
|
||||
|
||||
try:
|
||||
spacy_response = requests.post(SPACY_URL, json=payload, timeout=600)
|
||||
logger.info(f"SPACY response: {spacy_response.status_code}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling SPACY: {e}")
|
||||
|
||||
return {
|
||||
"status": "sent",
|
||||
"ocr_pdf": ocr_pdf_base64,
|
||||
"message": "PDF successfully OCR'd and processed"
|
||||
}, 200
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
|
||||
return {"error": f"Processing failed: {str(e)}"}, 500
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=5051)
|
||||
logger.info("Starting OCR service on port 5000")
|
||||
app.run(host="0.0.0.0", port=5000, debug=True)
|
||||
|
|
@ -4,6 +4,11 @@ import pdfplumber
|
|||
import json
|
||||
from pathlib import Path
|
||||
import os
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TEMP_DIR = Path("/tmp/ocr_processing")
|
||||
TEMP_DIR.mkdir(exist_ok=True)
|
||||
|
|
@ -39,6 +44,7 @@ def pdf_to_json(pdf_input):
|
|||
raise ValueError("Invalid file type provided")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract text from PDF: {str(e)}")
|
||||
raise Exception(f"Failed to extract text from PDF: {str(e)}")
|
||||
|
||||
|
||||
|
|
@ -49,6 +55,16 @@ def ocr_pdf(input_file_path: Path):
|
|||
log_file = log_folder / f"{input_path.stem}.log"
|
||||
sidecar_txt = output_folder / f"{input_path.stem}.txt"
|
||||
|
||||
if not input_path.exists():
|
||||
logger.error(f"Input file does not exist: {input_path}")
|
||||
return None
|
||||
|
||||
try:
|
||||
subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True)
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
logger.error("ocrmypdf is not installed or not in PATH")
|
||||
return None
|
||||
|
||||
cmd = [
|
||||
"ocrmypdf",
|
||||
"--force-ocr",
|
||||
|
|
@ -66,15 +82,24 @@ def ocr_pdf(input_file_path: Path):
|
|||
result = subprocess.run(cmd, stdout=log, stderr=log, timeout=600)
|
||||
|
||||
if result.returncode == 0:
|
||||
return output_file
|
||||
if output_file.exists():
|
||||
logger.info(f"OCR successful, output file size: {output_file.stat().st_size} bytes")
|
||||
return output_file
|
||||
else:
|
||||
logger.error(f"OCR completed but output file not found: {output_file}")
|
||||
return None
|
||||
else:
|
||||
logger.error(f"OCR failed with return code: {result.returncode}")
|
||||
return None
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error("OCR timed out after 600 seconds")
|
||||
return None
|
||||
except FileNotFoundError:
|
||||
except FileNotFoundError as e:
|
||||
logger.error(f"File not found error: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in OCR: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -93,4 +118,5 @@ def extract_text_to_json(pdf_path: Path):
|
|||
return json_path
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract text to JSON: {e}")
|
||||
return None
|
||||
|
|
@ -24,9 +24,19 @@ def extract_pdf():
|
|||
"entities": entities
|
||||
}
|
||||
|
||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
|
||||
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
|
||||
|
||||
try:
|
||||
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||
print(f"[EXXETA] Validate service response: {response.status_code}")
|
||||
if response.status_code != 200:
|
||||
print(f"[EXXETA] Validate service error: {response.text}")
|
||||
except Exception as e:
|
||||
print(f"[EXXETA] Error sending to validate service: {e}")
|
||||
|
||||
return jsonify("Sent to validate-service"), 200
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=5052)
|
||||
app.run(host="0.0.0.0", port=5052, debug=True)
|
||||
|
|
@ -38,7 +38,7 @@ services:
|
|||
ocr:
|
||||
build:
|
||||
context: backend/ocr-service
|
||||
dockerfile: ../../Dockerfile
|
||||
dockerfile: Dockerfile
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -6,7 +6,7 @@ import FileUpload from "react-material-file-upload";
|
|||
import { socket } from "../socket";
|
||||
import { CircularProgressWithLabel } from "./CircularProgressWithLabel";
|
||||
|
||||
const PROGRESS = false;
|
||||
const PROGRESS = true;
|
||||
|
||||
export default function UploadPage() {
|
||||
const [files, setFiles] = useState<File[]>([]);
|
||||
|
|
@ -26,15 +26,14 @@ export default function UploadPage() {
|
|||
if (response.ok) {
|
||||
console.log("File uploaded successfully");
|
||||
const data = await response.json();
|
||||
console.log(data);
|
||||
setPageId(data.id);
|
||||
setPageId(data.id.toString());
|
||||
setLoadingState(0);
|
||||
|
||||
!PROGRESS &&
|
||||
navigate({
|
||||
to: "/extractedResult/$pitchBook",
|
||||
params: { pitchBook: data.id },
|
||||
});
|
||||
navigate({
|
||||
to: "/extractedResult/$pitchBook",
|
||||
params: { pitchBook: data.id.toString() },
|
||||
});
|
||||
} else {
|
||||
console.error("Failed to upload file");
|
||||
}
|
||||
|
|
@ -46,12 +45,12 @@ export default function UploadPage() {
|
|||
|
||||
const onProgress = useCallback(
|
||||
(progress: { id: number; progress: number }) => {
|
||||
console.log("Progress:", progress);
|
||||
console.log(pageId);
|
||||
if (Number(pageId) === progress.id) {
|
||||
if (pageId === progress.id.toString()) {
|
||||
setLoadingState(progress.progress);
|
||||
|
||||
if (progress.progress === 100) {
|
||||
setPageId(null);
|
||||
setLoadingState(null);
|
||||
navigate({
|
||||
to: "/extractedResult/$pitchBook",
|
||||
params: { pitchBook: progress.id.toString() },
|
||||
|
|
@ -71,12 +70,19 @@ export default function UploadPage() {
|
|||
};
|
||||
}, [onConnection, onProgress]);
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
setPageId(null);
|
||||
setLoadingState(null);
|
||||
};
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<>
|
||||
{PROGRESS && (
|
||||
<Backdrop
|
||||
sx={(theme) => ({ color: "#fff", zIndex: theme.zIndex.drawer + 1 })}
|
||||
open={pageId !== null && loadingState !== null}
|
||||
open={pageId !== null && loadingState !== null && loadingState < 100}
|
||||
>
|
||||
<CircularProgressWithLabel
|
||||
color="inherit"
|
||||
|
|
@ -175,4 +181,4 @@ export default function UploadPage() {
|
|||
</Box>
|
||||
</>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -9,6 +9,7 @@ import { Box, IconButton } from "@mui/material";
|
|||
interface PDFViewerProps {
|
||||
pitchBookId: string;
|
||||
}
|
||||
|
||||
export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
|
||||
const [numPages, setNumPages] = useState<number | null>(null);
|
||||
const [pageNumber, setPageNumber] = useState(1);
|
||||
|
|
@ -90,4 +91,4 @@ export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
|
|||
</Box>
|
||||
</Box>
|
||||
);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue