Working OCR, Exxeta, and spaCy services forwarding results to the validate service

pull/51/head
s8613 2025-06-04 09:28:16 +02:00
parent af75439270
commit 3992cac54f
11 changed files with 193 additions and 98 deletions

View File

@ -26,4 +26,4 @@ def health_check():
# Important for Docker: host='0.0.0.0'
if __name__ == "__main__":
socketio.run(app, debug=True, host="0.0.0.0", port=5000)
socketio.run(app, debug=True, host="0.0.0.0", port=5050)

View File

@ -1,4 +1,4 @@
from flask import Blueprint, request, jsonify, send_file
from flask import Blueprint, request, jsonify, send_file, current_app
from model.database import db
from model.pitch_book_model import PitchBookModel
from io import BytesIO
@ -13,39 +13,46 @@ from controller.socketIO import socketio
pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book")
OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:5051")
def process_pdf_async(file_id, file_data, filename):
try:
socketio.emit("progress", {"id": file_id, "progress": 10})
def process_pdf_async(app, file_id, file_data, filename):
with app.app_context():
try:
socketio.emit("progress", {"id": file_id, "progress": 10})
file_obj = BytesIO(file_data)
file_obj.name = filename
file_obj = BytesIO(file_data)
file_obj.name = filename
files = {'file': (filename, file_obj, 'application/pdf')}
data = {'id': file_id}
files = {'file': (filename, file_obj, 'application/pdf')}
data = {'id': file_id}
response = requests.post(
f"{OCR_SERVICE_URL}/ocr",
files=files,
data=data,
timeout=600 # 10 minute timeout
)
response = requests.post(
f"{OCR_SERVICE_URL}/ocr",
files=files,
data=data,
timeout=600
)
if response.status_code == 200:
response_data = response.json()
if 'ocr_pdf' in response_data:
import base64
ocr_pdf_data = base64.b64decode(response_data['ocr_pdf'])
if response.status_code == 200:
response_data = response.json()
if 'ocr_pdf' in response_data:
import base64
ocr_pdf_data = base64.b64decode(response_data['ocr_pdf'])
file_record = PitchBookModel.query.get(file_id)
if file_record:
file_record.file = ocr_pdf_data
db.session.commit()
socketio.emit("progress", {"id": file_id, "progress": 50})
else:
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
file_record = PitchBookModel.query.get(file_id)
if file_record:
file_record.file = ocr_pdf_data
db.session.commit()
except Exception as e:
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
print(f"[DEBUG] PDF updated in database:")
print(f"[DEBUG] - Successfully saved to database")
socketio.emit("progress", {"id": file_id, "progress": 50})
else:
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
except Exception as e:
import traceback
traceback.print_exc()
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
@pitch_book_controller.route("/", methods=["POST"])
@ -73,9 +80,11 @@ def upload_file():
db.session.add(new_file)
db.session.commit()
app = current_app._get_current_object()
processing_thread = threading.Thread(
target=process_pdf_async,
args=(new_file.id, file_data, fileName),
args=(app, new_file.id, file_data, fileName),
daemon=True
)
processing_thread.start()
@ -145,24 +154,4 @@ def delete_file(id):
db.session.delete(file)
db.session.commit()
return jsonify({"message": f"File {id} deleted successfully"}), 200
# endpoint to receive final results from validate service
@pitch_book_controller.route("/<int:id>/results", methods=["POST"])
def receive_results(id):
try:
results = request.get_json()
print(f"Received final results for pitchbook {id}:")
print(f"Results: {results}")
file = PitchBookModel.query.get_or_404(id)
file.kpi = str(results)
db.session.commit()
socketio.emit("progress", {"id": id, "progress": 100})
return jsonify({"message": "Results received successfully"}), 200
except Exception as e:
print(f"Error processing results for pitchbook {id}: {e}")
return jsonify({"error": str(e)}), 500
return jsonify({"message": f"File {id} deleted successfully"}), 200

View File

@ -24,9 +24,19 @@ def extract_text_from_ocr_json():
"entities": entities
}
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
try:
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
print(f"[EXXETA] Validate service response: {response.status_code}")
if response.status_code != 200:
print(f"[EXXETA] Validate service error: {response.text}")
except Exception as e:
print(f"[EXXETA] Error sending to validate service: {e}")
return jsonify("Sent to validate-service"), 200
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5053)
app.run(host="0.0.0.0", port=5053, debug=True)

View File

@ -0,0 +1,27 @@
# Image for the OCR service: Python 3.11 on Alpine, served by gunicorn.
FROM python:3.11-alpine
WORKDIR /app
# System packages, installed in a single layer without keeping the apk index:
#   - gcc, musl-dev, libffi-dev, python3-dev: toolchain/headers, presumably
#     needed to build Python wheels from requirements.txt on musl -- TODO confirm
#   - tesseract-ocr with German (deu) and English (eng) language data
#   - ghostscript, qpdf, pngquant, leptonica, openjpeg, libgomp: presumably
#     runtime dependencies of the ocrmypdf CLI the service invokes -- TODO confirm
RUN apk add --no-cache \
    gcc \
    musl-dev \
    libffi-dev \
    python3-dev \
    tesseract-ocr \
    tesseract-ocr-data-deu \
    tesseract-ocr-data-eng \
    ghostscript \
    qpdf \
    pngquant \
    leptonica \
    openjpeg \
    libgomp
# --no-cache-dir keeps pip's download cache out of the image layer, matching
# the requirements install below.
RUN pip install --no-cache-dir gunicorn
# Copy requirements.txt on its own first so the dependency layer is rebuilt
# only when requirements.txt changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Serve the WSGI object `app` from module `app` on port 5000 with 2 workers.
# The 600 s worker timeout is presumably sized for long-running OCR requests.
CMD ["gunicorn", "--timeout", "600", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]

View File

@ -5,6 +5,11 @@ import os
import tempfile
import base64
from pathlib import Path
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)
@ -19,43 +24,64 @@ def convert_extract_text_from_pdf():
file = request.files["file"]
pitchbook_id = request.form.get("id")
logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")
if not pitchbook_id:
return {"error": "No ID"}, 400
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
file.seek(0)
temp_file.write(file.read())
temp_path = Path(temp_file.name)
ocr_path = ocr_pdf(temp_path)
try:
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
file.seek(0)
temp_file.write(file.read())
temp_path = Path(temp_file.name)
if not ocr_path or not ocr_path.exists():
temp_path.unlink() # cleanup
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
logger.info("Starting OCR process...")
ocr_path = ocr_pdf(temp_path)
with open(ocr_path, 'rb') as ocr_file:
ocr_pdf_data = ocr_file.read()
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
if not ocr_path or not ocr_path.exists():
temp_path.unlink() # cleanup
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
ocr_file.seek(0)
result = pdf_to_json(ocr_file)
with open(ocr_path, 'rb') as ocr_file:
ocr_pdf_data = ocr_file.read()
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
ocr_path.unlink()
temp_path.unlink()
ocr_file.seek(0)
result = pdf_to_json(ocr_file)
payload = {
"id": int(pitchbook_id),
"extracted_text_per_page": result["pages"]
}
ocr_path.unlink()
temp_path.unlink()
requests.post(EXXETA_URL, json=payload, timeout=600)
requests.post(SPACY_URL, json=payload, timeout=600)
payload = {
"id": int(pitchbook_id),
"extracted_text_per_page": result["pages"]
}
return {
"status": "sent",
"ocr_pdf": ocr_pdf_base64,
"message": "PDF successfully OCR'd and processed"
}, 200
logger.info(f"Sending payload to EXXETA and SPACY services")
try:
exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600)
logger.info(f"EXXETA response: {exxeta_response.status_code}")
except Exception as e:
logger.error(f"Error calling EXXETA: {e}")
try:
spacy_response = requests.post(SPACY_URL, json=payload, timeout=600)
logger.info(f"SPACY response: {spacy_response.status_code}")
except Exception as e:
logger.error(f"Error calling SPACY: {e}")
return {
"status": "sent",
"ocr_pdf": ocr_pdf_base64,
"message": "PDF successfully OCR'd and processed"
}, 200
except Exception as e:
logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
return {"error": f"Processing failed: {str(e)}"}, 500
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5051)
logger.info("Starting OCR service on port 5000")
app.run(host="0.0.0.0", port=5000, debug=True)

View File

@ -4,6 +4,11 @@ import pdfplumber
import json
from pathlib import Path
import os
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
TEMP_DIR = Path("/tmp/ocr_processing")
TEMP_DIR.mkdir(exist_ok=True)
@ -39,6 +44,7 @@ def pdf_to_json(pdf_input):
raise ValueError("Invalid file type provided")
except Exception as e:
logger.error(f"Failed to extract text from PDF: {str(e)}")
raise Exception(f"Failed to extract text from PDF: {str(e)}")
@ -49,6 +55,16 @@ def ocr_pdf(input_file_path: Path):
log_file = log_folder / f"{input_path.stem}.log"
sidecar_txt = output_folder / f"{input_path.stem}.txt"
if not input_path.exists():
logger.error(f"Input file does not exist: {input_path}")
return None
try:
subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
logger.error("ocrmypdf is not installed or not in PATH")
return None
cmd = [
"ocrmypdf",
"--force-ocr",
@ -66,15 +82,24 @@ def ocr_pdf(input_file_path: Path):
result = subprocess.run(cmd, stdout=log, stderr=log, timeout=600)
if result.returncode == 0:
return output_file
if output_file.exists():
logger.info(f"OCR successful, output file size: {output_file.stat().st_size} bytes")
return output_file
else:
logger.error(f"OCR completed but output file not found: {output_file}")
return None
else:
logger.error(f"OCR failed with return code: {result.returncode}")
return None
except subprocess.TimeoutExpired:
logger.error("OCR timed out after 600 seconds")
return None
except FileNotFoundError:
except FileNotFoundError as e:
logger.error(f"File not found error: {e}")
return None
except Exception as e:
logger.error(f"Unexpected error in OCR: {e}", exc_info=True)
return None
@ -93,4 +118,5 @@ def extract_text_to_json(pdf_path: Path):
return json_path
except Exception as e:
logger.error(f"Failed to extract text to JSON: {e}")
return None

View File

@ -24,9 +24,19 @@ def extract_pdf():
"entities": entities
}
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
try:
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
print(f"[EXXETA] Validate service response: {response.status_code}")
if response.status_code != 200:
print(f"[EXXETA] Validate service error: {response.text}")
except Exception as e:
print(f"[EXXETA] Error sending to validate service: {e}")
return jsonify("Sent to validate-service"), 200
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5052)
app.run(host="0.0.0.0", port=5052, debug=True)

View File

@ -38,7 +38,7 @@ services:
ocr:
build:
context: backend/ocr-service
dockerfile: ../../Dockerfile
dockerfile: Dockerfile
env_file:
- .env
environment:

Binary file not shown.

View File

@ -6,7 +6,7 @@ import FileUpload from "react-material-file-upload";
import { socket } from "../socket";
import { CircularProgressWithLabel } from "./CircularProgressWithLabel";
const PROGRESS = false;
const PROGRESS = true;
export default function UploadPage() {
const [files, setFiles] = useState<File[]>([]);
@ -26,15 +26,14 @@ export default function UploadPage() {
if (response.ok) {
console.log("File uploaded successfully");
const data = await response.json();
console.log(data);
setPageId(data.id);
setPageId(data.id.toString());
setLoadingState(0);
!PROGRESS &&
navigate({
to: "/extractedResult/$pitchBook",
params: { pitchBook: data.id },
});
navigate({
to: "/extractedResult/$pitchBook",
params: { pitchBook: data.id.toString() },
});
} else {
console.error("Failed to upload file");
}
@ -46,12 +45,12 @@ export default function UploadPage() {
const onProgress = useCallback(
(progress: { id: number; progress: number }) => {
console.log("Progress:", progress);
console.log(pageId);
if (Number(pageId) === progress.id) {
if (pageId === progress.id.toString()) {
setLoadingState(progress.progress);
if (progress.progress === 100) {
setPageId(null);
setLoadingState(null);
navigate({
to: "/extractedResult/$pitchBook",
params: { pitchBook: progress.id.toString() },
@ -71,12 +70,19 @@ export default function UploadPage() {
};
}, [onConnection, onProgress]);
useEffect(() => {
return () => {
setPageId(null);
setLoadingState(null);
};
}, []);
return (
<>
{PROGRESS && (
<Backdrop
sx={(theme) => ({ color: "#fff", zIndex: theme.zIndex.drawer + 1 })}
open={pageId !== null && loadingState !== null}
open={pageId !== null && loadingState !== null && loadingState < 100}
>
<CircularProgressWithLabel
color="inherit"
@ -175,4 +181,4 @@ export default function UploadPage() {
</Box>
</>
);
}
}

View File

@ -9,6 +9,7 @@ import { Box, IconButton } from "@mui/material";
interface PDFViewerProps {
pitchBookId: string;
}
export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
const [numPages, setNumPages] = useState<number | null>(null);
const [pageNumber, setPageNumber] = useState(1);
@ -90,4 +91,4 @@ export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
</Box>
</Box>
);
}
}