Working ocr, exxeta, spacy to validate
parent
af75439270
commit
3992cac54f
|
|
@ -26,4 +26,4 @@ def health_check():
|
||||||
|
|
||||||
# für Docker wichtig: host='0.0.0.0'
|
# für Docker wichtig: host='0.0.0.0'
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
socketio.run(app, debug=True, host="0.0.0.0", port=5000)
|
socketio.run(app, debug=True, host="0.0.0.0", port=5050)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from flask import Blueprint, request, jsonify, send_file
|
from flask import Blueprint, request, jsonify, send_file, current_app
|
||||||
from model.database import db
|
from model.database import db
|
||||||
from model.pitch_book_model import PitchBookModel
|
from model.pitch_book_model import PitchBookModel
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
@ -13,7 +13,8 @@ from controller.socketIO import socketio
|
||||||
pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book")
|
pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book")
|
||||||
OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:5051")
|
OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:5051")
|
||||||
|
|
||||||
def process_pdf_async(file_id, file_data, filename):
|
def process_pdf_async(app, file_id, file_data, filename):
|
||||||
|
with app.app_context():
|
||||||
try:
|
try:
|
||||||
socketio.emit("progress", {"id": file_id, "progress": 10})
|
socketio.emit("progress", {"id": file_id, "progress": 10})
|
||||||
|
|
||||||
|
|
@ -27,7 +28,7 @@ def process_pdf_async(file_id, file_data, filename):
|
||||||
f"{OCR_SERVICE_URL}/ocr",
|
f"{OCR_SERVICE_URL}/ocr",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
timeout=600 # 10 minute timeout
|
timeout=600
|
||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
|
|
@ -40,11 +41,17 @@ def process_pdf_async(file_id, file_data, filename):
|
||||||
if file_record:
|
if file_record:
|
||||||
file_record.file = ocr_pdf_data
|
file_record.file = ocr_pdf_data
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
|
print(f"[DEBUG] PDF updated in database:")
|
||||||
|
print(f"[DEBUG] - Successfully saved to database")
|
||||||
|
|
||||||
socketio.emit("progress", {"id": file_id, "progress": 50})
|
socketio.emit("progress", {"id": file_id, "progress": 50})
|
||||||
else:
|
else:
|
||||||
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
|
socketio.emit("error", {"id": file_id, "message": "OCR processing failed"})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
|
socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"})
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -73,9 +80,11 @@ def upload_file():
|
||||||
db.session.add(new_file)
|
db.session.add(new_file)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
|
app = current_app._get_current_object()
|
||||||
|
|
||||||
processing_thread = threading.Thread(
|
processing_thread = threading.Thread(
|
||||||
target=process_pdf_async,
|
target=process_pdf_async,
|
||||||
args=(new_file.id, file_data, fileName),
|
args=(app, new_file.id, file_data, fileName),
|
||||||
daemon=True
|
daemon=True
|
||||||
)
|
)
|
||||||
processing_thread.start()
|
processing_thread.start()
|
||||||
|
|
@ -146,23 +155,3 @@ def delete_file(id):
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
return jsonify({"message": f"File {id} deleted successfully"}), 200
|
return jsonify({"message": f"File {id} deleted successfully"}), 200
|
||||||
|
|
||||||
|
|
||||||
# endpoint to receive final results from validate service
|
|
||||||
@pitch_book_controller.route("/<int:id>/results", methods=["POST"])
|
|
||||||
def receive_results(id):
|
|
||||||
try:
|
|
||||||
results = request.get_json()
|
|
||||||
print(f"Received final results for pitchbook {id}:")
|
|
||||||
print(f"Results: {results}")
|
|
||||||
|
|
||||||
file = PitchBookModel.query.get_or_404(id)
|
|
||||||
file.kpi = str(results)
|
|
||||||
db.session.commit()
|
|
||||||
socketio.emit("progress", {"id": id, "progress": 100})
|
|
||||||
|
|
||||||
return jsonify({"message": "Results received successfully"}), 200
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error processing results for pitchbook {id}: {e}")
|
|
||||||
return jsonify({"error": str(e)}), 500
|
|
||||||
|
|
@ -24,9 +24,19 @@ def extract_text_from_ocr_json():
|
||||||
"entities": entities
|
"entities": entities
|
||||||
}
|
}
|
||||||
|
|
||||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
|
||||||
|
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||||
|
print(f"[EXXETA] Validate service response: {response.status_code}")
|
||||||
|
if response.status_code != 200:
|
||||||
|
print(f"[EXXETA] Validate service error: {response.text}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[EXXETA] Error sending to validate service: {e}")
|
||||||
|
|
||||||
return jsonify("Sent to validate-service"), 200
|
return jsonify("Sent to validate-service"), 200
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(host="0.0.0.0", port=5053)
|
app.run(host="0.0.0.0", port=5053, debug=True)
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
FROM python:3.11-alpine
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apk add --no-cache \
|
||||||
|
gcc \
|
||||||
|
musl-dev \
|
||||||
|
libffi-dev \
|
||||||
|
python3-dev \
|
||||||
|
tesseract-ocr \
|
||||||
|
tesseract-ocr-data-deu \
|
||||||
|
tesseract-ocr-data-eng \
|
||||||
|
ghostscript \
|
||||||
|
qpdf \
|
||||||
|
pngquant \
|
||||||
|
leptonica \
|
||||||
|
openjpeg \
|
||||||
|
libgomp
|
||||||
|
|
||||||
|
RUN pip install gunicorn
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
CMD ["gunicorn", "--timeout", "600", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]
|
||||||
|
|
@ -5,6 +5,11 @@ import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import base64
|
import base64
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
@ -19,20 +24,25 @@ def convert_extract_text_from_pdf():
|
||||||
file = request.files["file"]
|
file = request.files["file"]
|
||||||
pitchbook_id = request.form.get("id")
|
pitchbook_id = request.form.get("id")
|
||||||
|
|
||||||
|
logger.info(f"Processing file for pitchbook_id: {pitchbook_id}")
|
||||||
|
|
||||||
if not pitchbook_id:
|
if not pitchbook_id:
|
||||||
return {"error": "No ID"}, 400
|
return {"error": "No ID"}, 400
|
||||||
|
|
||||||
|
try:
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
||||||
file.seek(0)
|
file.seek(0)
|
||||||
temp_file.write(file.read())
|
temp_file.write(file.read())
|
||||||
temp_path = Path(temp_file.name)
|
temp_path = Path(temp_file.name)
|
||||||
|
|
||||||
|
logger.info("Starting OCR process...")
|
||||||
|
|
||||||
ocr_path = ocr_pdf(temp_path)
|
ocr_path = ocr_pdf(temp_path)
|
||||||
|
|
||||||
if not ocr_path or not ocr_path.exists():
|
if not ocr_path or not ocr_path.exists():
|
||||||
temp_path.unlink() # cleanup
|
temp_path.unlink() # cleanup
|
||||||
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
|
return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500
|
||||||
|
|
||||||
|
|
||||||
with open(ocr_path, 'rb') as ocr_file:
|
with open(ocr_path, 'rb') as ocr_file:
|
||||||
ocr_pdf_data = ocr_file.read()
|
ocr_pdf_data = ocr_file.read()
|
||||||
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
|
ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8')
|
||||||
|
|
@ -48,8 +58,19 @@ def convert_extract_text_from_pdf():
|
||||||
"extracted_text_per_page": result["pages"]
|
"extracted_text_per_page": result["pages"]
|
||||||
}
|
}
|
||||||
|
|
||||||
requests.post(EXXETA_URL, json=payload, timeout=600)
|
logger.info(f"Sending payload to EXXETA and SPACY services")
|
||||||
requests.post(SPACY_URL, json=payload, timeout=600)
|
|
||||||
|
try:
|
||||||
|
exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600)
|
||||||
|
logger.info(f"EXXETA response: {exxeta_response.status_code}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error calling EXXETA: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
spacy_response = requests.post(SPACY_URL, json=payload, timeout=600)
|
||||||
|
logger.info(f"SPACY response: {spacy_response.status_code}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error calling SPACY: {e}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "sent",
|
"status": "sent",
|
||||||
|
|
@ -57,5 +78,10 @@ def convert_extract_text_from_pdf():
|
||||||
"message": "PDF successfully OCR'd and processed"
|
"message": "PDF successfully OCR'd and processed"
|
||||||
}, 200
|
}, 200
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True)
|
||||||
|
return {"error": f"Processing failed: {str(e)}"}, 500
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(host="0.0.0.0", port=5051)
|
logger.info("Starting OCR service on port 5000")
|
||||||
|
app.run(host="0.0.0.0", port=5000, debug=True)
|
||||||
|
|
@ -4,6 +4,11 @@ import pdfplumber
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
TEMP_DIR = Path("/tmp/ocr_processing")
|
TEMP_DIR = Path("/tmp/ocr_processing")
|
||||||
TEMP_DIR.mkdir(exist_ok=True)
|
TEMP_DIR.mkdir(exist_ok=True)
|
||||||
|
|
@ -39,6 +44,7 @@ def pdf_to_json(pdf_input):
|
||||||
raise ValueError("Invalid file type provided")
|
raise ValueError("Invalid file type provided")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to extract text from PDF: {str(e)}")
|
||||||
raise Exception(f"Failed to extract text from PDF: {str(e)}")
|
raise Exception(f"Failed to extract text from PDF: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -49,6 +55,16 @@ def ocr_pdf(input_file_path: Path):
|
||||||
log_file = log_folder / f"{input_path.stem}.log"
|
log_file = log_folder / f"{input_path.stem}.log"
|
||||||
sidecar_txt = output_folder / f"{input_path.stem}.txt"
|
sidecar_txt = output_folder / f"{input_path.stem}.txt"
|
||||||
|
|
||||||
|
if not input_path.exists():
|
||||||
|
logger.error(f"Input file does not exist: {input_path}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True)
|
||||||
|
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||||
|
logger.error("ocrmypdf is not installed or not in PATH")
|
||||||
|
return None
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
"--force-ocr",
|
"--force-ocr",
|
||||||
|
|
@ -66,15 +82,24 @@ def ocr_pdf(input_file_path: Path):
|
||||||
result = subprocess.run(cmd, stdout=log, stderr=log, timeout=600)
|
result = subprocess.run(cmd, stdout=log, stderr=log, timeout=600)
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
|
if output_file.exists():
|
||||||
|
logger.info(f"OCR successful, output file size: {output_file.stat().st_size} bytes")
|
||||||
return output_file
|
return output_file
|
||||||
else:
|
else:
|
||||||
|
logger.error(f"OCR completed but output file not found: {output_file}")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
logger.error(f"OCR failed with return code: {result.returncode}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
|
logger.error("OCR timed out after 600 seconds")
|
||||||
return None
|
return None
|
||||||
except FileNotFoundError:
|
except FileNotFoundError as e:
|
||||||
|
logger.error(f"File not found error: {e}")
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
logger.error(f"Unexpected error in OCR: {e}", exc_info=True)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -93,4 +118,5 @@ def extract_text_to_json(pdf_path: Path):
|
||||||
return json_path
|
return json_path
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to extract text to JSON: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
@ -24,9 +24,19 @@ def extract_pdf():
|
||||||
"entities": entities
|
"entities": entities
|
||||||
}
|
}
|
||||||
|
|
||||||
requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}")
|
||||||
|
print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600)
|
||||||
|
print(f"[EXXETA] Validate service response: {response.status_code}")
|
||||||
|
if response.status_code != 200:
|
||||||
|
print(f"[EXXETA] Validate service error: {response.text}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[EXXETA] Error sending to validate service: {e}")
|
||||||
|
|
||||||
return jsonify("Sent to validate-service"), 200
|
return jsonify("Sent to validate-service"), 200
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(host="0.0.0.0", port=5052)
|
app.run(host="0.0.0.0", port=5052, debug=True)
|
||||||
|
|
@ -38,7 +38,7 @@ services:
|
||||||
ocr:
|
ocr:
|
||||||
build:
|
build:
|
||||||
context: backend/ocr-service
|
context: backend/ocr-service
|
||||||
dockerfile: ../../Dockerfile
|
dockerfile: Dockerfile
|
||||||
env_file:
|
env_file:
|
||||||
- .env
|
- .env
|
||||||
environment:
|
environment:
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -6,7 +6,7 @@ import FileUpload from "react-material-file-upload";
|
||||||
import { socket } from "../socket";
|
import { socket } from "../socket";
|
||||||
import { CircularProgressWithLabel } from "./CircularProgressWithLabel";
|
import { CircularProgressWithLabel } from "./CircularProgressWithLabel";
|
||||||
|
|
||||||
const PROGRESS = false;
|
const PROGRESS = true;
|
||||||
|
|
||||||
export default function UploadPage() {
|
export default function UploadPage() {
|
||||||
const [files, setFiles] = useState<File[]>([]);
|
const [files, setFiles] = useState<File[]>([]);
|
||||||
|
|
@ -26,14 +26,13 @@ export default function UploadPage() {
|
||||||
if (response.ok) {
|
if (response.ok) {
|
||||||
console.log("File uploaded successfully");
|
console.log("File uploaded successfully");
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
console.log(data);
|
setPageId(data.id.toString());
|
||||||
setPageId(data.id);
|
|
||||||
setLoadingState(0);
|
setLoadingState(0);
|
||||||
|
|
||||||
!PROGRESS &&
|
!PROGRESS &&
|
||||||
navigate({
|
navigate({
|
||||||
to: "/extractedResult/$pitchBook",
|
to: "/extractedResult/$pitchBook",
|
||||||
params: { pitchBook: data.id },
|
params: { pitchBook: data.id.toString() },
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
console.error("Failed to upload file");
|
console.error("Failed to upload file");
|
||||||
|
|
@ -46,12 +45,12 @@ export default function UploadPage() {
|
||||||
|
|
||||||
const onProgress = useCallback(
|
const onProgress = useCallback(
|
||||||
(progress: { id: number; progress: number }) => {
|
(progress: { id: number; progress: number }) => {
|
||||||
console.log("Progress:", progress);
|
if (pageId === progress.id.toString()) {
|
||||||
console.log(pageId);
|
|
||||||
if (Number(pageId) === progress.id) {
|
|
||||||
setLoadingState(progress.progress);
|
setLoadingState(progress.progress);
|
||||||
|
|
||||||
if (progress.progress === 100) {
|
if (progress.progress === 100) {
|
||||||
|
setPageId(null);
|
||||||
|
setLoadingState(null);
|
||||||
navigate({
|
navigate({
|
||||||
to: "/extractedResult/$pitchBook",
|
to: "/extractedResult/$pitchBook",
|
||||||
params: { pitchBook: progress.id.toString() },
|
params: { pitchBook: progress.id.toString() },
|
||||||
|
|
@ -71,12 +70,19 @@ export default function UploadPage() {
|
||||||
};
|
};
|
||||||
}, [onConnection, onProgress]);
|
}, [onConnection, onProgress]);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
return () => {
|
||||||
|
setPageId(null);
|
||||||
|
setLoadingState(null);
|
||||||
|
};
|
||||||
|
}, []);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
{PROGRESS && (
|
{PROGRESS && (
|
||||||
<Backdrop
|
<Backdrop
|
||||||
sx={(theme) => ({ color: "#fff", zIndex: theme.zIndex.drawer + 1 })}
|
sx={(theme) => ({ color: "#fff", zIndex: theme.zIndex.drawer + 1 })}
|
||||||
open={pageId !== null && loadingState !== null}
|
open={pageId !== null && loadingState !== null && loadingState < 100}
|
||||||
>
|
>
|
||||||
<CircularProgressWithLabel
|
<CircularProgressWithLabel
|
||||||
color="inherit"
|
color="inherit"
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import { Box, IconButton } from "@mui/material";
|
||||||
interface PDFViewerProps {
|
interface PDFViewerProps {
|
||||||
pitchBookId: string;
|
pitchBookId: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
|
export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
|
||||||
const [numPages, setNumPages] = useState<number | null>(null);
|
const [numPages, setNumPages] = useState<number | null>(null);
|
||||||
const [pageNumber, setPageNumber] = useState(1);
|
const [pageNumber, setPageNumber] = useState(1);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue