diff --git a/project/backend/coordinator/app.py b/project/backend/coordinator/app.py index cb3f307..4b734ff 100644 --- a/project/backend/coordinator/app.py +++ b/project/backend/coordinator/app.py @@ -26,4 +26,4 @@ def health_check(): # für Docker wichtig: host='0.0.0.0' if __name__ == "__main__": - socketio.run(app, debug=True, host="0.0.0.0", port=5000) + socketio.run(app, debug=True, host="0.0.0.0", port=5050) diff --git a/project/backend/coordinator/controller/pitch_book_controller.py b/project/backend/coordinator/controller/pitch_book_controller.py index f438304..c4d7f3a 100644 --- a/project/backend/coordinator/controller/pitch_book_controller.py +++ b/project/backend/coordinator/controller/pitch_book_controller.py @@ -1,4 +1,4 @@ -from flask import Blueprint, request, jsonify, send_file +from flask import Blueprint, request, jsonify, send_file, current_app from model.database import db from model.pitch_book_model import PitchBookModel from io import BytesIO @@ -13,39 +13,46 @@ from controller.socketIO import socketio pitch_book_controller = Blueprint("pitch_books", __name__, url_prefix="/api/pitch_book") OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:5051") -def process_pdf_async(file_id, file_data, filename): - try: - socketio.emit("progress", {"id": file_id, "progress": 10}) +def process_pdf_async(app, file_id, file_data, filename): + with app.app_context(): + try: + socketio.emit("progress", {"id": file_id, "progress": 10}) - file_obj = BytesIO(file_data) - file_obj.name = filename + file_obj = BytesIO(file_data) + file_obj.name = filename - files = {'file': (filename, file_obj, 'application/pdf')} - data = {'id': file_id} + files = {'file': (filename, file_obj, 'application/pdf')} + data = {'id': file_id} - response = requests.post( - f"{OCR_SERVICE_URL}/ocr", - files=files, - data=data, - timeout=600 # 10 minute timeout - ) + response = requests.post( + f"{OCR_SERVICE_URL}/ocr", + files=files, + data=data, + timeout=600 + ) - if response.status_code == 200: - response_data = response.json() - if 'ocr_pdf' in response_data: - import base64 - ocr_pdf_data = base64.b64decode(response_data['ocr_pdf']) + if response.status_code == 200: + response_data = response.json() + if 'ocr_pdf' in response_data: + import base64 + ocr_pdf_data = base64.b64decode(response_data['ocr_pdf']) - file_record = PitchBookModel.query.get(file_id) - if file_record: - file_record.file = ocr_pdf_data - db.session.commit() - socketio.emit("progress", {"id": file_id, "progress": 50}) - else: - socketio.emit("error", {"id": file_id, "message": "OCR processing failed"}) + file_record = PitchBookModel.query.get(file_id) + if file_record: + file_record.file = ocr_pdf_data + db.session.commit() - except Exception as e: - socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"}) + print(f"[DEBUG] PDF updated in database:") + print(f"[DEBUG] - Successfully saved to database") + + socketio.emit("progress", {"id": file_id, "progress": 50}) + else: + socketio.emit("error", {"id": file_id, "message": "OCR processing failed"}) + + except Exception as e: + import traceback + traceback.print_exc() + socketio.emit("error", {"id": file_id, "message": f"Processing failed: {str(e)}"}) @pitch_book_controller.route("/", methods=["POST"]) @@ -73,9 +80,11 @@ def upload_file(): db.session.add(new_file) db.session.commit() + app = current_app._get_current_object() + processing_thread = threading.Thread( target=process_pdf_async, - args=(new_file.id, file_data, fileName), + args=(app, new_file.id, file_data, fileName), daemon=True ) processing_thread.start() @@ -145,24 +154,4 @@ def delete_file(id): db.session.delete(file) db.session.commit() - return jsonify({"message": f"File {id} deleted successfully"}), 200 - - -# endpoint to receive final results from validate service -@pitch_book_controller.route("//results", methods=["POST"]) -def receive_results(id): - try: - results = request.get_json() - print(f"Received final results for pitchbook {id}:") - print(f"Results: {results}") - - file = PitchBookModel.query.get_or_404(id) - file.kpi = str(results) - db.session.commit() - socketio.emit("progress", {"id": id, "progress": 100}) - - return jsonify({"message": "Results received successfully"}), 200 - - except Exception as e: - print(f"Error processing results for pitchbook {id}: {e}") - return jsonify({"error": str(e)}), 500 \ No newline at end of file + return jsonify({"message": f"File {id} deleted successfully"}), 200 \ No newline at end of file diff --git a/project/backend/exxetaGPT-service/app.py b/project/backend/exxetaGPT-service/app.py index 1326b24..b40ee91 100644 --- a/project/backend/exxetaGPT-service/app.py +++ b/project/backend/exxetaGPT-service/app.py @@ -24,9 +24,19 @@ def extract_text_from_ocr_json(): "entities": entities } - requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) + print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}") + print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}") + + try: + response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) + print(f"[EXXETA] Validate service response: {response.status_code}") + if response.status_code != 200: + print(f"[EXXETA] Validate service error: {response.text}") + except Exception as e: + print(f"[EXXETA] Error sending to validate service: {e}") + return jsonify("Sent to validate-service"), 200 if __name__ == "__main__": - app.run(host="0.0.0.0", port=5053) \ No newline at end of file + app.run(host="0.0.0.0", port=5053, debug=True) \ No newline at end of file diff --git a/project/backend/ocr-service/Dockerfile b/project/backend/ocr-service/Dockerfile new file mode 100644 index 0000000..1f772d4 --- /dev/null +++ b/project/backend/ocr-service/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-alpine + +WORKDIR /app + +RUN apk add --no-cache \ + gcc \ + musl-dev \ + libffi-dev \ + python3-dev \ + tesseract-ocr \ + tesseract-ocr-data-deu \ + tesseract-ocr-data-eng \ + ghostscript \ + qpdf \ + pngquant \ + leptonica \ + openjpeg \ + libgomp + +RUN pip install gunicorn + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +CMD ["gunicorn", "--timeout", "600", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"] \ No newline at end of file diff --git a/project/backend/ocr-service/app.py b/project/backend/ocr-service/app.py index b590ffe..96bc4f8 100644 --- a/project/backend/ocr-service/app.py +++ b/project/backend/ocr-service/app.py @@ -5,6 +5,11 @@ import os import tempfile import base64 from pathlib import Path +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) app = Flask(__name__) @@ -19,43 +24,64 @@ def convert_extract_text_from_pdf(): file = request.files["file"] pitchbook_id = request.form.get("id") + logger.info(f"Processing file for pitchbook_id: {pitchbook_id}") + if not pitchbook_id: return {"error": "No ID"}, 400 - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: - file.seek(0) - temp_file.write(file.read()) - temp_path = Path(temp_file.name) - ocr_path = ocr_pdf(temp_path) + try: + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: + file.seek(0) + temp_file.write(file.read()) + temp_path = Path(temp_file.name) - if not ocr_path or not ocr_path.exists(): - temp_path.unlink() # cleanup - return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500 + logger.info("Starting OCR process...") + ocr_path = ocr_pdf(temp_path) - with open(ocr_path, 'rb') as ocr_file: - ocr_pdf_data = ocr_file.read() - ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8') + if not ocr_path or not ocr_path.exists(): + temp_path.unlink() # cleanup + return {"error": "OCR processing failed - all PDFs must be OCR'd"}, 500 - ocr_file.seek(0) - result = pdf_to_json(ocr_file) + with open(ocr_path, 'rb') as ocr_file: + ocr_pdf_data = ocr_file.read() + ocr_pdf_base64 = base64.b64encode(ocr_pdf_data).decode('utf-8') - ocr_path.unlink() - temp_path.unlink() + ocr_file.seek(0) + result = pdf_to_json(ocr_file) - payload = { - "id": int(pitchbook_id), - "extracted_text_per_page": result["pages"] - } + ocr_path.unlink() + temp_path.unlink() - requests.post(EXXETA_URL, json=payload, timeout=600) - requests.post(SPACY_URL, json=payload, timeout=600) + payload = { + "id": int(pitchbook_id), + "extracted_text_per_page": result["pages"] + } - return { - "status": "sent", - "ocr_pdf": ocr_pdf_base64, - "message": "PDF successfully OCR'd and processed" - }, 200 + logger.info(f"Sending payload to EXXETA and SPACY services") + + try: + exxeta_response = requests.post(EXXETA_URL, json=payload, timeout=600) + logger.info(f"EXXETA response: {exxeta_response.status_code}") + except Exception as e: + logger.error(f"Error calling EXXETA: {e}") + + try: + spacy_response = requests.post(SPACY_URL, json=payload, timeout=600) + logger.info(f"SPACY response: {spacy_response.status_code}") + except Exception as e: + logger.error(f"Error calling SPACY: {e}") + + return { + "status": "sent", + "ocr_pdf": ocr_pdf_base64, + "message": "PDF successfully OCR'd and processed" + }, 200 + + except Exception as e: + logger.error(f"Exception in OCR processing: {str(e)}", exc_info=True) + return {"error": f"Processing failed: {str(e)}"}, 500 if __name__ == "__main__": - app.run(host="0.0.0.0", port=5051) \ No newline at end of file + logger.info("Starting OCR service on port 5000") + app.run(host="0.0.0.0", port=5000, debug=True) \ No newline at end of file diff --git a/project/backend/ocr-service/ocr_runner.py b/project/backend/ocr-service/ocr_runner.py index 8da941a..1ce9599 100644 --- a/project/backend/ocr-service/ocr_runner.py +++ b/project/backend/ocr-service/ocr_runner.py @@ -4,6 +4,11 @@ import pdfplumber import json from pathlib import Path import os +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) TEMP_DIR = Path("/tmp/ocr_processing") TEMP_DIR.mkdir(exist_ok=True) @@ -39,6 +44,7 @@ def pdf_to_json(pdf_input): raise ValueError("Invalid file type provided") except Exception as e: + logger.error(f"Failed to extract text from PDF: {str(e)}") raise Exception(f"Failed to extract text from PDF: {str(e)}") @@ -49,6 +55,16 @@ def ocr_pdf(input_file_path: Path): log_file = log_folder / f"{input_path.stem}.log" sidecar_txt = output_folder / f"{input_path.stem}.txt" + if not input_path.exists(): + logger.error(f"Input file does not exist: {input_path}") + return None + + try: + subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + logger.error("ocrmypdf is not installed or not in PATH") + return None + cmd = [ "ocrmypdf", "--force-ocr", @@ -66,15 +82,24 @@ def ocr_pdf(input_file_path: Path): result = subprocess.run(cmd, stdout=log, stderr=log, timeout=600) if result.returncode == 0: - return output_file + if output_file.exists(): + logger.info(f"OCR successful, output file size: {output_file.stat().st_size} bytes") + return output_file + else: + logger.error(f"OCR completed but output file not found: {output_file}") + return None else: + logger.error(f"OCR failed with return code: {result.returncode}") return None except subprocess.TimeoutExpired: + logger.error("OCR timed out after 600 seconds") return None - except FileNotFoundError: + except FileNotFoundError as e: + logger.error(f"File not found error: {e}") return None except Exception as e: + logger.error(f"Unexpected error in OCR: {e}", exc_info=True) return None @@ -93,4 +118,5 @@ def extract_text_to_json(pdf_path: Path): return json_path except Exception as e: + logger.error(f"Failed to extract text to JSON: {e}") return None \ No newline at end of file diff --git a/project/backend/spacy-service/app.py b/project/backend/spacy-service/app.py index 9a81cdc..503910d 100644 --- a/project/backend/spacy-service/app.py +++ b/project/backend/spacy-service/app.py @@ -24,9 +24,19 @@ def extract_pdf(): "entities": entities } - requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) + print(f"[EXXETA] Sending to validate service: {VALIDATE_SERVICE_URL}") + print(f"[EXXETA] Payload: {validate_payload} entities for pitchbook {pitchbook_id}") + + try: + response = requests.post(VALIDATE_SERVICE_URL, json=validate_payload, timeout=600) + print(f"[EXXETA] Validate service response: {response.status_code}") + if response.status_code != 200: + print(f"[EXXETA] Validate service error: {response.text}") + except Exception as e: + print(f"[EXXETA] Error sending to validate service: {e}") + return jsonify("Sent to validate-service"), 200 if __name__ == "__main__": - app.run(host="0.0.0.0", port=5052) \ No newline at end of file + app.run(host="0.0.0.0", port=5052, debug=True) \ No newline at end of file diff --git a/project/docker-compose.yml b/project/docker-compose.yml index 48a8d33..df08628 100644 --- a/project/docker-compose.yml +++ b/project/docker-compose.yml @@ -38,7 +38,7 @@ services: ocr: build: context: backend/ocr-service - dockerfile: ../../Dockerfile + dockerfile: Dockerfile env_file: - .env environment: diff --git a/project/frontend/public/example.pdf b/project/frontend/public/example.pdf deleted file mode 100644 index 759093b..0000000 Binary files a/project/frontend/public/example.pdf and /dev/null differ diff --git a/project/frontend/src/components/UploadPage.tsx b/project/frontend/src/components/UploadPage.tsx index 4373676..66fe220 100644 --- a/project/frontend/src/components/UploadPage.tsx +++ b/project/frontend/src/components/UploadPage.tsx @@ -6,7 +6,7 @@ import FileUpload from "react-material-file-upload"; import { socket } from "../socket"; import { CircularProgressWithLabel } from "./CircularProgressWithLabel"; -const PROGRESS = false; +const PROGRESS = true; export default function UploadPage() { const [files, setFiles] = useState([]); @@ -26,15 +26,14 @@ export default function UploadPage() { if (response.ok) { console.log("File uploaded successfully"); const data = await response.json(); - console.log(data); - setPageId(data.id); + setPageId(data.id.toString()); setLoadingState(0); !PROGRESS && - navigate({ - to: "/extractedResult/$pitchBook", - params: { pitchBook: data.id }, - }); + navigate({ + to: "/extractedResult/$pitchBook", + params: { pitchBook: data.id.toString() }, + }); } else { console.error("Failed to upload file"); } @@ -46,12 +45,12 @@ export default function UploadPage() { const onProgress = useCallback( (progress: { id: number; progress: number }) => { - console.log("Progress:", progress); - console.log(pageId); - if (Number(pageId) === progress.id) { + if (pageId === progress.id.toString()) { setLoadingState(progress.progress); if (progress.progress === 100) { + setPageId(null); + setLoadingState(null); navigate({ to: "/extractedResult/$pitchBook", params: { pitchBook: progress.id.toString() }, @@ -71,12 +70,19 @@ export default function UploadPage() { }; }, [onConnection, onProgress]); + useEffect(() => { + return () => { + setPageId(null); + setLoadingState(null); + }; + }, []); + return ( <> {PROGRESS && ( ({ color: "#fff", zIndex: theme.zIndex.drawer + 1 })} - open={pageId !== null && loadingState !== null} + open={pageId !== null && loadingState !== null && loadingState < 100} > ); -} +} \ No newline at end of file diff --git a/project/frontend/src/components/pdfViewer.tsx b/project/frontend/src/components/pdfViewer.tsx index 87e1edc..85034ab 100644 --- a/project/frontend/src/components/pdfViewer.tsx +++ b/project/frontend/src/components/pdfViewer.tsx @@ -9,6 +9,7 @@ import { Box, IconButton } from "@mui/material"; interface PDFViewerProps { pitchBookId: string; } + export default function PDFViewer({ pitchBookId }: PDFViewerProps) { const [numPages, setNumPages] = useState(null); const [pageNumber, setPageNumber] = useState(1); @@ -90,4 +91,4 @@ export default function PDFViewer({ pitchBookId }: PDFViewerProps) { ); -} +} \ No newline at end of file