"""Helpers for OCR-ing PDFs with ``ocrmypdf`` and extracting text with pdfplumber."""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

import pdfplumber

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Working directories for OCR output and per-file ocrmypdf logs.
TEMP_DIR = Path("/tmp/ocr_processing")
output_folder = TEMP_DIR / "output"
log_folder = TEMP_DIR / "logs"

# parents=True so importing the module does not fail when a parent directory
# is missing.
TEMP_DIR.mkdir(parents=True, exist_ok=True)
output_folder.mkdir(parents=True, exist_ok=True)
log_folder.mkdir(parents=True, exist_ok=True)


def _extract_pages(pdf_source):
    """Return one ``{"page": n, "text": str}`` dict per page of *pdf_source*.

    *pdf_source* is handed to ``pdfplumber.open`` unchanged. Page numbers are
    1-based. A page for which ``extract_text()`` returns a falsy value
    contributes an empty string; text is stripped of surrounding whitespace.
    """
    with pdfplumber.open(pdf_source) as pdf:
        return [
            {"page": number, "text": (page.extract_text() or "").strip()}
            for number, page in enumerate(pdf.pages, start=1)
        ]


def pdf_to_json(pdf_input):
    """Extract the text of every page of a PDF.

    Args:
        pdf_input: Either an object with a ``read`` attribute (it is rewound
            with ``seek(0)`` before parsing) or a ``str`` / ``Path`` pointing
            to a PDF file.

    Returns:
        ``{"pages": [{"page": 1, "text": "..."}, ...]}``.

    Raises:
        Exception: If the input type is unsupported or extraction fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        if hasattr(pdf_input, "read"):
            # Rewind in case the caller has already consumed the stream.
            pdf_input.seek(0)
            source = pdf_input
        elif isinstance(pdf_input, (str, Path)):
            source = Path(pdf_input)
        else:
            raise ValueError("Invalid file type provided")
        return {"pages": _extract_pages(source)}
    except Exception as e:
        logger.error("Failed to extract text from PDF: %s", e)
        # Keep the generic Exception type callers already catch, but chain the
        # cause so the original traceback is not lost.
        raise Exception(f"Failed to extract text from PDF: {str(e)}") from e


def ocr_pdf(
    input_file_path: Path,
    *,
    language: str = "deu+eng",
    timeout: int = 600,
) -> Optional[Path]:
    """Run ``ocrmypdf`` on a PDF and return the path of the OCR'd copy.

    The output is written to ``output_folder`` as ``<stem>-OCR.pdf`` together
    with a ``<stem>.txt`` sidecar; ocrmypdf's stdout and stderr are redirected
    to ``<stem>.log`` in ``log_folder``.

    Args:
        input_file_path: Path (``str`` or ``Path``) of the PDF to process.
        language: Value passed to ocrmypdf's ``--language`` option.
        timeout: Maximum number of seconds the ocrmypdf run may take.

    Returns:
        The output PDF path on success, or ``None`` if the input is missing,
        ocrmypdf is unavailable, the run fails or times out, or the output
        file was not produced. Failures are logged, never raised.
    """
    try:
        input_path = Path(input_file_path)
        output_file = output_folder / f"{input_path.stem}-OCR.pdf"
        log_file = log_folder / f"{input_path.stem}.log"
        sidecar_txt = output_folder / f"{input_path.stem}.txt"

        if not input_path.exists():
            logger.error(f"Input file does not exist: {input_path}")
            return None

        # Probe for the executable first so a missing install is reported
        # clearly instead of as a generic failure of the real run.
        try:
            subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.error("ocrmypdf is not installed or not in PATH")
            return None

        cmd = [
            "ocrmypdf",
            "--force-ocr",
            "--output-type",
            "pdfa",
            "--language",
            language,
            "--sidecar",
            str(sidecar_txt),
            str(input_path),
            str(output_file),
        ]

        with open(log_file, "w") as log:
            result = subprocess.run(cmd, stdout=log, stderr=log, timeout=timeout)

        if result.returncode != 0:
            logger.error(f"OCR failed with return code: {result.returncode}")
            return None

        if not output_file.exists():
            logger.error(f"OCR completed but output file not found: {output_file}")
            return None

        logger.info(
            f"OCR successful, output file size: {output_file.stat().st_size} bytes"
        )
        return output_file
    except subprocess.TimeoutExpired:
        logger.error(f"OCR timed out after {timeout} seconds")
        return None
    except FileNotFoundError as e:
        logger.error(f"File not found error: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error in OCR: {e}", exc_info=True)
        return None


def extract_text_to_json(pdf_path: Path) -> Optional[Path]:
    """Extract per-page text from a PDF and write it to a JSON file.

    The JSON file is ``<stem>.json`` inside ``output_folder`` and contains a
    list of ``{"page": n, "text": str}`` objects (UTF-8, indented by 2).

    Args:
        pdf_path: Path of the PDF to read; a plain ``str`` is accepted too.

    Returns:
        The path of the written JSON file, or ``None`` if anything failed
        (the error is logged).
    """
    try:
        # Coerce so a str path works instead of failing on ``.stem``.
        pdf_path = Path(pdf_path)
        json_path = output_folder / f"{pdf_path.stem}.json"
        pages = _extract_pages(pdf_path)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, indent=2, ensure_ascii=False)
        return json_path
    except Exception as e:
        logger.error(f"Failed to extract text to JSON: {e}")
        return None