"""OCR helpers: run ``ocrmypdf`` on a PDF and extract per-page text as JSON."""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

import pdfplumber

logger = logging.getLogger(__name__)

# Working directories are created at import time; OCR output (PDF, sidecar
# text, JSON) goes to ``output_folder`` and ocrmypdf logs to ``log_folder``.
TEMP_DIR = Path("/tmp/ocr_processing")
TEMP_DIR.mkdir(parents=True, exist_ok=True)

output_folder = TEMP_DIR / "output"
log_folder = TEMP_DIR / "logs"
output_folder.mkdir(exist_ok=True)
log_folder.mkdir(exist_ok=True)


def _extract_pages(source):
    """Return ``[{"page": n, "text": str}, ...]`` for every page of *source*.

    *source* is anything ``pdfplumber.open`` accepts (a path or a binary
    file-like object). Page numbers are 1-based; a page for which
    ``extract_text()`` returns nothing yields an empty string.
    """
    with pdfplumber.open(source) as pdf:
        return [
            {"page": number, "text": (page.extract_text() or "").strip()}
            for number, page in enumerate(pdf.pages, start=1)
        ]


def pdf_to_json(pdf_input):
    """Extract the text of every page of a PDF.

    Args:
        pdf_input: Either a file-like object (anything with a ``read``
            attribute; it is rewound with ``seek(0)`` first) or a ``str`` /
            ``Path`` pointing at a PDF file.

    Returns:
        ``{"pages": [{"page": 1, "text": "..."}, ...]}``.

    Raises:
        Exception: On any failure, including an unsupported *pdf_input*
            type. The original error is attached as ``__cause__``.
    """
    try:
        if hasattr(pdf_input, "read"):
            # Rewind in case the caller already consumed the stream.
            pdf_input.seek(0)
            source = pdf_input
        elif isinstance(pdf_input, (str, Path)):
            source = Path(pdf_input)
        else:
            raise ValueError("Invalid file type provided")
        return {"pages": _extract_pages(source)}
    except Exception as e:
        # Message kept unchanged for callers that match on it; chaining
        # preserves the real traceback for debugging.
        raise Exception(f"Failed to extract text from PDF: {e}") from e


def ocr_pdf(
    input_file_path: Path,
    *,
    language: str = "deu+eng",
    timeout: float = 600,
) -> Optional[Path]:
    """Run ``ocrmypdf`` on a PDF and return the path of the OCRed copy.

    The command is invoked with ``--force-ocr`` and ``--output-type pdfa``.
    It writes ``<stem>-OCR.pdf`` and a ``<stem>.txt`` sidecar into
    ``output_folder``; its stdout and stderr are redirected to
    ``<stem>.log`` in ``log_folder``.

    Args:
        input_file_path: Path of the PDF to process.
        language: Value passed to ``--language``.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        The output PDF path when ``ocrmypdf`` exits with code 0, otherwise
        ``None``. This function never raises; failures are logged.
    """
    try:
        input_path = Path(input_file_path)
        output_file = output_folder / f"{input_path.stem}-OCR.pdf"
        log_file = log_folder / f"{input_path.stem}.log"
        sidecar_txt = output_folder / f"{input_path.stem}.txt"

        cmd = [
            "ocrmypdf",
            "--force-ocr",
            "--output-type", "pdfa",
            "--language", language,
            "--sidecar", str(sidecar_txt),
            str(input_path),
            str(output_file),
        ]

        with open(log_file, "w") as log:
            result = subprocess.run(
                cmd, stdout=log, stderr=log, timeout=timeout
            )
    except subprocess.TimeoutExpired:
        logger.error(
            "ocrmypdf timed out after %s seconds for %s",
            timeout,
            input_file_path,
        )
        return None
    except FileNotFoundError as err:
        # Raised when the ocrmypdf executable is missing, but also when the
        # log file cannot be created (e.g. the log directory was removed).
        logger.error("OCR could not start for %s: %s", input_file_path, err)
        return None
    except Exception:
        # Best-effort contract: callers expect None rather than an exception.
        logger.exception("Unexpected error while running OCR on %s",
                         input_file_path)
        return None

    if result.returncode != 0:
        logger.error(
            "ocrmypdf exited with code %s for %s (see %s)",
            result.returncode,
            input_path,
            log_file,
        )
        return None
    return output_file


def extract_text_to_json(pdf_path: Path) -> Optional[Path]:
    """Extract per-page text from a PDF and write it to a JSON file.

    The JSON file is ``<stem>.json`` inside ``output_folder`` and contains a
    list of ``{"page": n, "text": "..."}`` objects (UTF-8, indented,
    non-ASCII characters kept as-is).

    Args:
        pdf_path: Path of the PDF to read; a ``str`` is accepted as well.

    Returns:
        The path of the written JSON file, or ``None`` if extraction or
        writing failed. This function never raises; failures are logged.
    """
    try:
        # Coerce so that a plain string path works instead of failing on
        # ``.stem`` and being reported as a generic failure.
        pdf_path = Path(pdf_path)
        json_path = output_folder / f"{pdf_path.stem}.json"

        pages = _extract_pages(pdf_path)

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, indent=2, ensure_ascii=False)

        return json_path
    except Exception:
        # Best-effort contract: callers expect None rather than an exception.
        logger.exception("Failed to extract text to JSON from %s", pdf_path)
        return None