# pse2_ff/project/backend/ocr-service/ocr_runner.py
#
# 122 lines
# 3.8 KiB
# Python

import subprocess
import tempfile
import pdfplumber
import json
from pathlib import Path
import os
import logging
# Set up logging. basicConfig configures the root logger at INFO level and is
# a no-op when the root logger already has handlers installed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Working directories for OCR artefacts: OCR'd PDFs, sidecar text files and
# extracted JSON are written to ``output_folder``; per-document ocrmypdf logs
# are written to ``log_folder``.
TEMP_DIR = Path("/tmp/ocr_processing")
output_folder = TEMP_DIR / "output"
log_folder = TEMP_DIR / "logs"

# parents=True keeps the import from failing when an ancestor directory is
# missing; exist_ok=True makes repeated imports harmless.
for _directory in (TEMP_DIR, output_folder, log_folder):
    _directory.mkdir(parents=True, exist_ok=True)
def pdf_to_json(pdf_input):
    """Extract the text of every page of a PDF into a JSON-serialisable dict.

    Args:
        pdf_input: Either a file-like object exposing ``read`` (it is rewound
            to the start before parsing) or a filesystem path given as
            ``str`` or ``Path``.

    Returns:
        A dict of the form
        ``{"pages": [{"page": <1-based number>, "text": <stripped text>}, ...]}``.
        A page for which no text is extracted contributes an empty string.

    Raises:
        Exception: If ``pdf_input`` is of an unsupported type, or the PDF
            cannot be opened or parsed. The underlying error is attached as
            ``__cause__``.
    """
    try:
        if hasattr(pdf_input, "read"):
            # The stream position may not be at the start; rewind so the
            # whole document is parsed.
            pdf_input.seek(0)
            source = pdf_input
        elif isinstance(pdf_input, (str, Path)):
            source = Path(pdf_input)
        else:
            raise ValueError("Invalid file type provided")

        # Single extraction path shared by both input kinds.
        with pdfplumber.open(source) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]
        return {"pages": pages}
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {str(e)}")
        # Keep the generic Exception type and message for backward
        # compatibility, but chain the cause so the traceback is preserved.
        raise Exception(f"Failed to extract text from PDF: {str(e)}") from e
def ocr_pdf(input_file_path: Path):
    """Run ``ocrmypdf`` on a PDF and return the path of the OCR'd copy.

    The command is invoked with ``--force-ocr``, PDF/A output
    (``--output-type pdfa``), languages ``deu+eng`` and a ``--sidecar`` text
    file. Its stdout and stderr are redirected to a per-document log file in
    ``log_folder``; the OCR'd PDF and the sidecar are written to
    ``output_folder``. The run is limited to 600 seconds.

    Args:
        input_file_path: Location of the PDF to process.

    Returns:
        The ``Path`` of the generated ``<stem>-OCR.pdf`` on success, or
        ``None`` when the input is missing, ``ocrmypdf`` is unavailable, the
        run fails or times out, or any other error occurs (each case is
        logged).
    """
    try:
        source_pdf = Path(input_file_path)
        stem = source_pdf.stem
        ocr_output = output_folder / f"{stem}-OCR.pdf"
        run_log = log_folder / f"{stem}.log"
        sidecar = output_folder / f"{stem}.txt"

        if not source_pdf.exists():
            logger.error(f"Input file does not exist: {source_pdf}")
            return None

        # Probe the executable first so a missing install is reported
        # distinctly from a failed OCR run.
        try:
            subprocess.run(["ocrmypdf", "--version"], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.error("ocrmypdf is not installed or not in PATH")
            return None

        command = ["ocrmypdf", "--force-ocr"]
        command += ["--output-type", "pdfa"]
        command += ["--language", "deu+eng"]
        command += ["--sidecar", str(sidecar)]
        command += [str(source_pdf), str(ocr_output)]

        with open(run_log, "w") as log_stream:
            completed = subprocess.run(
                command, stdout=log_stream, stderr=log_stream, timeout=600
            )

        if completed.returncode != 0:
            logger.error(f"OCR failed with return code: {completed.returncode}")
            return None

        if not ocr_output.exists():
            logger.error(f"OCR completed but output file not found: {ocr_output}")
            return None

        logger.info(f"OCR successful, output file size: {ocr_output.stat().st_size} bytes")
        return ocr_output
    except subprocess.TimeoutExpired:
        logger.error("OCR timed out after 600 seconds")
        return None
    except FileNotFoundError as e:
        logger.error(f"File not found error: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error in OCR: {e}", exc_info=True)
        return None
def extract_text_to_json(pdf_path: Path):
    """Extract per-page text from a PDF and write it to a JSON file.

    The JSON file is written to ``output_folder`` as ``<stem>.json`` (UTF-8,
    indented, non-ASCII characters kept as-is). Its content is a list of
    ``{"page": <1-based number>, "text": <stripped text>}`` objects; a page
    for which no text is extracted contributes an empty string.

    Args:
        pdf_path: Location of the PDF, as ``Path`` or ``str``.

    Returns:
        The ``Path`` of the written JSON file, or ``None`` if extraction or
        writing fails (the error is logged).
    """
    try:
        # Coerce to Path so string paths work too, as ocr_pdf already allows;
        # previously a str failed on ``.stem`` and returned None.
        pdf_path = Path(pdf_path)
        json_path = output_folder / f"{pdf_path.stem}.json"

        with pdfplumber.open(pdf_path) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, indent=2, ensure_ascii=False)
        return json_path
    except Exception as e:
        # Best-effort contract: report the failure with its traceback and
        # signal it to the caller via None rather than raising.
        logger.error(f"Failed to extract text to JSON: {e}", exc_info=True)
        return None