# pse2_ff/project/backend/ocr-service/ocr_runner.py
#
# 96 lines
# 2.6 KiB
# Python

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path

import pdfplumber
# Scratch area used by the OCR pipeline: OCR results (PDF, sidecar text,
# JSON) go to ``output_folder``; per-document ocrmypdf logs go to
# ``log_folder``.
TEMP_DIR = Path("/tmp/ocr_processing")
output_folder = TEMP_DIR / "output"
log_folder = TEMP_DIR / "logs"

# The parent must be created before its children, hence the tuple order.
for _scratch_dir in (TEMP_DIR, output_folder, log_folder):
    _scratch_dir.mkdir(exist_ok=True)
def pdf_to_json(pdf_input):
    """Extract the text of every page of a PDF into a JSON-serialisable dict.

    Args:
        pdf_input: Either a file-like object (anything exposing ``read``;
            it is rewound with ``seek(0)`` before parsing) or a filesystem
            path given as ``str`` or ``pathlib.Path``.

    Returns:
        A dict of the form
        ``{"pages": [{"page": <1-based number>, "text": <stripped text>}]}``.
        A page for which ``extract_text()`` yields nothing gets ``""``.

    Raises:
        Exception: If the input type is unsupported or the PDF cannot be
            opened or parsed. The underlying error is attached as
            ``__cause__``.
    """
    try:
        # Normalise the two supported input kinds into one source so the
        # extraction logic exists only once.
        if hasattr(pdf_input, "read"):
            # The stream may already have been consumed by the caller.
            pdf_input.seek(0)
            source = pdf_input
        elif isinstance(pdf_input, (str, Path)):
            source = Path(pdf_input)
        else:
            raise ValueError("Invalid file type provided")

        with pdfplumber.open(source) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]
        return {"pages": pages}
    except Exception as e:
        # Same exception type and message as before; chaining keeps the
        # original traceback available to callers and logs.
        raise Exception(f"Failed to extract text from PDF: {e}") from e
def ocr_pdf(input_file_path: Path, language: str = "deu+eng", timeout: float = 600):
    """Run ``ocrmypdf`` on a PDF and return the path of the OCRed copy.

    The command is invoked with ``--force-ocr`` and ``--output-type pdfa``.
    It writes ``<stem>-OCR.pdf`` and a sidecar ``<stem>.txt`` into
    ``output_folder``; its stdout and stderr are captured in
    ``log_folder/<stem>.log``.

    Args:
        input_file_path: Path of the PDF to process.
        language: Value passed to ``--language``. Defaults to the
            previously hard-coded ``"deu+eng"``.
        timeout: Seconds to wait for ``ocrmypdf`` before giving up.
            Defaults to the previously hard-coded 600.

    Returns:
        The output PDF path when ``ocrmypdf`` exits with code 0, otherwise
        ``None``. This function never raises; every failure is logged.
    """
    logger = logging.getLogger(__name__)
    try:
        input_path = Path(input_file_path)
        output_file = output_folder / f"{input_path.stem}-OCR.pdf"
        log_file = log_folder / f"{input_path.stem}.log"
        sidecar_txt = output_folder / f"{input_path.stem}.txt"
        cmd = [
            "ocrmypdf",
            "--force-ocr",
            "--output-type",
            "pdfa",
            "--language",
            language,
            "--sidecar",
            str(sidecar_txt),
            str(input_path),
            str(output_file),
        ]
        with open(log_file, "w") as log:
            result = subprocess.run(cmd, stdout=log, stderr=log, timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.error(
            "ocrmypdf timed out after %s seconds for %s", timeout, input_file_path
        )
        return None
    except FileNotFoundError:
        # Raised when the ocrmypdf executable is not installed, or when the
        # log directory has disappeared.
        logger.error(
            "ocrmypdf executable or log directory not found while processing %s",
            input_file_path,
        )
        return None
    except Exception:
        # Callers rely on None rather than an exception, so keep the broad
        # catch but record the traceback instead of discarding it.
        logger.exception("Unexpected error while running OCR on %s", input_file_path)
        return None

    if result.returncode != 0:
        logger.error(
            "ocrmypdf exited with code %s for %s (see %s)",
            result.returncode,
            input_path,
            log_file,
        )
        return None
    return output_file
def extract_text_to_json(pdf_path: Path):
    """Dump the per-page text of a PDF to ``output_folder/<stem>.json``.

    The JSON file contains a list of ``{"page": <1-based number>,
    "text": <stripped text>}`` objects, written as UTF-8 with
    ``indent=2`` and without ASCII escaping.

    Args:
        pdf_path: Path of the PDF to read. A plain ``str`` is accepted
            and converted to ``pathlib.Path``.

    Returns:
        The path of the written JSON file, or ``None`` if reading the PDF
        or writing the file failed. The failure is logged, not raised.
    """
    try:
        # Coerce first so a str argument no longer fails on ``.stem``.
        pdf_path = Path(pdf_path)
        json_path = output_folder / f"{pdf_path.stem}.json"
        with pdfplumber.open(pdf_path) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, indent=2, ensure_ascii=False)
        return json_path
    except Exception:
        # Best-effort contract: callers get None, but the reason is
        # recorded instead of being silently dropped.
        logging.getLogger(__name__).exception(
            "Failed to extract text from %s to JSON", pdf_path
        )
        return None