96 lines
2.6 KiB
Python
96 lines
2.6 KiB
Python
import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path

import pdfplumber
|
|
|
|
# Root working directory plus the two sub-directories that the functions
# in this module write their results and per-file logs into.
TEMP_DIR = Path("/tmp/ocr_processing")
output_folder = TEMP_DIR.joinpath("output")
log_folder = TEMP_DIR.joinpath("logs")

# Root first: mkdir() is called without parents=True, so the children can
# only be created once TEMP_DIR itself exists.
for _directory in (TEMP_DIR, output_folder, log_folder):
    _directory.mkdir(exist_ok=True)
del _directory  # keep the loop variable out of the module namespace
|
|
|
|
def pdf_to_json(pdf_input):
    """Extract the text of every page of a PDF into a JSON-ready dict.

    Args:
        pdf_input: Either an object with a ``read`` attribute (it is rewound
            with ``seek(0)`` before being handed to pdfplumber) or a ``str``
            / ``Path`` pointing at a PDF file.

    Returns:
        ``{"pages": [{"page": 1, "text": "..."}, ...]}`` with 1-based page
        numbers. A page whose extraction yields a falsy value (e.g. ``None``)
        is reported with an empty string; text is stripped of surrounding
        whitespace.

    Raises:
        Exception: For any failure, including an unsupported ``pdf_input``
            type. The message is prefixed with
            ``"Failed to extract text from PDF: "`` and the original error
            is attached as ``__cause__``.
    """
    try:
        # Resolve the input to something pdfplumber can open, so that the
        # extraction itself is written only once.
        if hasattr(pdf_input, "read"):
            pdf_input.seek(0)  # the caller may already have consumed the stream
            source = pdf_input
        elif isinstance(pdf_input, (str, Path)):
            source = Path(pdf_input)
        else:
            raise ValueError("Invalid file type provided")

        with pdfplumber.open(source) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]
        return {"pages": pages}
    except Exception as e:
        # Keep the historical generic exception type and message, but chain
        # the root cause explicitly so it is not lost to callers.
        raise Exception(f"Failed to extract text from PDF: {e}") from e
|
|
|
|
|
|
def ocr_pdf(input_file_path: Path, *, language: str = "deu+eng", timeout: float = 600):
    """Run the ``ocrmypdf`` command on a PDF and return the OCR'd copy's path.

    For an input whose stem is ``<stem>`` the command is asked to write the
    PDF/A result to ``output_folder/<stem>-OCR.pdf`` and a sidecar text file
    to ``output_folder/<stem>.txt``. The command's stdout and stderr are
    redirected to ``log_folder/<stem>.log``.

    Args:
        input_file_path: Path (or path string) of the PDF to process.
        language: Value passed to ``--language`` (default ``"deu+eng"``).
        timeout: Seconds to wait for the command before giving up
            (default 600).

    Returns:
        The output PDF path when the command exits with status 0, otherwise
        ``None``. Failures are never raised to the caller; the reason is
        reported through the module logger instead.
    """
    logger = logging.getLogger(__name__)

    try:
        input_path = Path(input_file_path)
        output_file = output_folder / f"{input_path.stem}-OCR.pdf"
        log_file = log_folder / f"{input_path.stem}.log"
        sidecar_txt = output_folder / f"{input_path.stem}.txt"

        cmd = [
            "ocrmypdf",
            "--force-ocr",
            "--output-type",
            "pdfa",
            "--language",
            language,
            "--sidecar",
            str(sidecar_txt),
            str(input_path),
            str(output_file),
        ]

        # Both streams of the child process go to the per-file log.
        with open(log_file, "w") as log:
            result = subprocess.run(cmd, stdout=log, stderr=log, timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.error(
            "ocrmypdf timed out after %s seconds for %s", timeout, input_file_path
        )
        return None
    except FileNotFoundError as exc:
        # Raised when the ocrmypdf executable is not on PATH, or when the
        # log file cannot be created because its folder is missing.
        logger.error("OCR could not be started for %s: %s", input_file_path, exc)
        return None
    except Exception:
        # Best-effort contract: report the failure, never propagate it.
        logger.exception("Unexpected error while running OCR on %s", input_file_path)
        return None

    if result.returncode != 0:
        logger.error(
            "ocrmypdf exited with status %s for %s (see %s)",
            result.returncode,
            input_file_path,
            log_file,
        )
        return None
    return output_file
|
|
|
|
|
|
def extract_text_to_json(pdf_path: Path):
    """Extract each page's text from a PDF and save it as a JSON file.

    The file ``output_folder/<stem>.json`` receives a JSON array of
    ``{"page": n, "text": "..."}`` objects (1-based page numbers, text
    stripped, empty string when a page yields no text). It is written as
    UTF-8 with two-space indentation and non-ASCII characters left unescaped.

    Args:
        pdf_path: Path (or path string) of the PDF to read.

    Returns:
        The path of the JSON file, or ``None`` if anything went wrong. The
        failure is reported through the module logger rather than raised.
    """
    try:
        # Accept plain path strings as well; a bare str has no ``.stem``.
        pdf_path = Path(pdf_path)
        json_path = output_folder / f"{pdf_path.stem}.json"

        with pdfplumber.open(pdf_path) as pdf:
            pages = [
                {"page": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(pdf.pages, start=1)
            ]

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(pages, f, indent=2, ensure_ascii=False)
        return json_path
    except Exception:
        # Best-effort contract: callers get None, but the cause is recorded.
        logging.getLogger(__name__).exception(
            "Failed to extract text from %s to JSON", pdf_path
        )
        return None