Recommit: Fix OCR and add PDF tools

2025-04-21 14:57:24 +02:00 · 2025-04-21 14:57:24 +02:00 · 4f1c8ea005
parent ae52759871
commit 4f1c8ea005
18 changed files with 275 additions and 0 deletions
--- a/prototypes/ocr_demo/logs/Pitchbook
+++ b/prototypes/ocr_demo/logs/Pitchbook
@ -0,0 +1,60 @@
+
+Start processing 10 pages concurrently
+    2 page already has text! - rasterizing text and running OCR anyway
+    3 page already has text! - rasterizing text and running OCR anyway
+    4 page already has text! - rasterizing text and running OCR anyway
+    5 page already has text! - rasterizing text and running OCR anyway
+    6 page already has text! - rasterizing text and running OCR anyway
+    7 page already has text! - rasterizing text and running OCR anyway
+    8 page already has text! - rasterizing text and running OCR anyway
+    9 page already has text! - rasterizing text and running OCR anyway
+   10 page already has text! - rasterizing text and running OCR anyway
+   11 page already has text! - rasterizing text and running OCR anyway
+   12 page already has text! - rasterizing text and running OCR anyway
+   13 page already has text! - rasterizing text and running OCR anyway
+   14 page already has text! - rasterizing text and running OCR anyway
+   15 page already has text! - rasterizing text and running OCR anyway
+   16 page already has text! - rasterizing text and running OCR anyway
+   17 page already has text! - rasterizing text and running OCR anyway
+   18 page already has text! - rasterizing text and running OCR anyway
+   11 [tesseract] lots of diacritics - possibly poor OCR
+   19 page already has text! - rasterizing text and running OCR anyway
+   20 page already has text! - rasterizing text and running OCR anyway
+   21 page already has text! - rasterizing text and running OCR anyway
+   22 page already has text! - rasterizing text and running OCR anyway
+   23 page already has text! - rasterizing text and running OCR anyway
+   24 page already has text! - rasterizing text and running OCR anyway
+   25 page already has text! - rasterizing text and running OCR anyway
+   26 page already has text! - rasterizing text and running OCR anyway
+   27 page already has text! - rasterizing text and running OCR anyway
+   28 page already has text! - rasterizing text and running OCR anyway
+   29 page already has text! - rasterizing text and running OCR anyway
+   30 page already has text! - rasterizing text and running OCR anyway
+   31 page already has text! - rasterizing text and running OCR anyway
+   20 [tesseract] lots of diacritics - possibly poor OCR
+   32 page already has text! - rasterizing text and running OCR anyway
+   33 page already has text! - rasterizing text and running OCR anyway
+   34 page already has text! - rasterizing text and running OCR anyway
+   35 page already has text! - rasterizing text and running OCR anyway
+   36 page already has text! - rasterizing text and running OCR anyway
+   26 [tesseract] lots of diacritics - possibly poor OCR
+   37 page already has text! - rasterizing text and running OCR anyway
+   31 [tesseract] lots of diacritics - possibly poor OCR
+   38 page already has text! - rasterizing text and running OCR anyway
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.64 savings: 39.0%
+Total file size ratio: 0.59 savings: -68.4%
+Output file is a PDF/A-2B (as expected)
+The output file size is 1.68× larger than the input file.
+Possible reasons for this include:
+--force-ocr was issued, causing transcoding.
+The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
+The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.
+PDF/A conversion was enabled. (Try `--output-type pdf`.)
+
--- a/prototypes/ocr_demo/logs/Pitchbook
+++ b/prototypes/ocr_demo/logs/Pitchbook
@ -0,0 +1,17 @@
+
+Start processing 10 pages concurrently
+   12 [tesseract] lots of diacritics - possibly poor OCR
+   15 [tesseract] lots of diacritics - possibly poor OCR
+   37 [tesseract] lots of diacritics - possibly poor OCR
+   47 [tesseract] lots of diacritics - possibly poor OCR
+   49 [tesseract] lots of diacritics - possibly poor OCR
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.26 savings: 20.7%
+Total file size ratio: 0.91 savings: -9.6%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Pitchbook
+++ b/prototypes/ocr_demo/logs/Pitchbook
@ -0,0 +1,13 @@
+
+Start processing 10 pages concurrently
+   21 [tesseract] lots of diacritics - possibly poor OCR
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.21 savings: 17.6%
+Total file size ratio: 0.97 savings: -3.3%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Teaser
+++ b/prototypes/ocr_demo/logs/Teaser
@ -0,0 +1,53 @@
+
+Start processing 10 pages concurrently
+    1 page already has text! - rasterizing text and running OCR anyway
+    2 page already has text! - rasterizing text and running OCR anyway
+    3 page already has text! - rasterizing text and running OCR anyway
+    4 page already has text! - rasterizing text and running OCR anyway
+    5 page already has text! - rasterizing text and running OCR anyway
+    6 page already has text! - rasterizing text and running OCR anyway
+    7 page already has text! - rasterizing text and running OCR anyway
+    8 page already has text! - rasterizing text and running OCR anyway
+    9 page already has text! - rasterizing text and running OCR anyway
+   10 page already has text! - rasterizing text and running OCR anyway
+   11 page already has text! - rasterizing text and running OCR anyway
+   12 page already has text! - rasterizing text and running OCR anyway
+   13 page already has text! - rasterizing text and running OCR anyway
+   14 page already has text! - rasterizing text and running OCR anyway
+   15 page already has text! - rasterizing text and running OCR anyway
+   16 page already has text! - rasterizing text and running OCR anyway
+   17 page already has text! - rasterizing text and running OCR anyway
+   18 page already has text! - rasterizing text and running OCR anyway
+   19 page already has text! - rasterizing text and running OCR anyway
+   20 page already has text! - rasterizing text and running OCR anyway
+    1 [tesseract] lots of diacritics - possibly poor OCR
+   21 page already has text! - rasterizing text and running OCR anyway
+   22 page already has text! - rasterizing text and running OCR anyway
+   23 page already has text! - rasterizing text and running OCR anyway
+   24 page already has text! - rasterizing text and running OCR anyway
+   25 page already has text! - rasterizing text and running OCR anyway
+   26 page already has text! - rasterizing text and running OCR anyway
+   27 page already has text! - rasterizing text and running OCR anyway
+   28 page already has text! - rasterizing text and running OCR anyway
+   29 page already has text! - rasterizing text and running OCR anyway
+   30 page already has text! - rasterizing text and running OCR anyway
+   31 page already has text! - rasterizing text and running OCR anyway
+   32 page already has text! - rasterizing text and running OCR anyway
+   26 [tesseract] lots of diacritics - possibly poor OCR
+   33 page already has text! - rasterizing text and running OCR anyway
+   21 [tesseract] lots of diacritics - possibly poor OCR
+   35 page already has text! - rasterizing text and running OCR anyway
+   36 page already has text! - rasterizing text and running OCR anyway
+   37 page already has text! - rasterizing text and running OCR anyway
+   38 page already has text! - rasterizing text and running OCR anyway
+   38 [tesseract] lots of diacritics - possibly poor OCR
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.62 savings: 38.4%
+Total file size ratio: 0.83 savings: -20.3%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Teaser
+++ b/prototypes/ocr_demo/logs/Teaser
@ -0,0 +1,48 @@
+
+Start processing 10 pages concurrently
+    2 page already has text! - rasterizing text and running OCR anyway
+    3 page already has text! - rasterizing text and running OCR anyway
+    4 page already has text! - rasterizing text and running OCR anyway
+    5 page already has text! - rasterizing text and running OCR anyway
+    6 page already has text! - rasterizing text and running OCR anyway
+    7 page already has text! - rasterizing text and running OCR anyway
+    8 page already has text! - rasterizing text and running OCR anyway
+    9 page already has text! - rasterizing text and running OCR anyway
+   10 page already has text! - rasterizing text and running OCR anyway
+   11 page already has text! - rasterizing text and running OCR anyway
+   12 page already has text! - rasterizing text and running OCR anyway
+   13 page already has text! - rasterizing text and running OCR anyway
+   14 page already has text! - rasterizing text and running OCR anyway
+   15 page already has text! - rasterizing text and running OCR anyway
+   16 page already has text! - rasterizing text and running OCR anyway
+   17 page already has text! - rasterizing text and running OCR anyway
+   18 page already has text! - rasterizing text and running OCR anyway
+   19 page already has text! - rasterizing text and running OCR anyway
+   20 page already has text! - rasterizing text and running OCR anyway
+   21 page already has text! - rasterizing text and running OCR anyway
+   22 page already has text! - rasterizing text and running OCR anyway
+   23 page already has text! - rasterizing text and running OCR anyway
+   24 page already has text! - rasterizing text and running OCR anyway
+   25 page already has text! - rasterizing text and running OCR anyway
+   26 page already has text! - rasterizing text and running OCR anyway
+   27 page already has text! - rasterizing text and running OCR anyway
+   28 page already has text! - rasterizing text and running OCR anyway
+   29 page already has text! - rasterizing text and running OCR anyway
+   30 page already has text! - rasterizing text and running OCR anyway
+   31 page already has text! - rasterizing text and running OCR anyway
+   32 page already has text! - rasterizing text and running OCR anyway
+   33 page already has text! - rasterizing text and running OCR anyway
+   34 page already has text! - rasterizing text and running OCR anyway
+   35 page already has text! - rasterizing text and running OCR anyway
+   36 page already has text! - rasterizing text and running OCR anyway
+   37 page already has text! - rasterizing text and running OCR anyway
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.54 savings: 35.2%
+Total file size ratio: 0.89 savings: -12.2%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Teaser
+++ b/prototypes/ocr_demo/logs/Teaser
@ -0,0 +1,12 @@
+
+Start processing 10 pages concurrently
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 2.13 savings: 53.0%
+Total file size ratio: 2.83 savings: 64.7%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Teaser
+++ b/prototypes/ocr_demo/logs/Teaser
@ -0,0 +1,14 @@
+
+Start processing 10 pages concurrently
+   27 [tesseract] lots of diacritics - possibly poor OCR
+   45 [tesseract] lots of diacritics - possibly poor OCR
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 1.68 savings: 40.3%
+Total file size ratio: 2.30 savings: 56.5%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/logs/Teaser
+++ b/prototypes/ocr_demo/logs/Teaser
@ -0,0 +1,12 @@
+
+Start processing 10 pages concurrently
+
+Postprocessing...
+Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
+
+
+
+
+Image optimization ratio: 2.03 savings: 50.7%
+Total file size ratio: 3.15 savings: 68.2%
+Output file is a PDF/A-2B (as expected)
--- a/prototypes/ocr_demo/ocr.py
+++ b/prototypes/ocr_demo/ocr.py
@ -0,0 +1,43 @@
+import os
+import subprocess
+from pathlib import Path
+
+# Source PDFs are read from `input_folder`; OCR results and per-file logs are
+# written to `output_folder` and `log_folder`. All three paths are relative,
+# so they resolve against the current working directory.
+input_folder = Path("../../pitch-books")
+output_folder = Path("output")
+log_folder = Path("logs")
+
+# Create both destination folders up front (a no-op when they already exist).
+for folder in (output_folder, log_folder):
+    folder.mkdir(parents=True, exist_ok=True)
+
+def ocr_pdf(input_file: Path) -> bool:
+    """Run ``ocrmypdf`` on a single PDF and print the outcome.
+
+    The OCRed copy is written to ``output_folder`` as ``<stem>-OCR.pdf``.
+    Everything ocrmypdf prints (stdout and stderr) is captured in
+    ``log_folder`` as ``<stem>.log``.
+
+    Args:
+        input_file: Path of the PDF to process.
+
+    Returns:
+        True if ocrmypdf exited with status 0; False if it exited with a
+        non-zero status or could not be started at all.
+    """
+    output_file = output_folder / f"{input_file.stem}-OCR.pdf"
+    log_file = log_folder / f"{input_file.stem}.log"
+
+    cmd = [
+        "ocrmypdf",
+        "--force-ocr",  # re-OCR every page, even ones that already contain text
+        "--output-type", "pdfa",
+        "--language", "deu+eng",  # Tesseract language packs: German + English
+        str(input_file),
+        str(output_file),
+    ]
+
+    with open(log_file, "w") as log:
+        try:
+            result = subprocess.run(cmd, stdout=log, stderr=log)
+        except OSError as exc:
+            # The executable is missing or not runnable. Record the reason and
+            # report it like any other failure so the remaining PDFs in a
+            # batch are still attempted instead of aborting with a traceback.
+            log.write(f"Could not start ocrmypdf: {exc}\n")
+            print(f"❌ OCR failed. See log: {log_file}")
+            return False
+
+    if result.returncode == 0:
+        print(f"✅ OCR complete: {output_file.name}")
+        return True
+
+    print(f"❌ OCR failed. See log: {log_file}")
+    return False
+
+if __name__ == "__main__":
+    # Only list PDFs when the input folder is present; None marks "no folder".
+    pdf_paths = list(input_folder.glob("*.pdf")) if input_folder.exists() else None
+
+    if pdf_paths is None:
+        print("Input folder does not exist!")
+    elif not pdf_paths:
+        print("No PDFs found in input folder.")
+    else:
+        # Process the PDFs one at a time, announcing each before it starts.
+        for pdf_path in pdf_paths:
+            print(f"Processing: {pdf_path.name}")
+            ocr_pdf(pdf_path)
--- a/prototypes/ocr_demo/output/Pitchbook
+++ b/prototypes/ocr_demo/output/Pitchbook
--- a/prototypes/ocr_demo/output/Pitchbook
+++ b/prototypes/ocr_demo/output/Pitchbook
--- a/prototypes/ocr_demo/output/Pitchbook
+++ b/prototypes/ocr_demo/output/Pitchbook
--- a/prototypes/ocr_demo/output/Teaser
+++ b/prototypes/ocr_demo/output/Teaser
--- a/prototypes/ocr_demo/output/Teaser
+++ b/prototypes/ocr_demo/output/Teaser
--- a/prototypes/ocr_demo/output/Teaser
+++ b/prototypes/ocr_demo/output/Teaser
--- a/prototypes/ocr_demo/output/Teaser
+++ b/prototypes/ocr_demo/output/Teaser
--- a/prototypes/ocr_demo/output/Teaser
+++ b/prototypes/ocr_demo/output/Teaser
--- a/prototypes/ocr_demo/requirements.txt
+++ b/prototypes/ocr_demo/requirements.txt
@ -0,0 +1,3 @@
+ocrmypdf
+pdfplumber
+PyMuPDF