Recommit: Fix OCR and add PDF tools
parent
ae52759871
commit
4f1c8ea005
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
2 page already has text! - rasterizing text and running OCR anyway
|
||||
3 page already has text! - rasterizing text and running OCR anyway
|
||||
4 page already has text! - rasterizing text and running OCR anyway
|
||||
5 page already has text! - rasterizing text and running OCR anyway
|
||||
6 page already has text! - rasterizing text and running OCR anyway
|
||||
7 page already has text! - rasterizing text and running OCR anyway
|
||||
8 page already has text! - rasterizing text and running OCR anyway
|
||||
9 page already has text! - rasterizing text and running OCR anyway
|
||||
10 page already has text! - rasterizing text and running OCR anyway
|
||||
11 page already has text! - rasterizing text and running OCR anyway
|
||||
12 page already has text! - rasterizing text and running OCR anyway
|
||||
13 page already has text! - rasterizing text and running OCR anyway
|
||||
14 page already has text! - rasterizing text and running OCR anyway
|
||||
15 page already has text! - rasterizing text and running OCR anyway
|
||||
16 page already has text! - rasterizing text and running OCR anyway
|
||||
17 page already has text! - rasterizing text and running OCR anyway
|
||||
18 page already has text! - rasterizing text and running OCR anyway
|
||||
11 [tesseract] lots of diacritics - possibly poor OCR
|
||||
19 page already has text! - rasterizing text and running OCR anyway
|
||||
20 page already has text! - rasterizing text and running OCR anyway
|
||||
21 page already has text! - rasterizing text and running OCR anyway
|
||||
22 page already has text! - rasterizing text and running OCR anyway
|
||||
23 page already has text! - rasterizing text and running OCR anyway
|
||||
24 page already has text! - rasterizing text and running OCR anyway
|
||||
25 page already has text! - rasterizing text and running OCR anyway
|
||||
26 page already has text! - rasterizing text and running OCR anyway
|
||||
27 page already has text! - rasterizing text and running OCR anyway
|
||||
28 page already has text! - rasterizing text and running OCR anyway
|
||||
29 page already has text! - rasterizing text and running OCR anyway
|
||||
30 page already has text! - rasterizing text and running OCR anyway
|
||||
31 page already has text! - rasterizing text and running OCR anyway
|
||||
20 [tesseract] lots of diacritics - possibly poor OCR
|
||||
32 page already has text! - rasterizing text and running OCR anyway
|
||||
33 page already has text! - rasterizing text and running OCR anyway
|
||||
34 page already has text! - rasterizing text and running OCR anyway
|
||||
35 page already has text! - rasterizing text and running OCR anyway
|
||||
36 page already has text! - rasterizing text and running OCR anyway
|
||||
26 [tesseract] lots of diacritics - possibly poor OCR
|
||||
37 page already has text! - rasterizing text and running OCR anyway
|
||||
31 [tesseract] lots of diacritics - possibly poor OCR
|
||||
38 page already has text! - rasterizing text and running OCR anyway
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.64 savings: 39.0%
|
||||
Total file size ratio: 0.59 savings: -68.4%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
The output file size is 1.68× larger than the input file.
|
||||
Possible reasons for this include:
|
||||
--force-ocr was issued, causing transcoding.
|
||||
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
|
||||
The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted.
|
||||
PDF/A conversion was enabled. (Try `--output-type pdf`.)
|
||||
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
12 [tesseract] lots of diacritics - possibly poor OCR
|
||||
15 [tesseract] lots of diacritics - possibly poor OCR
|
||||
37 [tesseract] lots of diacritics - possibly poor OCR
|
||||
47 [tesseract] lots of diacritics - possibly poor OCR
|
||||
49 [tesseract] lots of diacritics - possibly poor OCR
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.26 savings: 20.7%
|
||||
Total file size ratio: 0.91 savings: -9.6%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
21 [tesseract] lots of diacritics - possibly poor OCR
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.21 savings: 17.6%
|
||||
Total file size ratio: 0.97 savings: -3.3%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
1 page already has text! - rasterizing text and running OCR anyway
|
||||
2 page already has text! - rasterizing text and running OCR anyway
|
||||
3 page already has text! - rasterizing text and running OCR anyway
|
||||
4 page already has text! - rasterizing text and running OCR anyway
|
||||
5 page already has text! - rasterizing text and running OCR anyway
|
||||
6 page already has text! - rasterizing text and running OCR anyway
|
||||
7 page already has text! - rasterizing text and running OCR anyway
|
||||
8 page already has text! - rasterizing text and running OCR anyway
|
||||
9 page already has text! - rasterizing text and running OCR anyway
|
||||
10 page already has text! - rasterizing text and running OCR anyway
|
||||
11 page already has text! - rasterizing text and running OCR anyway
|
||||
12 page already has text! - rasterizing text and running OCR anyway
|
||||
13 page already has text! - rasterizing text and running OCR anyway
|
||||
14 page already has text! - rasterizing text and running OCR anyway
|
||||
15 page already has text! - rasterizing text and running OCR anyway
|
||||
16 page already has text! - rasterizing text and running OCR anyway
|
||||
17 page already has text! - rasterizing text and running OCR anyway
|
||||
18 page already has text! - rasterizing text and running OCR anyway
|
||||
19 page already has text! - rasterizing text and running OCR anyway
|
||||
20 page already has text! - rasterizing text and running OCR anyway
|
||||
1 [tesseract] lots of diacritics - possibly poor OCR
|
||||
21 page already has text! - rasterizing text and running OCR anyway
|
||||
22 page already has text! - rasterizing text and running OCR anyway
|
||||
23 page already has text! - rasterizing text and running OCR anyway
|
||||
24 page already has text! - rasterizing text and running OCR anyway
|
||||
25 page already has text! - rasterizing text and running OCR anyway
|
||||
26 page already has text! - rasterizing text and running OCR anyway
|
||||
27 page already has text! - rasterizing text and running OCR anyway
|
||||
28 page already has text! - rasterizing text and running OCR anyway
|
||||
29 page already has text! - rasterizing text and running OCR anyway
|
||||
30 page already has text! - rasterizing text and running OCR anyway
|
||||
31 page already has text! - rasterizing text and running OCR anyway
|
||||
32 page already has text! - rasterizing text and running OCR anyway
|
||||
26 [tesseract] lots of diacritics - possibly poor OCR
|
||||
33 page already has text! - rasterizing text and running OCR anyway
|
||||
21 [tesseract] lots of diacritics - possibly poor OCR
|
||||
35 page already has text! - rasterizing text and running OCR anyway
|
||||
36 page already has text! - rasterizing text and running OCR anyway
|
||||
37 page already has text! - rasterizing text and running OCR anyway
|
||||
38 page already has text! - rasterizing text and running OCR anyway
|
||||
38 [tesseract] lots of diacritics - possibly poor OCR
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.62 savings: 38.4%
|
||||
Total file size ratio: 0.83 savings: -20.3%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
2 page already has text! - rasterizing text and running OCR anyway
|
||||
3 page already has text! - rasterizing text and running OCR anyway
|
||||
4 page already has text! - rasterizing text and running OCR anyway
|
||||
5 page already has text! - rasterizing text and running OCR anyway
|
||||
6 page already has text! - rasterizing text and running OCR anyway
|
||||
7 page already has text! - rasterizing text and running OCR anyway
|
||||
8 page already has text! - rasterizing text and running OCR anyway
|
||||
9 page already has text! - rasterizing text and running OCR anyway
|
||||
10 page already has text! - rasterizing text and running OCR anyway
|
||||
11 page already has text! - rasterizing text and running OCR anyway
|
||||
12 page already has text! - rasterizing text and running OCR anyway
|
||||
13 page already has text! - rasterizing text and running OCR anyway
|
||||
14 page already has text! - rasterizing text and running OCR anyway
|
||||
15 page already has text! - rasterizing text and running OCR anyway
|
||||
16 page already has text! - rasterizing text and running OCR anyway
|
||||
17 page already has text! - rasterizing text and running OCR anyway
|
||||
18 page already has text! - rasterizing text and running OCR anyway
|
||||
19 page already has text! - rasterizing text and running OCR anyway
|
||||
20 page already has text! - rasterizing text and running OCR anyway
|
||||
21 page already has text! - rasterizing text and running OCR anyway
|
||||
22 page already has text! - rasterizing text and running OCR anyway
|
||||
23 page already has text! - rasterizing text and running OCR anyway
|
||||
24 page already has text! - rasterizing text and running OCR anyway
|
||||
25 page already has text! - rasterizing text and running OCR anyway
|
||||
26 page already has text! - rasterizing text and running OCR anyway
|
||||
27 page already has text! - rasterizing text and running OCR anyway
|
||||
28 page already has text! - rasterizing text and running OCR anyway
|
||||
29 page already has text! - rasterizing text and running OCR anyway
|
||||
30 page already has text! - rasterizing text and running OCR anyway
|
||||
31 page already has text! - rasterizing text and running OCR anyway
|
||||
32 page already has text! - rasterizing text and running OCR anyway
|
||||
33 page already has text! - rasterizing text and running OCR anyway
|
||||
34 page already has text! - rasterizing text and running OCR anyway
|
||||
35 page already has text! - rasterizing text and running OCR anyway
|
||||
36 page already has text! - rasterizing text and running OCR anyway
|
||||
37 page already has text! - rasterizing text and running OCR anyway
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.54 savings: 35.2%
|
||||
Total file size ratio: 0.89 savings: -12.2%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 2.13 savings: 53.0%
|
||||
Total file size ratio: 2.83 savings: 64.7%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
27 [tesseract] lots of diacritics - possibly poor OCR
|
||||
45 [tesseract] lots of diacritics - possibly poor OCR
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 1.68 savings: 40.3%
|
||||
Total file size ratio: 2.30 savings: 56.5%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
Start processing 10 pages concurrently
|
||||
|
||||
Postprocessing...
|
||||
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
|
||||
|
||||
|
||||
|
||||
|
||||
Image optimization ratio: 2.03 savings: 50.7%
|
||||
Total file size ratio: 3.15 savings: 68.2%
|
||||
Output file is a PDF/A-2B (as expected)
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# All three locations are relative paths, so they are resolved against the
# working directory the script is started from.
input_folder = Path("../../pitch-books")
output_folder, log_folder = Path("output"), Path("logs")

# Create both destination directories up front so later writes of OCR
# results and per-file logs cannot fail on a missing parent directory.
for folder in (output_folder, log_folder):
    folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def ocr_pdf(
    input_file: Path,
    *,
    output_dir: "Path | None" = None,
    log_dir: "Path | None" = None,
) -> bool:
    """Run ``ocrmypdf`` on a single PDF and print the outcome.

    The OCR'd document is written to ``<output dir>/<stem>-OCR.pdf`` and the
    combined stdout/stderr of the tool to ``<log dir>/<stem>.log``.

    Args:
        input_file: Path of the PDF to process.
        output_dir: Directory for the OCR'd PDF. Defaults to the module-level
            ``output_folder``.
        log_dir: Directory for the log file. Defaults to the module-level
            ``log_folder``.

    Returns:
        ``True`` if ``ocrmypdf`` exited with status 0, ``False`` if it exited
        with any other status or could not be started at all.
    """
    # Only fall back to the module-level folders when no override is given.
    out_dir = output_folder if output_dir is None else Path(output_dir)
    logs_dir = log_folder if log_dir is None else Path(log_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    logs_dir.mkdir(parents=True, exist_ok=True)

    output_file = out_dir / f"{input_file.stem}-OCR.pdf"
    log_file = logs_dir / f"{input_file.stem}.log"

    # NOTE: --force-ocr makes ocrmypdf rasterize pages that already contain
    # text and OCR them again; the tool reports this per page in the log and
    # names it as a possible reason for larger output files.
    cmd = [
        "ocrmypdf",
        "--force-ocr",
        "--output-type", "pdfa",
        "--language", "deu+eng",
        str(input_file),
        str(output_file),
    ]

    with open(log_file, "w", encoding="utf-8") as log:
        try:
            # Merge stderr into stdout so the log keeps a single stream.
            result = subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT)
        except OSError as exc:
            # Raised when the executable is missing or cannot be executed.
            # Record it and report a normal failure instead of aborting the
            # whole batch with a traceback.
            log.write(f"Could not start {cmd[0]}: {exc}\n")
            succeeded = False
        else:
            succeeded = result.returncode == 0

    if succeeded:
        print(f"✅ OCR complete: {output_file.name}")
    else:
        print(f"❌ OCR failed. See log: {log_file}")
    return succeeded
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: OCR every "*.pdf" directly inside input_folder.
    if not input_folder.exists():
        # Exit with a non-zero status (message goes to stderr) so shells and
        # schedulers can detect the misconfiguration.
        raise SystemExit("Input folder does not exist!")

    # Path.glob yields entries in arbitrary order; sort them so repeated runs
    # process the files in the same, predictable sequence.
    pdfs = sorted(input_folder.glob("*.pdf"))
    if not pdfs:
        print("No PDFs found in input folder.")

    for pdf in pdfs:
        print(f"Processing: {pdf.name}")
        ocr_pdf(pdf)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,3 @@
|
|||
ocrmypdf
|
||||
pdfplumber
|
||||
PyMuPDF
|
||||
Loading…
Reference in New Issue