Test PyMuPdf and PDFMiner

2025-04-21 14:14:18 +02:00 · 2025-04-21 14:14:18 +02:00 · ae52759871
parent bfcbc9cca0
commit ae52759871
8 changed files with 112 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 .DS_Store
--- a/prototypes/PDFMiner/Pitchbook3.pdf
+++ b/prototypes/PDFMiner/Pitchbook3.pdf
--- a/prototypes/PDFMiner/Teaser_5_OCR-MY-PDF.pdf
+++ b/prototypes/PDFMiner/Teaser_5_OCR-MY-PDF.pdf
--- a/prototypes/PDFMiner/test.py
+++ b/prototypes/PDFMiner/test.py
@ -0,0 +1,21 @@
 from pdfminer.layout import LAParams, LTTextBox
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 # Dump every text box of the PDF together with a bounding-box corner.
 # Pipeline: the interpreter renders each page into the aggregator device,
 # which then hands back that page's layout via get_result().
 rsrcmgr = PDFResourceManager()
 laparams = LAParams()
 device = PDFPageAggregator(rsrcmgr, laparams=laparams)
 interpreter = PDFPageInterpreter(rsrcmgr, device)
 # The context manager guarantees the file handle is closed, even if parsing
 # fails. NOTE(review): get_pages presumably reads from fp lazily, so the
 # whole page loop is kept inside the with block.
 with open('Teaser_5_OCR-MY-PDF.pdf', 'rb') as fp:
    pages = PDFPage.get_pages(fp)
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            # Only text boxes are reported; other layout objects are skipped.
            if isinstance(lobj, LTTextBox):
                # bbox[0] / bbox[3]: presumably left x and top y of the box
                # (verify against the pdfminer bbox convention).
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
                print('At %r is text: %s' % ((x, y), text))
--- a/prototypes/PyMuPdf/Teaser_5_OCR-MY-PDF
+++ b/prototypes/PyMuPdf/Teaser_5_OCR-MY-PDF
--- a/prototypes/PyMuPdf/Teaser_5_OCR-MY-PDF
+++ b/prototypes/PyMuPdf/Teaser_5_OCR-MY-PDF
--- a/prototypes/PyMuPdf/kennzahlen.json
+++ b/prototypes/PyMuPdf/kennzahlen.json
@ -0,0 +1,10 @@
 [
    {"Kennzahl": "Risikoprofil", "Value": "Core", "Seite": 3},
    {"Kennzahl": "Risikoprofil", "Value": "Core", "Seite": 16},
    {"Kennzahl": "Marktwert", "Value": "188m", "Seite": 3},
    {"Kennzahl": "Test", "Value": "structure", "Seite": 5},
    {"Kennzahl": "Test", "Value": "Netherlands", "Seite": 6},
    {"Kennzahl": "Test", "Value": "Sweden", "Seite": 9},
    {"Kennzahl": "Test", "Value": "Sound Public Finances", "Seite":  10},
    {"Kennzahl": "Test", "Value": "Very Good”", "Seite": 16}
 ]
--- a/prototypes/PyMuPdf/prototype.py
+++ b/prototypes/PyMuPdf/prototype.py
@ -0,0 +1,80 @@
 import streamlit as st
 import fitz  # PyMuPDF
 import base64
 import json
 import pandas as pd
 import os
 # Streamlit page setup: wide layout plus the app title.
 st.set_page_config(layout="wide")
 st.title("PDF-Highlighter für Kennzahlen")
 # Input files, looked up relative to the current working directory.
 pdf_filename = "Teaser_5_OCR-MY-PDF Kopie.pdf"
 json_filename = "kennzahlen.json"
 # Show an error and stop this script run if one of the inputs is missing.
 if not os.path.exists(pdf_filename):
    st.error(f"PDF-Datei '{pdf_filename}' nicht gefunden.")
    st.stop()
 if not os.path.exists(json_filename):
    st.error(f"JSON-Datei '{json_filename}' nicht gefunden.")
    st.stop()
 # Load the key figures before opening the PDF, so a broken JSON file cannot
 # leave the document open. The encoding is explicit because the file holds
 # non-ASCII text and the platform default encoding is not always UTF-8.
 with open(json_filename, "r", encoding="utf-8") as f:
    kennzahlen = json.load(f)
 # Build the output name from the extension only: str.replace would rewrite
 # every ".pdf" occurrence in the name and would leave an upper-case ".PDF"
 # untouched, making the output path equal to the input path.
 pdf_root, pdf_ext = os.path.splitext(pdf_filename)
 highlighted_path = f"{pdf_root}_highlighted{pdf_ext}"
 doc = fitz.open(pdf_filename)
 try:
    # Highlight every hit of each entry's value on the page the entry names.
    for eintrag in kennzahlen:
        try:
            # Looked up inside the try so a malformed entry is reported
            # like any other per-entry failure instead of crashing the app.
            value = eintrag["Value"]
            # "Seite" is 1-based in the JSON; the document is indexed from 0.
            seite = int(str(eintrag["Seite"]).strip()) - 1
            if not 0 <= seite < len(doc):
                st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
                continue
            page = doc[seite]
            for rect in page.search_for(value):
                highlight = page.add_highlight_annot(rect)
                highlight.update()
        except Exception as e:
            # UI boundary: surface the problem for this entry and carry on
            # with the remaining entries.
            st.error(f" Fehler bei Eintrag {eintrag}: {e}")
    # Write the annotated copy next to the original.
    doc.save(highlighted_path)
 finally:
    # Release the document even if highlighting or saving failed.
    doc.close()
 # Table of key figures; the page column is turned into links further below.
 st.subheader("Kennzahlen-Tabelle")
 df = pd.DataFrame(kennzahlen)
 # Streamlit table with clickable page numbers
 def make_clickable(seite):
    """Return an HTML anchor that shows ``seite`` and links to ``?seite=<seite>``.

    The link targets the current browser tab (``target="_self"``).
    """
    anchor_template = '<a href="?seite={0}" target="_self">{0}</a>'
    return anchor_template.format(seite)
 # Turn the page column into links and render the table as raw HTML.
 # NOTE(review): escape=False is required for the links, but it also leaves
 # the other columns unescaped; values containing HTML markup would be
 # rendered as markup. Confirm the JSON content is trusted.
 df["Seite"] = df["Seite"].apply(make_clickable)
 st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
 # Read the requested page from the URL. The parameter is user-controlled, so
 # a missing, non-numeric or non-positive value falls back to page 1 instead
 # of aborting the script run with an exception.
 query_params = st.query_params
 try:
    aktuelle_seite = max(1, int(query_params.get("seite", 1)))
 except (TypeError, ValueError):
    aktuelle_seite = 1
 # Show the highlighted PDF, asking the viewer to open at the current page.
 st.subheader("Vorschau")
 with open(highlighted_path, "rb") as f:
    base64_pdf = base64.b64encode(f.read()).decode('utf-8')
 # The PDF is embedded as a base64 data URI; "#page=" selects the start page.
 pdf_display = f'''
 <iframe 
    src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}" 
    width="100%" height="800px" type="application/pdf">
 </iframe>
 '''
 st.markdown(pdf_display, unsafe_allow_html=True)