Test PyMuPdf and PDFMiner
parent
bfcbc9cca0
commit
ae52759871
|
|
@ -0,0 +1 @@
|
||||||
|
.DS_Store
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,21 @@
|
||||||
|
from pdfminer.layout import LAParams, LTTextBox
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
|
||||||
|
# Dump every text box of the PDF together with its top-left corner.
#
# The file is opened in a ``with`` block so the handle is always closed.
# ``PDFPage.get_pages`` yields pages lazily while reading from ``fp``, so the
# whole page loop has to run while the file is still open.
with open('Teaser_5_OCR-MY-PDF.pdf', 'rb') as fp:
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The aggregator collects the layout objects of the page most recently
    # processed by the interpreter; they are fetched via ``get_result()``.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            # Only text boxes are reported; other layout objects are skipped.
            if isinstance(lobj, LTTextBox):
                # bbox[0] and bbox[3] are used as the box's x and y position.
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
                print('At %r is text: %s' % ((x, y), text))
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,10 @@
|
||||||
|
[
|
||||||
|
{"Kennzahl": "Risikoprofil", "Value": "Core", "Seite": 3},
|
||||||
|
{"Kennzahl": "Risikoprofil", "Value": "Core", "Seite": 16},
|
||||||
|
{"Kennzahl": "Marktwert", "Value": "188m", "Seite": 3},
|
||||||
|
{"Kennzahl": "Test", "Value": "structure", "Seite": 5},
|
||||||
|
{"Kennzahl": "Test", "Value": "Netherlands", "Seite": 6},
|
||||||
|
{"Kennzahl": "Test", "Value": "Sweden", "Seite": 9},
|
||||||
|
{"Kennzahl": "Test", "Value": "Sound Public Finances", "Seite": 10},
|
||||||
|
{"Kennzahl": "Test", "Value": "Very Good”", "Seite": 16}
|
||||||
|
]
|
||||||
|
|
@ -0,0 +1,80 @@
|
||||||
|
import streamlit as st
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
|
||||||
|
st.set_page_config(layout="wide")
st.title("PDF-Highlighter für Kennzahlen")

# Input files, expected in the working directory.
pdf_filename = "Teaser_5_OCR-MY-PDF Kopie.pdf"
json_filename = "kennzahlen.json"

# Stop early with a visible message if one of the inputs is missing.
if not os.path.exists(pdf_filename):
    st.error(f"PDF-Datei '{pdf_filename}' nicht gefunden.")
    st.stop()
if not os.path.exists(json_filename):
    st.error(f"JSON-Datei '{json_filename}' nicht gefunden.")
    st.stop()

# Load the key figures. The encoding is given explicitly because the file
# contains non-ASCII text and the platform default is not always UTF-8.
with open(json_filename, "r", encoding="utf-8") as f:
    kennzahlen = json.load(f)

# Derive the output name from the real extension instead of replacing every
# ".pdf" substring, so the result never collides with the input file.
_pdf_root, _pdf_ext = os.path.splitext(pdf_filename)
highlighted_path = f"{_pdf_root}_highlighted{_pdf_ext}"

# Add the highlights and write the annotated copy. The document is closed
# even when highlighting or saving fails.
doc = fitz.open(pdf_filename)
try:
    for eintrag in kennzahlen:
        try:
            # Read inside the try so that a malformed entry is reported
            # like any other per-entry problem instead of aborting the app.
            value = eintrag["Value"]
            # "Seite" is 1-based in the JSON; document pages are 0-based.
            seite = int(str(eintrag["Seite"]).strip()) - 1
            if 0 <= seite < len(doc):
                page = doc[seite]
                for rect in page.search_for(value):
                    highlight = page.add_highlight_annot(rect)
                    highlight.update()
            else:
                st.warning(f" Seite {seite + 1} existiert nicht (PDF hat nur {len(doc)} Seiten)")
        except Exception as e:
            # Best effort: show the failing entry and continue with the rest.
            st.error(f" Fehler bei Eintrag {eintrag}: {e}")

    doc.save(highlighted_path)
finally:
    doc.close()

# Table of key figures.
st.subheader("Kennzahlen-Tabelle")
df = pd.DataFrame(kennzahlen)
|
||||||
|
|
||||||
|
|
||||||
|
# Streamlit Tabelle mit klickbaren Seitenzahlen
|
||||||
|
def make_clickable(seite):
    """Return an HTML link that reloads the app with ``?seite=<seite>``.

    The value is percent-encoded for the query string and HTML-escaped for
    the visible label, so an unexpected value cannot break out of the
    attribute or inject markup. For plain integers the output is simply
    ``<a href="?seite=N" target="_self">N</a>``.

    Args:
        seite: Page number (or any value convertible with ``str``).

    Returns:
        The anchor element as a string.
    """
    from html import escape
    from urllib.parse import quote

    text = str(seite)
    href_value = quote(text, safe="")
    label = escape(text)
    return f'<a href="?seite={href_value}" target="_self">{label}</a>'
|
||||||
|
|
||||||
|
|
||||||
|
# Turn the page numbers into links and render the table as raw HTML
# (escape=False keeps the anchor tags intact).
df["Seite"] = df["Seite"].apply(make_clickable)
st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)

# Read the requested page from the URL.
query_params = st.query_params

# The query string is user-controlled: fall back to page 1 when the value is
# not a number, and never go below the first page.
try:
    aktuelle_seite = max(1, int(query_params.get("seite", 1)))
except (TypeError, ValueError):
    aktuelle_seite = 1

# Show the highlighted PDF, opened at the requested page.
st.subheader("Vorschau")
with open(highlighted_path, "rb") as f:
    base64_pdf = base64.b64encode(f.read()).decode('utf-8')

# The PDF is embedded as a data URI; the "#page=" fragment asks the viewer to
# jump to that page. The HTML is kept unindented so Markdown does not treat
# it as a code block.
pdf_display = f'''
<iframe
src="data:application/pdf;base64,{base64_pdf}#page={aktuelle_seite}"
width="100%" height="800px" type="application/pdf">
</iframe>
'''
st.markdown(pdf_display, unsafe_allow_html=True)
|
||||||
Loading…
Reference in New Issue