Neue Prototyp-Dateien hinzugefügt
parent
14b66a31b8
commit
89bbc68c2a
|
|
@ -0,0 +1,64 @@
|
||||||
|
#########################################################
|
||||||
|
# Run from a terminal: streamlit run PyMuPdf_st.py
|
||||||
|
#########################################################
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
import io
|
||||||
|
|
||||||
|
# Streamlit app: upload a PDF, search it for a term, let the user pick one of
# the hits and show the corresponding page with the hit outlined in red.
st.title("🔍 PDF Kennzahlen-Finder")

# Upload the PDF to search.
uploaded_file = st.file_uploader("PDF hochladen", type="pdf")

# Search term entered by the user.
suchwort = st.text_input("Suchwort (z. B. wie)", value="wie")

if uploaded_file and suchwort:
    # Open the uploaded PDF straight from memory; the context manager closes
    # the document once rendering is done instead of leaking it per rerun.
    pdf_bytes = uploaded_file.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Collect every hit on every page as
        # {"seite": 0-based page index, "rect": rectangle returned by search_for}.
        fundstellen = [
            {"seite": page_num, "rect": rect}
            for page_num, page in enumerate(doc)
            for rect in page.search_for(suchwort)
        ]

        if fundstellen:
            st.success(f"🔎 {len(fundstellen)} Fundstelle(n) für „{suchwort}“ gefunden.")

            # Select by hit index, not by label text: several hits on the same
            # page would otherwise share the label "Seite N", and looking the
            # label up again would always resolve to the first hit on that page.
            auswahl = st.selectbox(
                "Fundstelle auswählen:",
                list(range(len(fundstellen))),
                format_func=lambda i: (
                    f"Seite {fundstellen[i]['seite'] + 1} (Treffer {i + 1})"
                ),
            )

            fund = fundstellen[auswahl]
            seite = doc.load_page(fund["seite"])
            rect = fund["rect"]

            # Render the page to an image; zoom > 1 gives a sharper result.
            zoom = 2
            pix = seite.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Draw the marker: scale the hit rectangle from page coordinates
            # to pixel coordinates of the rendered image.
            draw = ImageDraw.Draw(img)
            scale = pix.width / seite.rect.width
            rect_scaled = [
                rect.x0 * scale,
                rect.y0 * scale,
                rect.x1 * scale,
                rect.y1 * scale,
            ]
            draw.rectangle(rect_scaled, outline="red", width=3)

            st.image(img, caption=f"Markierte Fundstelle auf Seite {fund['seite'] + 1}")
        else:
            st.warning("Keine Fundstellen gefunden.")
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
# Path of the PDF to inspect. Raw string: the backslashes are path separators,
# and "\p" / "\T" in a normal string literal are invalid escape sequences
# (SyntaxWarning on recent Python versions). The resulting value is unchanged.
pdf_path = r"\pse2_ff\pitch-books\Teaser 2 FINAL.pdf"

# Alternative kept for reference: dump the layout-preserving text of each page.
# with pdfplumber.open(pdf_path) as pdf:
#     # Access the pages
#     for page in pdf.pages:
#         # Extract all text from the page with detailed positional data
#         page_text = page.extract_text(layout=True)
#
#         # Print the extracted text with preserved structure
#         print(f"Page {page.page_number}:\n{page_text}\n")

# Print every table found by pdfplumber, page by page and row by row.
with pdfplumber.open(pdf_path) as pdf:
    for page_no, page in enumerate(pdf.pages, start=1):
        tables = page.extract_tables()

        # Pages without tables produce no output.
        if not tables:
            continue

        print(f"\n Tabellen auf Seite {page_no} gefunden:")
        for table_no, table in enumerate(tables, start=1):
            print(f"\nTabelle {table_no}:\n")
            for row in table:
                print(row)
|
||||||
Loading…
Reference in New Issue