Added page number and process

pull/40/head
s8613 2025-04-21 12:11:06 +02:00
parent 14364100b0
commit bfcbc9cca0
3 changed files with 1319 additions and 1119 deletions

View File

@ -1,5 +1,5 @@
# https://github.com/explosion/spacy-layout
### Run with: python extract_pitchbooks.py
import spacy
from spacy_layout import spaCyLayout
from pathlib import Path
@ -13,40 +13,63 @@ output_dir.mkdir(exist_ok=True)
# Load the "de_core_news_lg" spaCy pipeline, wrap it in spaCyLayout and run it
# on the PDF path held in `input_pdf` (defined outside this excerpt). The
# resulting `doc` is read below via `doc.text`, `doc._.tables` and `doc._.pages`.
nlp = spacy.load("de_core_news_lg")
layout = spaCyLayout(nlp)
doc = layout(str(input_pdf))
# 1. Save full extracted text
(output_dir / "text.txt").write_text(doc.text, encoding="utf-8")
# 2. Save tables to CSV files
# Tables are numbered from 1 in the file name. `table._.data` is written with
# `to_csv`, so it is presumably a pandas DataFrame — confirm against spacy-layout.
for i, table in enumerate(doc._.tables, 1):
df = table._.data
df.to_csv(output_dir / f"table_{i}.csv", index=False)
# 3. NER on full text
# 1. NER on full text
# NOTE(review): this listing looks like a rendered diff with old and new lines
# interleaved and indentation stripped (two section headers above, two writes
# to ner_text.json below) — confirm which statements belong to the current
# version before running.
ner_text_results = []
# Run the pipeline again on the plain extracted text to obtain its entities.
doc_ner = nlp(doc.text)
# Plain list of {"text", "label"} dicts, written to ner_text.json without page info.
ents = [{"text": ent.text, "label": ent.label_} for ent in doc_ner.ents]
(output_dir / "ner_text.json").write_text(json.dumps(ents, indent=2, ensure_ascii=False))
# For every entity, look up the 1-based index of the first page that has a span
# whose character range [start_char, end_char) contains the entity's start
# offset. `page_number` stays None when no span matches.
# Assumes offsets in `doc_ner` line up with `doc` because both share doc.text — TODO confirm.
for ent in doc_ner.ents:
page_number = None
for i, (page_layout, spans) in enumerate(doc._.pages, 1):
for span in spans:
if span.start_char <= ent.start_char < span.end_char:
page_number = i
break
# Page indices start at 1, so a truthy value means a page was found; stop scanning.
if page_number:
break
# 4. NER on table cells
# Skip entities that are only whitespace; store the stripped text with its label and page.
if ent.text.strip():
ner_text_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number
})
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
# Writes the page-annotated results to the same ner_text.json path used above.
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
# 2. NER on table cells
# NOTE(review): this listing looks like a rendered diff with old and new lines
# interleaved and indentation stripped (two `for row in ...` loops, two ways of
# collecting entities, two final prints) — confirm which statements belong to
# the current version before running.
table_ner_results = []
# Tables are numbered from 1; `i` is stored as the "table" field of each result.
for i, table in enumerate(doc._.tables, 1):
table_ents = []
df = table._.data
for row in df.astype(str).values:
# Look up the 1-based index of the first page that has a span whose range
# [start, end) contains `table.start`. This compares `start`/`end`, not the
# `start_char`/`end_char` character offsets. `page_number` stays None when
# no span matches.
page_number = None
for pg_num, (page_layout, spans) in enumerate(doc._.pages, 1):
for span in spans:
if span.start <= table.start < span.end:
page_number = pg_num
break
# Page indices start at 1, so a truthy value means a page was found; stop scanning.
if page_number:
break
# Every cell is cast to str and run through the pipeline on its own.
for row in df.astype(str).values:
for cell in row:
doc_cell = nlp(cell)
# Collects one dict per entity including the originating cell text, then
# appends the per-table list to the overall results.
table_ents.extend({
"table": i,
"cell_text": cell,
"entity": ent.text,
"label": ent.label_
} for ent in doc_cell.ents)
table_ner_results.extend(table_ents)
# Skip entities that are only whitespace; store the stripped text with its
# label, page and table number (no cell text in this form).
for ent in doc_cell.ents:
if ent.text.strip():
table_ner_results.append({
"label": ent.label_,
"entity": ent.text.strip(),
"page": page_number,
"table": i
})
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
print("✅ Done! Extracted data saved to /output")
### Run with: python extract_pitchbooks.py
print("✅ Done! Extracted data saved to /output")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff