Added page number and process
parent
14364100b0
commit
bfcbc9cca0
|
|
@ -1,5 +1,5 @@
|
|||
# https://github.com/explosion/spacy-layout
### Run with: python extract_pitchbooks.py
import json
from pathlib import Path

import spacy
from spacy_layout import spaCyLayout
|
|
@ -13,40 +13,63 @@ output_dir.mkdir(exist_ok=True)
|
|||
|
||||
nlp = spacy.load("de_core_news_lg")
|
||||
layout = spaCyLayout(nlp)
|
||||
|
||||
doc = layout(str(input_pdf))
|
||||
|
||||
# 1. Save full extracted text
|
||||
(output_dir / "text.txt").write_text(doc.text, encoding="utf-8")
|
||||
|
||||
# 2. Save tables to CSV files
|
||||
for i, table in enumerate(doc._.tables, 1):
|
||||
df = table._.data
|
||||
df.to_csv(output_dir / f"table_{i}.csv", index=False)
|
||||
|
||||
# 3. NER on full text
|
||||
# 1. NER on full text
|
||||
ner_text_results = []
|
||||
doc_ner = nlp(doc.text)
|
||||
ents = [{"text": ent.text, "label": ent.label_} for ent in doc_ner.ents]
|
||||
(output_dir / "ner_text.json").write_text(json.dumps(ents, indent=2, ensure_ascii=False))
|
||||
for ent in doc_ner.ents:
|
||||
page_number = None
|
||||
for i, (page_layout, spans) in enumerate(doc._.pages, 1):
|
||||
for span in spans:
|
||||
if span.start_char <= ent.start_char < span.end_char:
|
||||
page_number = i
|
||||
break
|
||||
if page_number:
|
||||
break
|
||||
|
||||
# 4. NER on table cells
|
||||
if ent.text.strip():
|
||||
ner_text_results.append({
|
||||
"label": ent.label_,
|
||||
"entity": ent.text.strip(),
|
||||
"page": page_number
|
||||
})
|
||||
|
||||
print(f"Saving {len(ner_text_results)} NER results from full text to ner_text.json")
|
||||
(output_dir / "ner_text.json").write_text(json.dumps(ner_text_results, indent=2, ensure_ascii=False))
|
||||
|
||||
# 2. NER on table cells
|
||||
table_ner_results = []
|
||||
for i, table in enumerate(doc._.tables, 1):
|
||||
table_ents = []
|
||||
df = table._.data
|
||||
for row in df.astype(str).values:
|
||||
page_number = None
|
||||
|
||||
for pg_num, (page_layout, spans) in enumerate(doc._.pages, 1):
|
||||
for span in spans:
|
||||
if span.start <= table.start < span.end:
|
||||
page_number = pg_num
|
||||
break
|
||||
if page_number:
|
||||
break
|
||||
|
||||
for row in df.astype(str).values:
|
||||
for cell in row:
|
||||
doc_cell = nlp(cell)
|
||||
table_ents.extend({
|
||||
"table": i,
|
||||
"cell_text": cell,
|
||||
"entity": ent.text,
|
||||
"label": ent.label_
|
||||
} for ent in doc_cell.ents)
|
||||
table_ner_results.extend(table_ents)
|
||||
for ent in doc_cell.ents:
|
||||
if ent.text.strip():
|
||||
table_ner_results.append({
|
||||
"label": ent.label_,
|
||||
"entity": ent.text.strip(),
|
||||
"page": page_number,
|
||||
"table": i
|
||||
})
|
||||
|
||||
print(f"Saving {len(table_ner_results)} NER results from tables to ner_tables.json")
|
||||
(output_dir / "ner_tables.json").write_text(json.dumps(table_ner_results, indent=2, ensure_ascii=False))
|
||||
|
||||
print("✅ Done! Extracted data saved to /output")
|
||||
|
||||
### Run with: python extract_pitchbooks.py
|
||||
print("✅ Done! Extracted data saved to /output")
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue