31 lines
711 B
Python
31 lines
711 B
Python
import spacy
|
|
import os
|
|
import json
|
|
|
|
# Directory containing this module; the model location below is resolved
# relative to it rather than to the process's working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Path of the spaCy pipeline to load, expressed relative to this file.
model_path = os.path.join(
    current_dir,
    "../spacy_training/output/model-last",
)

# Loaded once at import time and shared by every caller in this module.
nlp = spacy.load(model_path)
|
|
|
|
|
|
def extract(pages_json):
    """Run the spaCy pipeline over each page and return its entities as JSON.

    Args:
        pages_json: Iterable of dict-like page records. Each record's
            ``"text"`` value is the text to analyse and its ``"page"`` value
            is copied into the output unchanged (``None`` when the key is
            absent). Records whose text is missing, ``None``, empty, or
            whitespace-only are skipped without invoking the pipeline.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is)
        encoding a list of objects with the keys ``"label"``, ``"entity"``
        and ``"page"``, ordered by input page and then by the order in which
        the pipeline reports entities.
    """
    results = []

    for page in pages_json:
        # The key can be present but hold None; a plain .get() default only
        # covers a missing key, so coalesce before calling .strip().
        text = (page.get("text") or "").strip()
        if not text:
            continue

        page_num = page.get("page")
        results.extend(
            {
                "label": ent.label_,
                "entity": ent.text,
                "page": page_num,
            }
            for ent in nlp(text).ents
        )

    return json.dumps(results, indent=2, ensure_ascii=False)
|