# pse2_ff/project/backend/spacy-service/services/extract.py
#
# File-viewer metadata captured with this listing: 31 lines, 711 B, Python.

import spacy
import os
import json
# Absolute directory containing this module, so the model path below does not
# depend on the process's current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Pipeline directory under ../spacy_training/output relative to this module.
# NOTE(review): "model-last" is presumably the final checkpoint written by
# spaCy training — confirm against the training setup.
model_path = os.path.join(current_dir, "../spacy_training/output/model-last")
# Loaded once, at import time, and reused by every extract() call; importing
# this module therefore performs the model load.
nlp = spacy.load(model_path)
def extract(pages_json, *, pipeline=None):
    """Run entity recognition over a list of pages and return the hits as JSON.

    Args:
        pages_json: Iterable of dict-like page records. Each record is read
            with ``.get("text")`` for the page text and ``.get("page")`` for
            the page identifier; a missing page identifier is emitted as
            ``null``.
        pipeline: Optional callable that takes a text string and returns an
            object whose ``ents`` attribute iterates over entities exposing
            ``label_`` and ``text``. Defaults to the module-level ``nlp``
            pipeline.

    Returns:
        A JSON string (2-space indent, non-ASCII characters left unescaped)
        encoding a list of ``{"label", "entity", "page"}`` objects, in page
        order and then in the order the pipeline reports the entities. Pages
        whose text is missing, ``None``, or only whitespace contribute
        nothing.
    """
    # Resolve the default at call time so the module-level model is only
    # touched when the caller did not supply a pipeline of their own.
    if pipeline is None:
        pipeline = nlp

    results = []
    for page in pages_json:
        # `.get("text", "")` only covers a *missing* key; `or ""` also covers
        # an explicit null value, which would otherwise crash on `.strip()`.
        text = (page.get("text") or "").strip()
        if not text:
            continue

        page_num = page.get("page")
        results.extend(
            {
                "label": ent.label_,
                "entity": ent.text,
                "page": page_num,
            }
            for ent in pipeline(text).ents
        )

    return json.dumps(results, indent=2, ensure_ascii=False)