# pse2_ff/project/backend/spacy-service/spacy_training/training_model.py
#
# Repository listing metadata: 41 lines, 948 B, Python

import os
from pathlib import Path
import spacy
from spacy.cli.train import train
from spacy.tokens import DocBin
from tqdm import tqdm
from training_data import TRAINING_DATA
# Blank German ("de") pipeline; it is only used here to tokenize the raw
# training texts into Doc objects.
nlp = spacy.blank("de")
# Container that collects every annotated Doc for serialization.
db = DocBin()
# Each TRAINING_DATA item is unpacked as (text, annot), where
# annot["entities"] yields (start, end, label) character-offset triples.
for text, annot in tqdm(TRAINING_DATA):
    doc = nlp.make_doc(text)
    ents = []
    # Map the character offsets onto token spans.
    for start, end, label in annot["entities"]:
        # char_span returns None when the offsets cannot be aligned to
        # tokens under the "contract" alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # tqdm.write prints without breaking the active progress bar.
            tqdm.write(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises a ValueError and would
    # abort the whole conversion, so overlaps are resolved first
    # (filter_spans keeps the longest span of each overlapping group).
    non_overlapping = spacy.util.filter_spans(ents)
    dropped = len(ents) - len(non_overlapping)
    if dropped:
        tqdm.write(f"Dropping {dropped} overlapping entity span(s) in: |{text}|")
    # Attach the surviving entity spans to the Doc and collect it.
    doc.ents = non_overlapping
    db.add(doc)
# Write the collected Docs to ./data/train.spacy, creating the ./data
# directory first if it does not exist yet.
Path("./data").mkdir(parents=True, exist_ok=True)
db.to_disk("./data/train.spacy")
# Training config file and output directory; both paths are relative to the
# current working directory.
config_path, output_path = Path("config.cfg"), Path("output")

print("Starte Training...")
# Hand both paths to spaCy's training entry point.
train(config_path, output_path)