import os
from pathlib import Path

import spacy
from spacy.cli.train import train
from spacy.tokens import DocBin
from tqdm import tqdm

from training_data import TRAINING_DATA
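# TRAINING_DATA is expected to follow the (text, annotations) layout that the loop
# below unpacks; a hypothetical example (adjust to your own data):
#   TRAINING_DATA = [
#       ("Berlin liegt in Deutschland.", {"entities": [(0, 6, "LOC"), (16, 27, "LOC")]}),
#   ]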
nlp = spacy.blank("de")
|
|
|
|
# create a DocBin object
|
|
db = DocBin()
|
|
|
|
for text, annot in tqdm(TRAINING_DATA):
    doc = nlp.make_doc(text)
    ents = []
    # convert the character offsets into entity spans;
    # alignment_mode="contract" shrinks spans that do not align with token boundaries
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
        else:
            ents.append(span)
    # attach the collected spans as the document's entities
    doc.ents = ents
    db.add(doc)

# save the DocBin object as the training corpus
os.makedirs("./data", exist_ok=True)
db.to_disk("./data/train.spacy")
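# config.cfg is assumed to exist next to this script; such a file can be generated
# with spaCy's CLI, e.g.: python -m spacy init config config.cfg --lang de --pipeline ner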
config_path = Path("config.cfg")
|
|
output_path = Path("output")
|
|
|
|
print("Starte Training...")
|
|
train(config_path, output_path)
|
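
# After training, spaCy writes the best pipeline to output/model-best by default;
# it can then be loaded for inference, e.g.:
#   nlp_trained = spacy.load("output/model-best")
#   doc = nlp_trained("Ein Beispielsatz mit Entitäten.")
#   print([(ent.text, ent.label_) for ent in doc.ents])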