"""Convert annotated examples into a spaCy ``DocBin`` and start training.

Reads ``TRAINING_DATA`` (an iterable of ``(text, annotations)`` pairs whose
``annotations["entities"]`` holds ``(start, end, label)`` character offsets),
serializes the resulting docs to ``./data/train.spacy`` and then runs
``spacy.cli.train.train`` with ``config.cfg``, writing to ``output``.
"""

import os
from pathlib import Path

import spacy
from spacy.cli.train import train
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

from training_data import TRAINING_DATA


def _build_doc(nlp, text: str, annot: dict):
    """Return a tokenized doc for *text* with its entity spans attached.

    Args:
        nlp: The spaCy pipeline whose tokenizer is used to create the doc.
        text: Raw example text.
        annot: Annotation dict; ``annot["entities"]`` is iterated as
            ``(start, end, label)`` character-offset triples.

    Returns:
        The doc produced by ``nlp.make_doc(text)`` with ``doc.ents`` set to
        the non-overlapping entity spans that could be aligned to tokens.
    """
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in annot["entities"]:
        # "contract" snaps misaligned offsets inward to token boundaries;
        # char_span returns None when no token fits inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
        else:
            spans.append(span)

    # Assigning overlapping or duplicate spans to doc.ents raises ValueError,
    # which would abort the whole conversion. filter_spans keeps the longest
    # span of each overlapping group; report the ones that get dropped.
    kept = filter_spans(spans)
    kept_keys = {(s.start_char, s.end_char, s.label_) for s in kept}
    for span in spans:
        if (span.start_char, span.end_char, span.label_) not in kept_keys:
            print(
                f"Skipping overlapping entity: |{span.text}| "
                f"Start: {span.start_char}, End: {span.end_char}, Label: {span.label_}"
            )

    doc.ents = kept
    return doc


# Blank German pipeline: only the tokenizer is needed to build the docs.
nlp = spacy.blank("de")

# Collect every annotated doc into a DocBin, spaCy's binary training format.
db = DocBin()
for text, annot in tqdm(TRAINING_DATA):
    db.add(_build_doc(nlp, text, annot))

# Persist the DocBin so the training config can read it from disk.
os.makedirs("./data", exist_ok=True)
db.to_disk("./data/train.spacy")

# NOTE(review): config.cfg presumably points its training corpus at
# ./data/train.spacy (and defines a dev corpus) - confirm before running.
config_path = Path("config.cfg")
output_path = Path("output")

print("Starte Training...")
train(config_path, output_path)