# Listing metadata: 32 lines, 764 B, Python
import os

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

from training_data import TRAINING_DATA

# Convert TRAINING_DATA into spaCy's binary DocBin format and write it to
# ./data/train.spacy.
#
# Each TRAINING_DATA item is unpacked as (text, annot), where
# annot["entities"] yields (start, end, label) character-offset triples.

# Blank German pipeline; only used here to tokenize raw text via make_doc().
nlp = spacy.blank("de")

# create a DocBin object
db = DocBin()

for text, annot in tqdm(TRAINING_DATA):
    doc = nlp.make_doc(text)
    ents = []
    # add character indexes
    for start, end, label in annot["entities"]:
        # "contract" keeps only tokens lying completely inside [start, end);
        # char_span returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # tqdm.write keeps the message from garbling the progress bar.
            tqdm.write(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises ValueError and would
    # abort the whole conversion. filter_spans resolves overlaps (preferring
    # the longest span) so one bad annotation only costs that entity.
    kept = filter_spans(ents)
    kept_keys = {(s.start, s.end, s.label_) for s in kept}
    for s in ents:
        if (s.start, s.end, s.label_) not in kept_keys:
            tqdm.write(
                f"Dropping overlapping entity: |{s.text}| "
                f"Start: {s.start_char}, End: {s.end_char}, Label: {s.label_}"
            )
    # label the text with the ents
    doc.ents = kept
    db.add(doc)

# save the DocBin object
os.makedirs("./data", exist_ok=True)
db.to_disk("./data/train.spacy")