Test spacy training with riskprofile

2025-04-22 20:05:37 +02:00 · 2025-04-22 20:05:37 +02:00 · 7cf96232e9
parent b8882ae99f
commit 7cf96232e9
37 changed files with 4507 additions and 0 deletions
--- a/prototypes/fine_tuning_spaCy/.python-version
+++ b/prototypes/fine_tuning_spaCy/.python-version
@ -0,0 +1 @@
 3.11.8
--- a/prototypes/fine_tuning_spaCy/pycache/training_data.cpython-311.pyc
+++ b/prototypes/fine_tuning_spaCy/pycache/training_data.cpython-311.pyc
--- a/prototypes/fine_tuning_spaCy/base_config.cfg
+++ b/prototypes/fine_tuning_spaCy/base_config.cfg
@ -0,0 +1,85 @@
 # This is an auto-generated partial config. To use it with 'spacy train'
 # you can run spacy init fill-config to auto-fill all default settings:
 # python -m spacy init fill-config ./base_config.cfg ./config.cfg
 [paths]
 # NOTE(review): dev points at the same file as train, so every score computed
 # on the dev corpus is measured on the training examples themselves.
 # TODO confirm whether a separate held-out dev file is intended here.
 train = ./data/train.spacy
 dev = ./data/train.spacy
 vectors = null
 [system]
 gpu_allocator = null
 [nlp]
 lang = "de"
 pipeline = ["tok2vec","ner"]
 batch_size = 1000
 [components]
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
 rows = [5000, 1000, 2500, 2500]
 include_static_vectors = false
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 96
 depth = 4
 window_size = 1
 maxout_pieces = 3
 [components.ner]
 factory = "ner"
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 [corpora]
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = 0
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 [training]
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 [training.optimizer]
@optimizers = "Adam.v1"
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 [initialize]
 vectors = ${paths.vectors}
--- a/prototypes/fine_tuning_spaCy/config.cfg
+++ b/prototypes/fine_tuning_spaCy/config.cfg
@ -0,0 +1,145 @@
 [paths]
 train = "./data/train.spacy"
 dev = "./data/train.spacy"
 vectors = null
 init_tok2vec = null
 [system]
 gpu_allocator = null
 seed = 0
 [nlp]
 lang = "de"
 pipeline = ["tok2vec","ner"]
 batch_size = 1000
 disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 [components]
 [components.ner]
 factory = "ner"
 incorrect_spans_key = null
 moves = null
 scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
 rows = [5000,1000,2500,2500]
 include_static_vectors = false
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 96
 depth = 4
 window_size = 1
 maxout_pieces = 3
 [corpora]
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [training]
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 patience = 1600
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
 frozen_components = []
 annotating_components = []
 before_to_disk = null
 before_update = null
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [training.logger]
@loggers = "spacy.ConsoleLogger.v1"
 progress_bar = false
 [training.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [training.score_weights]
 ents_f = 1.0
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
 [pretraining]
 [initialize]
 vectors = ${paths.vectors}
 init_tok2vec = ${paths.init_tok2vec}
 vocab_data = null
 lookups = null
 before_init = null
 after_init = null
 [initialize.components]
 [initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/convert_to_spacy.py
+++ b/prototypes/fine_tuning_spaCy/convert_to_spacy.py
@ -0,0 +1,20 @@
 # Converts the annotated examples in training_data.TRAINING_DATA into spaCy's
 # binary DocBin format and writes them to data/train.spacy.
 from pathlib import Path

 import spacy
 from spacy.tokens import DocBin
 from training_data import TRAINING_DATA

 # Only the tokenizer is needed to build the Doc objects, so a blank German
 # pipeline is sufficient.
 nlp = spacy.blank("de")
 doc_bin = DocBin()
 for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations["entities"]:
        # char_span yields None when the character offsets do not line up
        # with token boundaries; such annotations are reported and dropped.
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"⚠️ Skipping entity: ({start}, {end}, {label}) in: {text}")
        else:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)

 # to_disk opens the target file directly and does not create missing parent
 # directories, so make sure data/ exists before writing.
 output_path = Path("data/train.spacy")
 output_path.parent.mkdir(parents=True, exist_ok=True)
 doc_bin.to_disk(output_path)
--- a/prototypes/fine_tuning_spaCy/data/train.spacy
+++ b/prototypes/fine_tuning_spaCy/data/train.spacy
--- a/prototypes/fine_tuning_spaCy/entities_output.json
+++ b/prototypes/fine_tuning_spaCy/entities_output.json
--- a/prototypes/fine_tuning_spaCy/output/model-best/config.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/config.cfg
@ -0,0 +1,145 @@
 [paths]
 train = "./data/train.spacy"
 dev = "./data/train.spacy"
 vectors = null
 init_tok2vec = null
 [system]
 gpu_allocator = null
 seed = 0
 [nlp]
 lang = "de"
 pipeline = ["tok2vec","ner"]
 batch_size = 1000
 disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 [components]
 [components.ner]
 factory = "ner"
 incorrect_spans_key = null
 moves = null
 scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
 rows = [5000,1000,2500,2500]
 include_static_vectors = false
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 96
 depth = 4
 window_size = 1
 maxout_pieces = 3
 [corpora]
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [training]
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 patience = 1600
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
 frozen_components = []
 annotating_components = []
 before_to_disk = null
 before_update = null
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [training.logger]
@loggers = "spacy.ConsoleLogger.v1"
 progress_bar = false
 [training.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [training.score_weights]
 ents_f = 1.0
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
 [pretraining]
 [initialize]
 vectors = ${paths.vectors}
 init_tok2vec = ${paths.init_tok2vec}
 vocab_data = null
 lookups = null
 before_init = null
 after_init = null
 [initialize.components]
 [initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/output/model-best/meta.json
+++ b/prototypes/fine_tuning_spaCy/output/model-best/meta.json
@ -0,0 +1,52 @@
 {
  "lang":"de",
  "name":"pipeline",
  "version":"0.0.0",
  "spacy_version":">=3.7.2,<3.8.0",
  "description":"",
  "author":"",
  "email":"",
  "url":"",
  "license":"",
  "spacy_git_version":"a89eae928",
  "vectors":{
    "width":0,
    "vectors":0,
    "keys":0,
    "name":null,
    "mode":"default"
  },
  "labels":{
    "tok2vec":[
    ],
    "ner":[
      "RISIKOPROFIL"
    ]
  },
  "pipeline":[
    "tok2vec",
    "ner"
  ],
  "components":[
    "tok2vec",
    "ner"
  ],
  "disabled":[
  ],
  "performance":{
    "ents_f":1.0,
    "ents_p":1.0,
    "ents_r":1.0,
    "ents_per_type":{
      "RISIKOPROFIL":{
        "p":1.0,
        "r":1.0,
        "f":1.0
      }
    },
    "tok2vec_loss":0.000000011,
    "ner_loss":0.0000000457
  }
 }
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/cfg
@ -0,0 +1,13 @@
 {
  "moves":null,
  "update_with_oracle_cut_size":100,
  "multitasks":[
  ],
  "min_action_freq":1,
  "learn_tokens":false,
  "beam_width":1,
  "beam_density":0.0,
  "beam_update_prob":0.0,
  "incorrect_spans_key":null
 }
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/model
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/model
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves
@ -0,0 +1 @@
 ‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ
--- a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/cfg
@ -0,0 +1,3 @@
 {
 }
--- a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model
--- a/prototypes/fine_tuning_spaCy/output/model-best/tokenizer
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tokenizer
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/key2row
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/key2row
@ -0,0 +1 @@
 <EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/lookups.bin
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/lookups.bin
@ -0,0 +1 @@
 <EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors.cfg
@ -0,0 +1,3 @@
 {
  "mode":"default"
 }
--- a/prototypes/fine_tuning_spaCy/output/model-last/config.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/config.cfg
@ -0,0 +1,145 @@
 [paths]
 train = "./data/train.spacy"
 dev = "./data/train.spacy"
 vectors = null
 init_tok2vec = null
 [system]
 gpu_allocator = null
 seed = 0
 [nlp]
 lang = "de"
 pipeline = ["tok2vec","ner"]
 batch_size = 1000
 disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 [components]
 [components.ner]
 factory = "ner"
 incorrect_spans_key = null
 moves = null
 scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
 rows = [5000,1000,2500,2500]
 include_static_vectors = false
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 96
 depth = 4
 window_size = 1
 maxout_pieces = 3
 [corpora]
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [training]
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 patience = 1600
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
 frozen_components = []
 annotating_components = []
 before_to_disk = null
 before_update = null
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [training.logger]
@loggers = "spacy.ConsoleLogger.v1"
 progress_bar = false
 [training.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [training.score_weights]
 ents_f = 1.0
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
 [pretraining]
 [initialize]
 vectors = ${paths.vectors}
 init_tok2vec = ${paths.init_tok2vec}
 vocab_data = null
 lookups = null
 before_init = null
 after_init = null
 [initialize.components]
 [initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/output/model-last/meta.json
+++ b/prototypes/fine_tuning_spaCy/output/model-last/meta.json
@ -0,0 +1,52 @@
 {
  "lang":"de",
  "name":"pipeline",
  "version":"0.0.0",
  "spacy_version":">=3.7.2,<3.8.0",
  "description":"",
  "author":"",
  "email":"",
  "url":"",
  "license":"",
  "spacy_git_version":"a89eae928",
  "vectors":{
    "width":0,
    "vectors":0,
    "keys":0,
    "name":null,
    "mode":"default"
  },
  "labels":{
    "tok2vec":[
    ],
    "ner":[
      "RISIKOPROFIL"
    ]
  },
  "pipeline":[
    "tok2vec",
    "ner"
  ],
  "components":[
    "tok2vec",
    "ner"
  ],
  "disabled":[
  ],
  "performance":{
    "ents_f":1.0,
    "ents_p":1.0,
    "ents_r":1.0,
    "ents_per_type":{
      "RISIKOPROFIL":{
        "p":1.0,
        "r":1.0,
        "f":1.0
      }
    },
    "tok2vec_loss":0.000000011,
    "ner_loss":0.0000000457
  }
 }
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/cfg
@ -0,0 +1,13 @@
 {
  "moves":null,
  "update_with_oracle_cut_size":100,
  "multitasks":[
  ],
  "min_action_freq":1,
  "learn_tokens":false,
  "beam_width":1,
  "beam_density":0.0,
  "beam_update_prob":0.0,
  "incorrect_spans_key":null
 }
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/model
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/model
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves
@ -0,0 +1 @@
 ‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ
--- a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/cfg
@ -0,0 +1,3 @@
 {
 }
--- a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model
--- a/prototypes/fine_tuning_spaCy/output/model-last/tokenizer
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tokenizer
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/key2row
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/key2row
@ -0,0 +1 @@
 <EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/lookups.bin
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/lookups.bin
@ -0,0 +1 @@
 <EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors.cfg
@ -0,0 +1,3 @@
 {
  "mode":"default"
 }
--- a/prototypes/fine_tuning_spaCy/requirements.txt
+++ b/prototypes/fine_tuning_spaCy/requirements.txt
@ -0,0 +1,4 @@
 spacy==3.7.2
 spacy-transformers==1.3.3
 transformers==4.35.2
 torch==2.1.0
--- a/prototypes/fine_tuning_spaCy/test_model.py
+++ b/prototypes/fine_tuning_spaCy/test_model.py
@ -0,0 +1,27 @@
 # Runs the trained NER pipeline over every page of a pitch book PDF and writes
 # the recognised entities (label, text, 1-based page number) to a JSON file.
 import spacy
 import fitz
 import json
 from pathlib import Path
 nlp = spacy.load("output/model-last")
 input_pdf = Path("../../pitch-books/Pitchbook 1.pdf")
 results = []
 # Open the PDF in a with-block so the document is closed as soon as all pages
 # have been read instead of staying open for the rest of the script.
 with fitz.open(input_pdf) as pdf:
    # Iterating the document yields its pages in order; start=1 produces the
    # human-readable page number stored with each entity.
    for page_number, page in enumerate(pdf, start=1):
        spacy_doc = nlp(page.get_text())
        results.extend(
            {
                "label": ent.label_,
                "entity": ent.text.strip(),
                "page": page_number,
            }
            for ent in spacy_doc.ents
        )
 # ensure_ascii=False keeps non-ASCII characters readable in the output file.
 with open("entities_output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
 print("✅ Extraction completed. Results saved to 'entities_output.json'")
--- a/prototypes/fine_tuning_spaCy/training_data.py
+++ b/prototypes/fine_tuning_spaCy/training_data.py
@ -0,0 +1,26 @@
 # Annotated examples for the RISIKOPROFIL entity.
 # Each item is (text, {"entities": [[start, end, label], ...]}) where start and
 # end are character offsets into text and end is exclusive.
 TRAINING_DATA = [
    # Stand-alone risk profile terms: the whole text is the entity.
    ("Core", {"entities": [[0, 4, "RISIKOPROFIL"]]}),
    ("Core+", {"entities": [[0, 5, "RISIKOPROFIL"]]}),
    ("Core/Core+", {"entities": [[0, 10, "RISIKOPROFIL"]]}),
    ("Value Add", {"entities": [[0, 9, "RISIKOPROFIL"]]}),
    ("Core/Value Add", {"entities": [[0, 14, "RISIKOPROFIL"]]}),
    ("Core+/Value Add", {"entities": [[0, 15, "RISIKOPROFIL"]]}),
    ("Core/Core+/Value Add", {"entities": [[0, 20, "RISIKOPROFIL"]]}),
    # Term embedded in a full sentence: offsets 82-92 cover "core/core+".
    (
        "The RE portfolio of the fund is a good illustration of Fond expertise in European core/core+ investments .",
        {"entities": [[82, 92, "RISIKOPROFIL"]]},
    ),
 ]
--- a/prototypes/fine_tuning_spaCy/training_model.py
+++ b/prototypes/fine_tuning_spaCy/training_model.py
@ -0,0 +1,35 @@
 # Dreji18 (2024): GitHub: NER-Training-Spacy-3.0. https://github.com/dreji18/NER-Training-Spacy-3.0 (10.05.2024).
 # SpaCy (2024): SpaCy Training Pipelines & Models. https://spacy.io/usage/training (10.05.2024).
 #
 # Builds ./data/train.spacy from TRAINING_DATA: every text is tokenized, its
 # annotated character offsets are turned into entity spans, and the resulting
 # Doc objects are stored in a DocBin.
 import os
 import spacy
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import DocBin
 from spacy.util import compile_infix_regex
 from tqdm import tqdm
 from training_data import TRAINING_DATA
 # Only make_doc (the tokenizer) is used below, so a blank German pipeline is
 # enough. This removes the need for the separately installed de_core_news_sm
 # package and uses the same language ("de") as the training config.
 nlp = spacy.blank("de")
 # create a DocBin object
 db = DocBin()
 for text, annot in tqdm(TRAINING_DATA):
    doc = nlp.make_doc(text)
    ents = []
    # add character indexes
    for start, end, label in annot["entities"]:
        # "contract" shrinks a span whose offsets fall inside tokens to the
        # tokens that lie completely within it; None means nothing was left.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
        else:
            ents.append(span)
            print(f"Entity sucessful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
    # label the text with the ents
    doc.ents = ents
    db.add(doc)
 # save the DocBin object
 os.makedirs("./data", exist_ok=True)
 db.to_disk("./data/train.spacy")
		`@ -0,0 +1 @@`
							`‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ`