Test spacy training with riskprofile

2025-04-22 20:05:37 +02:00 · 2025-04-22 20:05:37 +02:00 · 7cf96232e9
parent b8882ae99f
commit 7cf96232e9
37 changed files with 4507 additions and 0 deletions
--- a/prototypes/fine_tuning_spaCy/.python-version
+++ b/prototypes/fine_tuning_spaCy/.python-version
@ -0,0 +1 @@
+3.11.8
--- a/prototypes/fine_tuning_spaCy/pycache/training_data.cpython-311.pyc
+++ b/prototypes/fine_tuning_spaCy/pycache/training_data.cpython-311.pyc
--- a/prototypes/fine_tuning_spaCy/base_config.cfg
+++ b/prototypes/fine_tuning_spaCy/base_config.cfg
@ -0,0 +1,85 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = ./data/train.spacy
+# NOTE(review): dev is the same file as train, so every score computed on
+# corpora.dev is measured on the training examples themselves and says nothing
+# about generalization. Point this at a held-out .spacy file once one exists.
+dev = ./data/train.spacy
+vectors = null
+[system]
+gpu_allocator = null
+
+[nlp]
+lang = "de"
+pipeline = ["tok2vec","ner"]
+batch_size = 1000
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 1000, 2500, 2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[initialize]
+vectors = ${paths.vectors}
--- a/prototypes/fine_tuning_spaCy/config.cfg
+++ b/prototypes/fine_tuning_spaCy/config.cfg
@ -0,0 +1,145 @@
+[paths]
+train = "./data/train.spacy"
+dev = "./data/train.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = null
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["tok2vec","ner"]
+batch_size = 1000
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+vectors = {"@vectors":"spacy.Vectors.v1"}
+
+[components]
+
+[components.ner]
+factory = "ner"
+incorrect_spans_key = null
+moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+upstream = "*"
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,1000,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/convert_to_spacy.py
+++ b/prototypes/fine_tuning_spaCy/convert_to_spacy.py
@ -0,0 +1,20 @@
+"""Convert TRAINING_DATA into spaCy's binary DocBin format (data/train.spacy)."""
+from pathlib import Path
+
+import spacy
+from spacy.tokens import DocBin
+
+from training_data import TRAINING_DATA
+
+OUTPUT_PATH = Path("data/train.spacy")
+
+# Only the tokenizer is used (via make_doc), so a blank German pipeline is enough.
+nlp = spacy.blank("de")
+doc_bin = DocBin()
+
+for text, annotations in TRAINING_DATA:
+    doc = nlp.make_doc(text)
+    ents = []
+    for start, end, label in annotations["entities"]:
+        # char_span() yields None when the character offsets do not line up
+        # with token boundaries; such annotations are reported and dropped.
+        span = doc.char_span(start, end, label=label)
+        if span is None:
+            print(f"⚠️ Skipping entity: ({start}, {end}, {label}) in: {text}")
+        else:
+            ents.append(span)
+    doc.ents = ents
+    doc_bin.add(doc)
+
+# Create the target directory first so the script also works when ./data does
+# not exist yet (to_disk() itself does not create missing parent directories).
+OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+doc_bin.to_disk(OUTPUT_PATH)
--- a/prototypes/fine_tuning_spaCy/data/train.spacy
+++ b/prototypes/fine_tuning_spaCy/data/train.spacy
--- a/prototypes/fine_tuning_spaCy/entities_output.json
+++ b/prototypes/fine_tuning_spaCy/entities_output.json
--- a/prototypes/fine_tuning_spaCy/output/model-best/config.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/config.cfg
@ -0,0 +1,145 @@
+[paths]
+train = "./data/train.spacy"
+dev = "./data/train.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = null
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["tok2vec","ner"]
+batch_size = 1000
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+vectors = {"@vectors":"spacy.Vectors.v1"}
+
+[components]
+
+[components.ner]
+factory = "ner"
+incorrect_spans_key = null
+moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+upstream = "*"
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,1000,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/output/model-best/meta.json
+++ b/prototypes/fine_tuning_spaCy/output/model-best/meta.json
@ -0,0 +1,52 @@
+{
+  "lang":"de",
+  "name":"pipeline",
+  "version":"0.0.0",
+  "spacy_version":">=3.7.2,<3.8.0",
+  "description":"",
+  "author":"",
+  "email":"",
+  "url":"",
+  "license":"",
+  "spacy_git_version":"a89eae928",
+  "vectors":{
+    "width":0,
+    "vectors":0,
+    "keys":0,
+    "name":null,
+    "mode":"default"
+  },
+  "labels":{
+    "tok2vec":[
+
+    ],
+    "ner":[
+      "RISIKOPROFIL"
+    ]
+  },
+  "pipeline":[
+    "tok2vec",
+    "ner"
+  ],
+  "components":[
+    "tok2vec",
+    "ner"
+  ],
+  "disabled":[
+
+  ],
+  "performance":{
+    "ents_f":1.0,
+    "ents_p":1.0,
+    "ents_r":1.0,
+    "ents_per_type":{
+      "RISIKOPROFIL":{
+        "p":1.0,
+        "r":1.0,
+        "f":1.0
+      }
+    },
+    "tok2vec_loss":0.000000011,
+    "ner_loss":0.0000000457
+  }
+}
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/cfg
@ -0,0 +1,13 @@
+{
+  "moves":null,
+  "update_with_oracle_cut_size":100,
+  "multitasks":[
+
+  ],
+  "min_action_freq":1,
+  "learn_tokens":false,
+  "beam_width":1,
+  "beam_density":0.0,
+  "beam_update_prob":0.0,
+  "incorrect_spans_key":null
+}
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/model
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/model
--- a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves
+++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves
@ -0,0 +1 @@
+‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ
--- a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/cfg
@ -0,0 +1,3 @@
+{
+
+}
--- a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model
--- a/prototypes/fine_tuning_spaCy/output/model-best/tokenizer
+++ b/prototypes/fine_tuning_spaCy/output/model-best/tokenizer
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/key2row
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/key2row
@ -0,0 +1 @@
+<EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/lookups.bin
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/lookups.bin
@ -0,0 +1 @@
+<EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors
--- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/vectors.cfg
@ -0,0 +1,3 @@
+{
+  "mode":"default"
+}
--- a/prototypes/fine_tuning_spaCy/output/model-last/config.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/config.cfg
@ -0,0 +1,145 @@
+[paths]
+train = "./data/train.spacy"
+dev = "./data/train.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = null
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["tok2vec","ner"]
+batch_size = 1000
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+vectors = {"@vectors":"spacy.Vectors.v1"}
+
+[components]
+
+[components.ner]
+factory = "ner"
+incorrect_spans_key = null
+moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+upstream = "*"
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,1000,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
--- a/prototypes/fine_tuning_spaCy/output/model-last/meta.json
+++ b/prototypes/fine_tuning_spaCy/output/model-last/meta.json
@ -0,0 +1,52 @@
+{
+  "lang":"de",
+  "name":"pipeline",
+  "version":"0.0.0",
+  "spacy_version":">=3.7.2,<3.8.0",
+  "description":"",
+  "author":"",
+  "email":"",
+  "url":"",
+  "license":"",
+  "spacy_git_version":"a89eae928",
+  "vectors":{
+    "width":0,
+    "vectors":0,
+    "keys":0,
+    "name":null,
+    "mode":"default"
+  },
+  "labels":{
+    "tok2vec":[
+
+    ],
+    "ner":[
+      "RISIKOPROFIL"
+    ]
+  },
+  "pipeline":[
+    "tok2vec",
+    "ner"
+  ],
+  "components":[
+    "tok2vec",
+    "ner"
+  ],
+  "disabled":[
+
+  ],
+  "performance":{
+    "ents_f":1.0,
+    "ents_p":1.0,
+    "ents_r":1.0,
+    "ents_per_type":{
+      "RISIKOPROFIL":{
+        "p":1.0,
+        "r":1.0,
+        "f":1.0
+      }
+    },
+    "tok2vec_loss":0.000000011,
+    "ner_loss":0.0000000457
+  }
+}
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/cfg
@ -0,0 +1,13 @@
+{
+  "moves":null,
+  "update_with_oracle_cut_size":100,
+  "multitasks":[
+
+  ],
+  "min_action_freq":1,
+  "learn_tokens":false,
+  "beam_width":1,
+  "beam_density":0.0,
+  "beam_update_prob":0.0,
+  "incorrect_spans_key":null
+}
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/model
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/model
--- a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves
+++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves
@ -0,0 +1 @@
+‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ
--- a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/cfg
@ -0,0 +1,3 @@
+{
+
+}
--- a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model
--- a/prototypes/fine_tuning_spaCy/output/model-last/tokenizer
+++ b/prototypes/fine_tuning_spaCy/output/model-last/tokenizer
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/key2row
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/key2row
@ -0,0 +1 @@
+<EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/lookups.bin
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/lookups.bin
@ -0,0 +1 @@
+<EFBFBD>
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors
--- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors.cfg
+++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/vectors.cfg
@ -0,0 +1,3 @@
+{
+  "mode":"default"
+}
--- a/prototypes/fine_tuning_spaCy/requirements.txt
+++ b/prototypes/fine_tuning_spaCy/requirements.txt
@ -0,0 +1,4 @@
+spacy==3.7.2
+spacy-transformers==1.3.3
+transformers==4.35.2
+torch==2.1.0
+# test_model.py does "import fitz", which is provided by PyMuPDF (not by the
+# unrelated "fitz" package on PyPI). Unpinned: pin once the tested version is known.
+PyMuPDF
+# training_model.py imports tqdm directly, so declare it explicitly.
+tqdm
--- a/prototypes/fine_tuning_spaCy/test_model.py
+++ b/prototypes/fine_tuning_spaCy/test_model.py
@ -0,0 +1,27 @@
+"""Run the trained NER pipeline over a pitch-book PDF and dump the entities as JSON."""
+import json
+from pathlib import Path
+
+import fitz
+import spacy
+
+MODEL_DIR = "output/model-last"
+INPUT_PDF = Path("../../pitch-books/Pitchbook 1.pdf")
+OUTPUT_JSON = Path("entities_output.json")
+
+nlp = spacy.load(MODEL_DIR)
+
+results = []
+
+# Open the PDF as a context manager so the file handle is released even if
+# text extraction or the NER pipeline raises part-way through.
+with fitz.open(INPUT_PDF) as pdf:
+    # Page numbers in the output are 1-based.
+    for page_number, page in enumerate(pdf, start=1):
+        spacy_doc = nlp(page.get_text())
+        for ent in spacy_doc.ents:
+            results.append({
+                "label": ent.label_,
+                "entity": ent.text.strip(),
+                "page": page_number,
+            })
+
+with OUTPUT_JSON.open("w", encoding="utf-8") as f:
+    json.dump(results, f, indent=2, ensure_ascii=False)
+
+print("✅ Extraction completed. Results saved to 'entities_output.json'")
--- a/prototypes/fine_tuning_spaCy/training_data.py
+++ b/prototypes/fine_tuning_spaCy/training_data.py
@ -0,0 +1,26 @@
+# Entity label attached to every annotated risk-profile mention.
+_LABEL = "RISIKOPROFIL"
+
+
+def _example(text, phrase=None):
+    """Build one (text, annotations) training pair.
+
+    Without a phrase the whole text is annotated as a single entity;
+    otherwise the first occurrence of phrase inside text is annotated.
+    """
+    start = 0 if phrase is None else text.index(phrase)
+    length = len(text) if phrase is None else len(phrase)
+    return (text, {"entities": [[start, start + length, _LABEL]]})
+
+
+# Each item is (text, {"entities": [[start_char, end_char, label], ...]}).
+TRAINING_DATA = [
+    _example("Core"),
+    _example("Core+"),
+    _example("Core/Core+"),
+    _example("Value Add"),
+    _example("Core/Value Add"),
+    _example("Core+/Value Add"),
+    _example("Core/Core+/Value Add"),
+    _example(
+        "The RE portfolio of the fund is a good illustration of Fond expertise in European core/core+ investments .",
+        phrase="core/core+",
+    ),
+]
--- a/prototypes/fine_tuning_spaCy/training_model.py
+++ b/prototypes/fine_tuning_spaCy/training_model.py
@ -0,0 +1,35 @@
+# Dreji18 (2024): GitHub: NER-Training-Spacy-3.0. https://github.com/dreji18/NER-Training-Spacy-3.0 (10.05.2024).
+# SpaCy (2024): SpaCy Training Pipelines & Models. https://spacy.io/usage/training (10.05.2024).
+#
+# Serializes TRAINING_DATA into a DocBin at ./data/train.spacy for "spacy train".
+
+import os
+import spacy
+from spacy.tokenizer import Tokenizer
+from spacy.tokens import DocBin
+from spacy.util import compile_infix_regex
+from tqdm import tqdm
+
+from training_data import TRAINING_DATA
+
+# Only the tokenizer is needed here (make_doc), so a blank German pipeline
+# suffices. This avoids depending on the separately downloaded
+# "de_core_news_sm" package and matches convert_to_spacy.py and the
+# lang = "de" setting in config.cfg.
+nlp = spacy.blank("de")
+
+# create a DocBin object
+db = DocBin()
+
+for text, annot in tqdm(TRAINING_DATA):
+    doc = nlp.make_doc(text)
+    ents = []
+    # add character indexes
+    for start, end, label in annot["entities"]:
+        # "contract" shrinks the span to the tokens lying fully inside
+        # [start, end); char_span() returns None if no token fits.
+        span = doc.char_span(start, end, label=label, alignment_mode="contract")
+        if span is None:
+            print(f"Skipping entity: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
+        else:
+            ents.append(span)
+            print(f"Entity successful: |{text[start:end]}| Start: {start}, End: {end}, Label: {label}")
+    # label the text with the ents
+    doc.ents = ents
+    db.add(doc)
+
+# save the DocBin object
+os.makedirs("./data", exist_ok=True)
+db.to_disk("./data/train.spacy")
				`@ -0,0 +1 @@`
				`‚ĄmovesŮx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}Łcfg<66>§neg_keyŔ`