diff --git a/prototypes/Parser/Pitchbook3.pdf b/prototypes/Parser/Pitchbook3.pdf new file mode 100644 index 0000000..ea60f84 Binary files /dev/null and b/prototypes/Parser/Pitchbook3.pdf differ diff --git a/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-312.pyc b/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-312.pyc new file mode 100644 index 0000000..2f55aa3 Binary files /dev/null and b/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-312.pyc differ diff --git a/prototypes/fine_tuning_spaCy/data/train.spacy b/prototypes/fine_tuning_spaCy/data/train.spacy index 7d9dfbb..e488733 100644 Binary files a/prototypes/fine_tuning_spaCy/data/train.spacy and b/prototypes/fine_tuning_spaCy/data/train.spacy differ diff --git a/prototypes/fine_tuning_spaCy/entities_output.json b/prototypes/fine_tuning_spaCy/entities_output.json index 6d98983..fb7268b 100644 --- a/prototypes/fine_tuning_spaCy/entities_output.json +++ b/prototypes/fine_tuning_spaCy/entities_output.json @@ -1,167 +1,232 @@ [ { - "label": "RISIKOPROFIL", - "entity": "Core and Core+", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,0 %", "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "core, core+, value-added", - "page": 7 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5Mio.€", + "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "Core/Core+", - "page": 10 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "1,40 %", + "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "core/core+", - "page": 10 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,91 %", + "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "Core/Core+", - "page": 10 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,0 %", + "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "UK, DE, BE, NL, LU,", - "page": 10 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "0,1%", + "page": 4 }, { - "label": "RISIKOPROFIL", - "entity": "Core / Core +", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,0%", + "page": 5 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4 %", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "800.000", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "600.000", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "400.000", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "500.000 |", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4", + "page": 9 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5.20%", + "page": 11 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5%", + "page": 11 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4%", + "page": 11 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5.20%", "page": 12 }, { - "label": "RISIKOPROFIL", - "entity": "core\n/ core+", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5%", "page": 12 }, { - "label": "RISIKOPROFIL", - "entity": "core", - "page": 12 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "559 BGB", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "Term / core+", - "page": 12 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4 Soa", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "core/core+", - "page": 12 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4 —", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "6,4 6,4", - "page": 13 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "558 BGB", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "Country /", - "page": 14 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "557 BGB", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore", - "page": 14 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "3-6", + "page": 16 }, { - "label": "RISIKOPROFIL", - "entity": "Country /", - "page": 15 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "Vor NK-Optimlerung", + "page": 17 }, { - "label": "RISIKOPROFIL", - "entity": "Core\nCore\nCore\nCore\nCore\nCore", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "countries, giving", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "K.", "page": 18 }, { - "label": "RISIKOPROFIL", - "entity": "core/core+", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4 an", "page": 20 }, { "label": "RISIKOPROFIL", - "entity": "core/core+", - "page": 20 + "entity": "Value Adjustments", + "page": 25 }, { "label": "RISIKOPROFIL", - "entity": "D, and", - "page": 21 + "entity": "Dach/", + "page": 28 }, { "label": "RISIKOPROFIL", - "entity": "UK, DE, BE, NL, LU,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "core or", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core +", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "kgCO,e", + "entity": "CO2-Emissionen", "page": 30 }, { - "label": "RISIKOPROFIL", - "entity": "C,", - "page": 32 + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "UM\\", + "page": 30 }, { - "label": "RISIKOPROFIL", - "entity": "KfW, Dwp", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5%", + "page": 30 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5%", + "page": 31 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "5%", + "page": 31 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "1%", "page": 35 }, { - "label": "RISIKOPROFIL", - "entity": "Bank,", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "CO2-Emissionen", "page": 38 }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "84,0%", + "page": 42 + }, { "label": "RISIKOPROFIL", - "entity": "Core", - "page": 38 + "entity": "Core, Core +", + "page": 42 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "0,1 %", + "page": 42 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "0,44 %", + "page": 42 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "1,40 %", + "page": 42 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "491%", + "page": 42 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,0%", + "page": 42 + }, + { + "label": "RISIKOPROFIL", + "entity": "Core, Core +", + "page": 44 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "1,40%", + "page": 45 + }, + { + "label": "AUSSCHÜTTUNGSRENDITE", + "entity": "4,0%", + "page": 45 } ] \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/meta.json b/prototypes/fine_tuning_spaCy/output/model-best/meta.json index daabc76..7e74640 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/meta.json +++ b/prototypes/fine_tuning_spaCy/output/model-best/meta.json @@ -21,6 +21,7 @@ ], "ner":[ + "AUSSCH\u00dcTTUNGSRENDITE", "RISIKOPROFIL" ] }, @@ -36,17 +37,22 @@ ], "performance":{ - "ents_f":1.0, - "ents_p":1.0, + "ents_f":0.9107142857, + "ents_p":0.8360655738, "ents_r":1.0, "ents_per_type":{ "RISIKOPROFIL":{ "p":1.0, "r":1.0, "f":1.0 + }, + "AUSSCH\u00dcTTUNGSRENDITE":{ + "p":0.6296296296, + "r":1.0, + "f":0.7727272727 } }, - "tok2vec_loss":0.000000029, - "ner_loss":0.0000000614 + "tok2vec_loss":34.4831294568, + "ner_loss":1020.9595334249 } } \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/ner/model b/prototypes/fine_tuning_spaCy/output/model-best/ner/model index 8a0c5ae..af79c1c 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-best/ner/model and b/prototypes/fine_tuning_spaCy/output/model-best/ner/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves index e72ba15..ed9a1b8 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves +++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves @@ -1 +1 @@ -movesx{"0":{},"1":{"RISIKOPROFIL":45},"2":{"RISIKOPROFIL":45},"3":{"RISIKOPROFIL":45},"4":{"RISIKOPROFIL":45,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file +moves{"0":{},"1":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"2":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"3":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"4":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model index 1cfa6a5..10673a9 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model and b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json index 7569f1d..dabd0da 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json +++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json @@ -6,6 +6,7 @@ "\"", "$", "%", + "&", "'", "''", "'-(", @@ -50,6 +51,8 @@ "*", "+", ",", + ",00", + ",25", "-", "-((", "-))", @@ -113,6 +116,7 @@ "0%+", "0.0", "0.o", + "018", "022", "032", "034", @@ -122,21 +126,41 @@ "1.", "10", "12", + "14", "2", "2.", "20", + "200", + "2018", "2022", "2032", "2034", "250", "3", + "3,8", "3.", "33", "333", "35", + "4", + "4,0", + "4,5", + "4,6", + "4,7", + "4,8", + "4,9", + "40", "5", + "5,0", + "5,00", + "5,25", + "5,3", + "5,7", "50", + "6", + "60", "7", + "7,5", "7,50%+", "8", "8)", @@ -243,6 +267,7 @@ ">:o", ">:x", "><(((*>", + "?", "@", "@_@", "A", @@ -250,6 +275,8 @@ "A.C.", "A.D.", "A.G.", + "AIN", + "AUSSCH\u00dcTTUNGSRENDITE", "Abb", "Abb.", "Abbildung", @@ -262,34 +289,51 @@ "Abt.", "Abteilung", "Add", + "Aktive", "Aktueller", + "AlF", "Allgemeine", "Amsterdam", + "Ankaufs-", + "Anlagebedingungen", "Anlagestrategien", + "Anlagesusschuss", "Anlagevehikels", "Ansprechpartners", "Antagevehikels", "Apr", "Apr.", "April", + "Area", "Art", + "Artikel", "Assets", "Aufl\u00f6sung", "Aufwertung", "Aug", "Aug.", "August", + "Auschittungsrendite", + "Auschu\u0308ttungsrendite", + "Ausschu\u0308ttung", + "Ausschu\u0308ttungsintervalle", + "Ausschu\u0308ttungsrandite", + "Ausschu\u0308ttungsrendite", "B", "B.", "B.A.", "B.Sc", "B.Sc.", + "BELGIUM", "Bahnhof", "Band", + "Baumanagementgebahr", "Bd", "Bd.", "Beispiel", + "Benefits", "Berlin", + "Bestandsentwicklung", "Bestandsentwicklungen", "Betr", "Betr.", @@ -300,11 +344,13 @@ "Bhf.", "Biol", "Biol.", + "Brussels", "Bsp", "Bsp.", "C", "C++", "C.", + "CBD", "CDU", "CDU/CSU", "CSU", @@ -313,15 +359,19 @@ "Cie", "Cie.", "Cities", + "Closed", "Closings", "Co", "Co.", + "Considering", "Core", "Core+", "Core+/Value", + "Country", "D", "D.", "D.C.", + "Deutsche", "Deutschland", "Deutschlands", "Dez", @@ -335,22 +385,29 @@ "Dipl.", "Dipl.-Ing", "Dipl.-Ing.", + "Dis", "Do", "Do.", "Donnerstag", "Dr", "Dr.", + "Drawbacks", "D\u00e4nemark", "E", "E-Mail", "E.", + "ERD", + "Einmalige", "Einw", + "Einwohner", + "Estate", "Europe", "European", "Exit", "F", "F.", "FIL", + "FRANCE", "Fa", "Fa.", "Fam", @@ -364,8 +421,10 @@ "Fonds", "Fr", "Fr.", + "France", "Frankreich", "Frau", + "French", "Frl", "Frl.", "Fr\u00e4ulein", @@ -374,9 +433,13 @@ "G.", "G.m.b", "G.m.b.H.", + "GEDis", + "GERD", + "GRRE", "Gateway", "Gebr", "Gebr.", + "Gebu\u0308hren", "Geplantes", "Gesamtrendite", "H", @@ -390,6 +453,8 @@ "Herrn", "Hg", "Hg.", + "Hotel", + "Hotels", "Hr", "Hr.", "Hrn", @@ -402,10 +467,14 @@ "II.", "III", "III.", + "INK", "INREV", "IRR", + "ITE", + "IUM", "IV", "IV.", + "Im", "Immobilien", "Immobilien-Spezialfonds", "Inc", @@ -414,6 +483,8 @@ "Ing", "Ing.", "Investmentmanagers", + "Investtionszeltraum", + "It", "J", "Jahr", "Jahre", @@ -438,13 +509,20 @@ "K.O.", "Kaufen", "Key", + "Kosten", "L", "L'", "L.A.", + "LT", "LTV", "LTY", "Laufzeit", + "Levallois", + "Limited", + "Lisbon", + "Logistics", "London", + "Lux-based", "L\u00e4nderallokation", "L\u2019", "M", @@ -452,16 +530,22 @@ "M.A.", "M.Sc", "M.Sc.", + "MGallery", + "Madrid", + "Manage", "Manager", "Maximaler", "Mehrwertsteuer", + "Metropolregianen", "Metropolregionen", "Mi", "Mi.", "Milliarde", "Million", + "Mindestanlage", "Mio", "Mio.", + "Mitgliedschaft", "Mittwoch", "Mo", "Mo.", @@ -480,8 +564,12 @@ "N", "N.Y.", "N.Y.C.", + "NCE", + "NDS", + "NETHERLANDS", "Nachvermietungsstrategie", "Name", + "Navigate", "Niederlande", "Nov", "Nov.", @@ -489,15 +577,19 @@ "Nr", "Nr.", "Nummer", + "Nuremberg", "O", "O.", "O.K.", "O.O", "O.o", + "OPCI", "O_O", "O_o", + "Of", "Offen", "Offener", + "Offices", "Okt", "Okt.", "Oktober", @@ -507,13 +599,18 @@ "Original", "P", "P.S.", + "PCI", + "Parformanceabh\u00e4ngige", "Paris", "Pkt", "Pkt.", + "Prime", "Prof", "Prof.", "Professor", + "Prognostizierte", "Punkt", + "Qin", "R", "R.", "R.I.P.", @@ -521,19 +618,29 @@ "REV", "RISIKOPROFIL", "ROOT", + "RRE", + "Real", "Rechtsform", "Red", "Red.", "Redaktion", + "Regionen", + "Rents", + "Residential", + "Retail", "Risikoprofil", "Risk", + "Rotterdam", "S", "S'", + "SPAIN", + "SRI", "Sa", "Sa.", "Samstag", "Sc.", "Schweden", + "Sector", "Sep", "Sep.", "Sept", @@ -544,28 +651,35 @@ "So", "So.", "Sonntag", + "Spezial-AlF", "St", "St.", "Standortaufwertungsstrategie", "Standorte", "Std", "Std.", + "Steuern", "Stil", "Str", "Str.", "Strategie", "Stra\u00dfe", "Struktur", + "Strukturierungsgebu\u0308hr", "Stunde", + "St\u00e4dten", "S\u2019", "T", "T.", "Tausend", + "Teflimmobilfe)-", "Tel", "Tel.", "Telefon", "Telefonnummer", + "Tenant", "The", + "There", "Tr", "Tr.", "Tsd", @@ -575,6 +689,7 @@ "U.S.", "U.S.A.", "U.S.S.", + "USt", "Univ", "Univ.", "Universit\u00e4t", @@ -583,10 +698,14 @@ "V.V", "V_V", "Value", + "Vergu\u0308tung", + "Verkaufs", + "Verkaufs-", "Vol", "Vol.", "W", "Wertstabile", + "Why", "Wohnimmobilien", "X'", "X++", @@ -608,6 +727,9 @@ "XXX.", "XXX/XXX", "XXXX", + "XXXxx", + "XXx", + "XXxxxx", "X_X", "X_x", "Xx", @@ -615,6 +737,7 @@ "XxXx", "XxXx.", "Xxx", + "Xxx-xxxx", "Xxx.", "Xxxx", "Xxxx+", @@ -625,8 +748,14 @@ "Xxxx.-Xxx", "Xxxx.-Xxx.", "Xxxxx", + "Xxxxx)-", "Xxxxx)/Xxxx", + "Xxxxx-", + "Xxxxx-XxX", "Xxxxx-Xxxxx", + "Xxxxx\u0308xx", + "Xxxxx\u0308xxxx", + "Xxxx\u0308xxxx", "Xx\u0308xxxx", "X\u2019", "Z", @@ -635,6 +764,7 @@ "Ziel-Netto-IRR", "Zielanlagestrategie", "Zielregionfen)/Jand", + "Zielrendite", "Zielsektoren", "Zielvolumen", "Ziirraiaein", @@ -688,56 +818,98 @@ "abzgl.", "abz\u00fcglich", "ace", + "ach", + "act", + "active", "add", "adv", "adv.", + "aft", + "age", + "aha", "ahr", "ail", "aiming", + "ain", + "ake", + "aktive", "aktueller", "al", "al.", + "alf", + "all", "allg", "allg.", "allgemein", "allgemeine", + "also", "am.", "ame", "amsterdam", + "an", "an.", "and", + "ankaufs-", + "anlagebedingungen", "anlagestrategien", + "anlagesusschuss", "anlagevehikels", + "ans", "ansprechpartners", + "ant", "antagevehikels", "apr", "apr.", + "are", + "area", "ark", + "ars", "art", + "artikel", + "as", "asset", + "assetor", "assets", "at", "at.", + "ate", "ath", + "attractive", "auf", "aufl\u00f6sung", "aufwertung", "auf\u2019m", "aug", "aug.", + "aum", + "auschittungsrendite", + "auschu\u0308ttungsrendite", "ausgew\u00e4hlte", + "ausschu\u0308ttung", + "ausschu\u0308ttungsintervalle", + "ausschu\u0308ttungsrandite", + "ausschu\u0308ttungsrendite", + "aussch\u00fcttungsrendite", + "ave", "ax.", "b", "b.", "b.a.", "b.sc", "b.sc.", + "balanced", + "basis", + "baumanagementgebahr", "bb.", "bd", "bd.", + "be", "beispielsweise", + "belgium", + "benefits", "ber", "berlin", + "bestandsentwicklung", "bestandsentwicklungen", "betr", "betr.", @@ -750,13 +922,18 @@ "biol", "biol.", "bk.", + "bon", "br.", + "brussels", "bs.", "bsp", "bsp.", "bspw", "bspw.", "bt.", + "build", + "buildings", + "but", "bzgl", "bzgl.", "bzw", @@ -768,27 +945,49 @@ "ca", "ca.", "cal", + "can", + "capital", + "cbd", "cdu", "cdu/csu", "ce>", + "ced", + "ces", + "cey", + "characteristic", + "che", "chr", "chr.", "cht", "cie", "cie.", "cities", + "city", + "cks", "cl.", + "closed", "closings", "co", "co.", + "collective", + "combines", + "competition", + "considered", + "considering", "core", "core+", "core+/value", + "could", + "country", + "creation", "csu", "cts", + "current", "d", "d'", "d)", + "d,d", + "d,dd", "d,dd%+", "d-", "d-)", @@ -808,11 +1007,14 @@ "dddd", "de", "ded", + "deep", + "defined", "dem", "den", "der", "dergleichen", "des", + "deutsche", "deutschland", "deutschlands", "development", @@ -827,17 +1029,23 @@ "dipl.", "dipl.-ing", "dipl.-ing.", + "dis", + "diversification", + "diversity", + "dividend", "do", "do.", "don", "dr", "dr.", + "drawbacks", "du", "du\u2019s", "dv.", "d\u00e4nemark", "d\u2019", "e", + "e)-", "e-mail", "e.", "e.V.", @@ -845,13 +1053,16 @@ "e.g", "e.g.", "e.v.", + "eal", "ean", "eb.", "ebd", "ebd.", "ebenda", "ebr", + "economic", "ed.", + "eep", "egr", "egy", "ehem", @@ -863,12 +1074,18 @@ "eine", "einem", "einen", + "einer", + "einmalige", "einw", + "einwohner", "eit", "el.", + "eld", "els", "em.", "en.", + "end", + "ended", "engl", "engl.", "englisch", @@ -876,17 +1093,25 @@ "ent", "entspr", "entspr.", + "eons", "ep.", "ept", "er", "er.", + "erd", + "ere", + "erg", "erm", "erm.", + "ern", "err", "ers", "ersten", + "ery", "er\u2019s", "es", + "ess", + "estate", "etc", "etc.", "etr", @@ -899,8 +1124,10 @@ "evtl", "evtl.", "exit", + "experienced", "expertise", "exposure", + "extract", "ez.", "e\u2019s", "f", @@ -912,18 +1139,26 @@ "feb", "feb.", "fen", + "festen", "festgelegter", + "fierce", "fil", + "flagship", "fond", "fonds", + "for", + "form", "fr", "fr.", + "france", "frankreich", "franz\u00f6sisch", + "french", "frl", "frl.", "frz", "frz.", + "fs-", "fs.", "fund", "fu\u0308hrende", @@ -937,6 +1172,9 @@ "geb.", "gebr", "gebr.", + "gebu\u0308hren", + "ged", + "gedis", "gegebenenfalls", "gegen\u00fcber", "gegr", @@ -945,8 +1183,10 @@ "gem", "gem.", "gen", + "geographies", "geplantes", "ger", + "gerd", "gesamtrendite", "gf.", "gfs", @@ -956,11 +1196,14 @@ "ggfs.", "gg\u00fc", "gg\u00fc.", + "ght", "gie", "gl.", "good", "gr.", + "grre", "gtl", + "guarantee", "g\u00fc.", "h", "h.", @@ -968,17 +1211,23 @@ "h.c.", "halten", "halten-strategie", + "have", "hbf", "hbf.", "hd.", "hed", "hem", + "here", "hf.", "hg", "hg.", + "high", "hil", "hinter", "hinter\u2019m", + "hip", + "his", + "hotel", "hotels", "hr", "hr.", @@ -1005,12 +1254,18 @@ "i.o.", "i.tr.", "i.v.", + "ial", + "ice", "ich", "ich\u2019s", + "ics", + "ide", "ie.", "ien", "ies", "ig.", + "ige", + "igh", "ihr", "ihr\u2019s", "ii", @@ -1018,11 +1273,14 @@ "iii", "iii.", "il.", + "ild", "ile", "illustration", + "im", "ime", "immobilien", "immobilien-spezialfonds", + "improve", "in", "in.", "inc", @@ -1035,15 +1293,21 @@ "ing", "ing.", "initiatives", + "ink", "inkl", "inkl.", "inklusive", "inrev", + "ins", "insb", "insb.", "insbesondere", + "interesting", + "investment", "investmentmanagers", "investments", + "investor", + "investtionszeltraum", "inw", "io.", "iol", @@ -1054,13 +1318,16 @@ "ise", "isk", "iss", + "it", "ite", "ith", + "its", "ity", "itz", "ium", "iv", "iv.", + "ive", "j", "j.", "jahr", @@ -1079,6 +1346,7 @@ "jun.", "jur", "jur.", + "j\u00e4hrliche", "k", "k.", "k.o.", @@ -1086,9 +1354,11 @@ "kath.", "katholisch", "kaufen", + "kel", "ket", "key", "kl.", + "kosten", "kt.", "l", "l'", @@ -1100,21 +1370,38 @@ "laufzeit", "laut", "le.g", + "leave", "ler", + "less", + "levallois", "level", + "leverage", + "leveraged", + "leveraging", "lg.", + "limited", "lin", "lio", + "liquid", + "liquidity", + "lisbon", + "listed", + "lle", "llg", + "lls", "llt", "llv", + "logistics", "london", + "low", + "lso", "lt", "lt.", "lte", "ltv", "lty", "lue", + "lux-based", "lv.", "l\u00e4nderallokation", "l\u2019", @@ -1128,30 +1415,41 @@ "m.m.", "m.sc", "m.sc.", + "madrid", + "main", + "make", "mal", + "manage", "management", "manager", "market", + "markets", "max", "max.", "maximal", "maximaler", + "means", "men", "mer", + "metropolregianen", "metropolregionen", + "mgallery", "mi", "mi.", + "mic", "million", "min", "min.", "mind", "mind.", + "mindestanlage", "mindestens", "minimal", "minor", "mio", "mio.", "mit", + "mitgliedschaft", "mo", "mo.", "monatlich", @@ -1163,6 +1461,7 @@ "mrz.", "mtl", "mtl.", + "must", "mwst", "mwst.", "m\u00e4r", @@ -1174,32 +1473,42 @@ "n.r", "n.y.", "n.y.c.", + "nach", "nachvermietungsstrategie", "name", "nat", "nat.", + "navigate", "nc.", + "nce", + "nch", "ncl", "nd.", "nde", "nds", "ne", + "ned", "nem", "nen", "ner", + "nes", + "netherlands", "ng.", "ngl", "ngs", "niederlande", "niv", "nkl", + "no", "nor", "nov", "nov.", "nr", "nr.", "nsb", + "nse", "nts", + "nuremberg", "o", "o.", "o.0", @@ -1216,24 +1525,45 @@ "o_0", "o_O", "o_o", + "objectives", "of", "of.", + "ofQin", "offen", "offener", + "offers", + "office", + "offices", + "ofqin", "og.", + "ois", "okt", "okt.", "ol.", + "ome", + "on", "ond", + "ons", "ood", + "oom", + "opci", "opco", "ope", + "open", + "open-ended", + "operators", + "opportunities", + "or", + "order", "ore", "orig", "orig.", "original", "orm", + "ors", + "outside", "ov.", + "ove", "over", "p", "p.", @@ -1242,51 +1572,88 @@ "p.s", "p.s.", "pCo", + "pan-European", + "pan-european", + "parformanceabh\u00e4ngige", "paris", + "pci", "pco", + "pen", "pers", "pers.", "phil", "phil.", + "pid", "pkt", "pkt.", "pl.", "portfolio", + "potential", "pr.", "premium", + "preservation", + "pricey", + "prime", "prof", "prof.", "profile", + "prognostizierte", + "program", "projects", + "providing", "pt.", "pw.", "q", "q.", "q.e.d", "q.e.d.", + "qin", "quality", "r", "r.", "r.i.p.", + "ram", + "rapid", + "rce", + "rd", "rd.", "re", "re+", + "rea", + "real", + "recent", "rechtsform", + "recovery", "red", "red.", + "region", + "regionen", + "remains", "ren", + "rent", + "rents", "rer", "rer.", + "residential", + "respective", + "retail", "rev", + "reversion", + "rid", "rig", + "right", "ris", "risikoprofil", "risk", "rl.", "rm.", + "rms", "rn.", "rof", + "room", + "rotterdam", "rr.", + "rre", "rs.", "rsg", "rte", @@ -1307,8 +1674,13 @@ "sb.", "schweden", "sd.", + "sector", + "sectors", + "sed", + "segment", "sen", "sen.", + "sense", "sep", "sep.", "sept", @@ -1318,16 +1690,22 @@ "sg.", "sie", "sie\u2019s", + "sis", "sitz", "skandinavien", + "small", "so", "so.", "sog", "sog.", + "some", "sp.", "space", + "spain", + "spezial-alf", "spr", "spw", + "sri", "ss.", "st", "st.", @@ -1337,34 +1715,53 @@ "std.", "stellv", "stellv.", + "steuern", "stil", "str", "str.", "strategie", + "strategies", "strategy", "struktur", + "strukturierungsgebu\u0308hr", + "st\u00e4dten", "sw.", "s\u2019", "t", "t.", "tactical", + "tal", + "targeting", + "targets", "tc.", "td.", + "ted", + "tee", + "teflimmobilfe)-", "tel", "tel.", "telefonnummer", "ten", + "tenant", "ter", + "terms", "tes", "th.", "the", + "there", + "this", + "through", + "tic", "tig", "til", "time", "tl.", "to", + "top", + "tor", "tr", "tr.", + "try", "tsd", "tsd.", "tur", @@ -1384,23 +1781,34 @@ "u.u.", "u.v.m", "u.v.m.", + "ufs", "ug.", + "ugh", + "uid", "ul.", + "uld", + "umliegende", "un.", "und", + "under", + "undertaking", "ung", "univ", "univ.", + "unless", "unter", "unter\u2019m", "ur.", "ure", "usf", "usf.", + "uss", + "ust", "usw", "usw.", "uvm", "uvm.", + "u\u0308ber", "u\u2019s", "v", "v.", @@ -1415,9 +1823,14 @@ "v_v", "value", "value-added", + "var", "vel", "ver", "vergleiche", + "vergu\u0308tung", + "verkaufs", + "verkaufs-", + "very", "ves", "vgl", "vgl.", @@ -1430,6 +1843,7 @@ "vol", "vol.", "vom", + "von", "vor", "vor\u2019m", "vs", @@ -1438,9 +1852,11 @@ "w", "w.", "wSt", + "walls", "way", "well-established", "wertstabile", + "why", "wir", "wir\u2019s", "wiss", @@ -1477,24 +1893,34 @@ "xx", "xx.", "xx.x", + "xxXxx", "xxx", + "xxx-Xxxxx", + "xxx-xxxx", "xxx.", "xxxx", + "xxxx)-", "xxxx)/xxxx", "xxxx+", "xxxx+/xxxx", + "xxxx-", "xxxx-xxx", "xxxx-xxxx", "xxxx-xxxx-xxx", "xxxx.", + "xxxx\u0308xx", + "xxxx\u0308xxxx", "xxxx\u2019x", "xxx\u2019x", "xx\u0308xxxx", "xx\u2019x", + "x\u0308xxx", "x\u2019", "x\ufe35x", "y", "y.", + "years", + "yield", "z", "z.", "z.B.", @@ -1513,6 +1939,7 @@ "ziel-netto-irr", "zielanlagestrategie", "zielregionfen)/jand", + "zielrendite", "zielsektoren", "zielvolumen", "ziirraiaein", @@ -1520,6 +1947,7 @@ "zw.", "zzgl", "zzgl.", + "{", "|", "}", "\u00a0", @@ -1554,6 +1982,7 @@ "\u00fcber", "\u00fcbersicht", "\u00fcber\u2019m", + "\u0308hr", "\u0ca0", "\u0ca0_\u0ca0", "\u0ca0\ufe35\u0ca0", @@ -1574,6 +2003,8 @@ "\u2019xxx", "\u2019\u2019", "\u201a", + "\u201c", + "\u201d", "\u20ac", "\u2501", "\u253b", diff --git a/prototypes/fine_tuning_spaCy/output/model-last/meta.json b/prototypes/fine_tuning_spaCy/output/model-last/meta.json index daabc76..7e74640 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/meta.json +++ b/prototypes/fine_tuning_spaCy/output/model-last/meta.json @@ -21,6 +21,7 @@ ], "ner":[ + "AUSSCH\u00dcTTUNGSRENDITE", "RISIKOPROFIL" ] }, @@ -36,17 +37,22 @@ ], "performance":{ - "ents_f":1.0, - "ents_p":1.0, + "ents_f":0.9107142857, + "ents_p":0.8360655738, "ents_r":1.0, "ents_per_type":{ "RISIKOPROFIL":{ "p":1.0, "r":1.0, "f":1.0 + }, + "AUSSCH\u00dcTTUNGSRENDITE":{ + "p":0.6296296296, + "r":1.0, + "f":0.7727272727 } }, - "tok2vec_loss":0.000000029, - "ner_loss":0.0000000614 + "tok2vec_loss":34.4831294568, + "ner_loss":1020.9595334249 } } \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-last/ner/model b/prototypes/fine_tuning_spaCy/output/model-last/ner/model index 8a0c5ae..af79c1c 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-last/ner/model and b/prototypes/fine_tuning_spaCy/output/model-last/ner/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves index e72ba15..ed9a1b8 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves +++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves @@ -1 +1 @@ -movesx{"0":{},"1":{"RISIKOPROFIL":45},"2":{"RISIKOPROFIL":45},"3":{"RISIKOPROFIL":45},"4":{"RISIKOPROFIL":45,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file +moves{"0":{},"1":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"2":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"3":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40},"4":{"RISIKOPROFIL":91,"AUSSCH\u00dcTTUNGSRENDITE":40,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model index 1cfa6a5..10673a9 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model and b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json index 7569f1d..dabd0da 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json +++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json @@ -6,6 +6,7 @@ "\"", "$", "%", + "&", "'", "''", "'-(", @@ -50,6 +51,8 @@ "*", "+", ",", + ",00", + ",25", "-", "-((", "-))", @@ -113,6 +116,7 @@ "0%+", "0.0", "0.o", + "018", "022", "032", "034", @@ -122,21 +126,41 @@ "1.", "10", "12", + "14", "2", "2.", "20", + "200", + "2018", "2022", "2032", "2034", "250", "3", + "3,8", "3.", "33", "333", "35", + "4", + "4,0", + "4,5", + "4,6", + "4,7", + "4,8", + "4,9", + "40", "5", + "5,0", + "5,00", + "5,25", + "5,3", + "5,7", "50", + "6", + "60", "7", + "7,5", "7,50%+", "8", "8)", @@ -243,6 +267,7 @@ ">:o", ">:x", "><(((*>", + "?", "@", "@_@", "A", @@ -250,6 +275,8 @@ "A.C.", "A.D.", "A.G.", + "AIN", + "AUSSCH\u00dcTTUNGSRENDITE", "Abb", "Abb.", "Abbildung", @@ -262,34 +289,51 @@ "Abt.", "Abteilung", "Add", + "Aktive", "Aktueller", + "AlF", "Allgemeine", "Amsterdam", + "Ankaufs-", + "Anlagebedingungen", "Anlagestrategien", + "Anlagesusschuss", "Anlagevehikels", "Ansprechpartners", "Antagevehikels", "Apr", "Apr.", "April", + "Area", "Art", + "Artikel", "Assets", "Aufl\u00f6sung", "Aufwertung", "Aug", "Aug.", "August", + "Auschittungsrendite", + "Auschu\u0308ttungsrendite", + "Ausschu\u0308ttung", + "Ausschu\u0308ttungsintervalle", + "Ausschu\u0308ttungsrandite", + "Ausschu\u0308ttungsrendite", "B", "B.", "B.A.", "B.Sc", "B.Sc.", + "BELGIUM", "Bahnhof", "Band", + "Baumanagementgebahr", "Bd", "Bd.", "Beispiel", + "Benefits", "Berlin", + "Bestandsentwicklung", "Bestandsentwicklungen", "Betr", "Betr.", @@ -300,11 +344,13 @@ "Bhf.", "Biol", "Biol.", + "Brussels", "Bsp", "Bsp.", "C", "C++", "C.", + "CBD", "CDU", "CDU/CSU", "CSU", @@ -313,15 +359,19 @@ "Cie", "Cie.", "Cities", + "Closed", "Closings", "Co", "Co.", + "Considering", "Core", "Core+", "Core+/Value", + "Country", "D", "D.", "D.C.", + "Deutsche", "Deutschland", "Deutschlands", "Dez", @@ -335,22 +385,29 @@ "Dipl.", "Dipl.-Ing", "Dipl.-Ing.", + "Dis", "Do", "Do.", "Donnerstag", "Dr", "Dr.", + "Drawbacks", "D\u00e4nemark", "E", "E-Mail", "E.", + "ERD", + "Einmalige", "Einw", + "Einwohner", + "Estate", "Europe", "European", "Exit", "F", "F.", "FIL", + "FRANCE", "Fa", "Fa.", "Fam", @@ -364,8 +421,10 @@ "Fonds", "Fr", "Fr.", + "France", "Frankreich", "Frau", + "French", "Frl", "Frl.", "Fr\u00e4ulein", @@ -374,9 +433,13 @@ "G.", "G.m.b", "G.m.b.H.", + "GEDis", + "GERD", + "GRRE", "Gateway", "Gebr", "Gebr.", + "Gebu\u0308hren", "Geplantes", "Gesamtrendite", "H", @@ -390,6 +453,8 @@ "Herrn", "Hg", "Hg.", + "Hotel", + "Hotels", "Hr", "Hr.", "Hrn", @@ -402,10 +467,14 @@ "II.", "III", "III.", + "INK", "INREV", "IRR", + "ITE", + "IUM", "IV", "IV.", + "Im", "Immobilien", "Immobilien-Spezialfonds", "Inc", @@ -414,6 +483,8 @@ "Ing", "Ing.", "Investmentmanagers", + "Investtionszeltraum", + "It", "J", "Jahr", "Jahre", @@ -438,13 +509,20 @@ "K.O.", "Kaufen", "Key", + "Kosten", "L", "L'", "L.A.", + "LT", "LTV", "LTY", "Laufzeit", + "Levallois", + "Limited", + "Lisbon", + "Logistics", "London", + "Lux-based", "L\u00e4nderallokation", "L\u2019", "M", @@ -452,16 +530,22 @@ "M.A.", "M.Sc", "M.Sc.", + "MGallery", + "Madrid", + "Manage", "Manager", "Maximaler", "Mehrwertsteuer", + "Metropolregianen", "Metropolregionen", "Mi", "Mi.", "Milliarde", "Million", + "Mindestanlage", "Mio", "Mio.", + "Mitgliedschaft", "Mittwoch", "Mo", "Mo.", @@ -480,8 +564,12 @@ "N", "N.Y.", "N.Y.C.", + "NCE", + "NDS", + "NETHERLANDS", "Nachvermietungsstrategie", "Name", + "Navigate", "Niederlande", "Nov", "Nov.", @@ -489,15 +577,19 @@ "Nr", "Nr.", "Nummer", + "Nuremberg", "O", "O.", "O.K.", "O.O", "O.o", + "OPCI", "O_O", "O_o", + "Of", "Offen", "Offener", + "Offices", "Okt", "Okt.", "Oktober", @@ -507,13 +599,18 @@ "Original", "P", "P.S.", + "PCI", + "Parformanceabh\u00e4ngige", "Paris", "Pkt", "Pkt.", + "Prime", "Prof", "Prof.", "Professor", + "Prognostizierte", "Punkt", + "Qin", "R", "R.", "R.I.P.", @@ -521,19 +618,29 @@ "REV", "RISIKOPROFIL", "ROOT", + "RRE", + "Real", "Rechtsform", "Red", "Red.", "Redaktion", + "Regionen", + "Rents", + "Residential", + "Retail", "Risikoprofil", "Risk", + "Rotterdam", "S", "S'", + "SPAIN", + "SRI", "Sa", "Sa.", "Samstag", "Sc.", "Schweden", + "Sector", "Sep", "Sep.", "Sept", @@ -544,28 +651,35 @@ "So", "So.", "Sonntag", + "Spezial-AlF", "St", "St.", "Standortaufwertungsstrategie", "Standorte", "Std", "Std.", + "Steuern", "Stil", "Str", "Str.", "Strategie", "Stra\u00dfe", "Struktur", + "Strukturierungsgebu\u0308hr", "Stunde", + "St\u00e4dten", "S\u2019", "T", "T.", "Tausend", + "Teflimmobilfe)-", "Tel", "Tel.", "Telefon", "Telefonnummer", + "Tenant", "The", + "There", "Tr", "Tr.", "Tsd", @@ -575,6 +689,7 @@ "U.S.", "U.S.A.", "U.S.S.", + "USt", "Univ", "Univ.", "Universit\u00e4t", @@ -583,10 +698,14 @@ "V.V", "V_V", "Value", + "Vergu\u0308tung", + "Verkaufs", + "Verkaufs-", "Vol", "Vol.", "W", "Wertstabile", + "Why", "Wohnimmobilien", "X'", "X++", @@ -608,6 +727,9 @@ "XXX.", "XXX/XXX", "XXXX", + "XXXxx", + "XXx", + "XXxxxx", "X_X", "X_x", "Xx", @@ -615,6 +737,7 @@ "XxXx", "XxXx.", "Xxx", + "Xxx-xxxx", "Xxx.", "Xxxx", "Xxxx+", @@ -625,8 +748,14 @@ "Xxxx.-Xxx", "Xxxx.-Xxx.", "Xxxxx", + "Xxxxx)-", "Xxxxx)/Xxxx", + "Xxxxx-", + "Xxxxx-XxX", "Xxxxx-Xxxxx", + "Xxxxx\u0308xx", + "Xxxxx\u0308xxxx", + "Xxxx\u0308xxxx", "Xx\u0308xxxx", "X\u2019", "Z", @@ -635,6 +764,7 @@ "Ziel-Netto-IRR", "Zielanlagestrategie", "Zielregionfen)/Jand", + "Zielrendite", "Zielsektoren", "Zielvolumen", "Ziirraiaein", @@ -688,56 +818,98 @@ "abzgl.", "abz\u00fcglich", "ace", + "ach", + "act", + "active", "add", "adv", "adv.", + "aft", + "age", + "aha", "ahr", "ail", "aiming", + "ain", + "ake", + "aktive", "aktueller", "al", "al.", + "alf", + "all", "allg", "allg.", "allgemein", "allgemeine", + "also", "am.", "ame", "amsterdam", + "an", "an.", "and", + "ankaufs-", + "anlagebedingungen", "anlagestrategien", + "anlagesusschuss", "anlagevehikels", + "ans", "ansprechpartners", + "ant", "antagevehikels", "apr", "apr.", + "are", + "area", "ark", + "ars", "art", + "artikel", + "as", "asset", + "assetor", "assets", "at", "at.", + "ate", "ath", + "attractive", "auf", "aufl\u00f6sung", "aufwertung", "auf\u2019m", "aug", "aug.", + "aum", + "auschittungsrendite", + "auschu\u0308ttungsrendite", "ausgew\u00e4hlte", + "ausschu\u0308ttung", + "ausschu\u0308ttungsintervalle", + "ausschu\u0308ttungsrandite", + "ausschu\u0308ttungsrendite", + "aussch\u00fcttungsrendite", + "ave", "ax.", "b", "b.", "b.a.", "b.sc", "b.sc.", + "balanced", + "basis", + "baumanagementgebahr", "bb.", "bd", "bd.", + "be", "beispielsweise", + "belgium", + "benefits", "ber", "berlin", + "bestandsentwicklung", "bestandsentwicklungen", "betr", "betr.", @@ -750,13 +922,18 @@ "biol", "biol.", "bk.", + "bon", "br.", + "brussels", "bs.", "bsp", "bsp.", "bspw", "bspw.", "bt.", + "build", + "buildings", + "but", "bzgl", "bzgl.", "bzw", @@ -768,27 +945,49 @@ "ca", "ca.", "cal", + "can", + "capital", + "cbd", "cdu", "cdu/csu", "ce>", + "ced", + "ces", + "cey", + "characteristic", + "che", "chr", "chr.", "cht", "cie", "cie.", "cities", + "city", + "cks", "cl.", + "closed", "closings", "co", "co.", + "collective", + "combines", + "competition", + "considered", + "considering", "core", "core+", "core+/value", + "could", + "country", + "creation", "csu", "cts", + "current", "d", "d'", "d)", + "d,d", + "d,dd", "d,dd%+", "d-", "d-)", @@ -808,11 +1007,14 @@ "dddd", "de", "ded", + "deep", + "defined", "dem", "den", "der", "dergleichen", "des", + "deutsche", "deutschland", "deutschlands", "development", @@ -827,17 +1029,23 @@ "dipl.", "dipl.-ing", "dipl.-ing.", + "dis", + "diversification", + "diversity", + "dividend", "do", "do.", "don", "dr", "dr.", + "drawbacks", "du", "du\u2019s", "dv.", "d\u00e4nemark", "d\u2019", "e", + "e)-", "e-mail", "e.", "e.V.", @@ -845,13 +1053,16 @@ "e.g", "e.g.", "e.v.", + "eal", "ean", "eb.", "ebd", "ebd.", "ebenda", "ebr", + "economic", "ed.", + "eep", "egr", "egy", "ehem", @@ -863,12 +1074,18 @@ "eine", "einem", "einen", + "einer", + "einmalige", "einw", + "einwohner", "eit", "el.", + "eld", "els", "em.", "en.", + "end", + "ended", "engl", "engl.", "englisch", @@ -876,17 +1093,25 @@ "ent", "entspr", "entspr.", + "eons", "ep.", "ept", "er", "er.", + "erd", + "ere", + "erg", "erm", "erm.", + "ern", "err", "ers", "ersten", + "ery", "er\u2019s", "es", + "ess", + "estate", "etc", "etc.", "etr", @@ -899,8 +1124,10 @@ "evtl", "evtl.", "exit", + "experienced", "expertise", "exposure", + "extract", "ez.", "e\u2019s", "f", @@ -912,18 +1139,26 @@ "feb", "feb.", "fen", + "festen", "festgelegter", + "fierce", "fil", + "flagship", "fond", "fonds", + "for", + "form", "fr", "fr.", + "france", "frankreich", "franz\u00f6sisch", + "french", "frl", "frl.", "frz", "frz.", + "fs-", "fs.", "fund", "fu\u0308hrende", @@ -937,6 +1172,9 @@ "geb.", "gebr", "gebr.", + "gebu\u0308hren", + "ged", + "gedis", "gegebenenfalls", "gegen\u00fcber", "gegr", @@ -945,8 +1183,10 @@ "gem", "gem.", "gen", + "geographies", "geplantes", "ger", + "gerd", "gesamtrendite", "gf.", "gfs", @@ -956,11 +1196,14 @@ "ggfs.", "gg\u00fc", "gg\u00fc.", + "ght", "gie", "gl.", "good", "gr.", + "grre", "gtl", + "guarantee", "g\u00fc.", "h", "h.", @@ -968,17 +1211,23 @@ "h.c.", "halten", "halten-strategie", + "have", "hbf", "hbf.", "hd.", "hed", "hem", + "here", "hf.", "hg", "hg.", + "high", "hil", "hinter", "hinter\u2019m", + "hip", + "his", + "hotel", "hotels", "hr", "hr.", @@ -1005,12 +1254,18 @@ "i.o.", "i.tr.", "i.v.", + "ial", + "ice", "ich", "ich\u2019s", + "ics", + "ide", "ie.", "ien", "ies", "ig.", + "ige", + "igh", "ihr", "ihr\u2019s", "ii", @@ -1018,11 +1273,14 @@ "iii", "iii.", "il.", + "ild", "ile", "illustration", + "im", "ime", "immobilien", "immobilien-spezialfonds", + "improve", "in", "in.", "inc", @@ -1035,15 +1293,21 @@ "ing", "ing.", "initiatives", + "ink", "inkl", "inkl.", "inklusive", "inrev", + "ins", "insb", "insb.", "insbesondere", + "interesting", + "investment", "investmentmanagers", "investments", + "investor", + "investtionszeltraum", "inw", "io.", "iol", @@ -1054,13 +1318,16 @@ "ise", "isk", "iss", + "it", "ite", "ith", + "its", "ity", "itz", "ium", "iv", "iv.", + "ive", "j", "j.", "jahr", @@ -1079,6 +1346,7 @@ "jun.", "jur", "jur.", + "j\u00e4hrliche", "k", "k.", "k.o.", @@ -1086,9 +1354,11 @@ "kath.", "katholisch", "kaufen", + "kel", "ket", "key", "kl.", + "kosten", "kt.", "l", "l'", @@ -1100,21 +1370,38 @@ "laufzeit", "laut", "le.g", + "leave", "ler", + "less", + "levallois", "level", + "leverage", + "leveraged", + "leveraging", "lg.", + "limited", "lin", "lio", + "liquid", + "liquidity", + "lisbon", + "listed", + "lle", "llg", + "lls", "llt", "llv", + "logistics", "london", + "low", + "lso", "lt", "lt.", "lte", "ltv", "lty", "lue", + "lux-based", "lv.", "l\u00e4nderallokation", "l\u2019", @@ -1128,30 +1415,41 @@ "m.m.", "m.sc", "m.sc.", + "madrid", + "main", + "make", "mal", + "manage", "management", "manager", "market", + "markets", "max", "max.", "maximal", "maximaler", + "means", "men", "mer", + "metropolregianen", "metropolregionen", + "mgallery", "mi", "mi.", + "mic", "million", "min", "min.", "mind", "mind.", + "mindestanlage", "mindestens", "minimal", "minor", "mio", "mio.", "mit", + "mitgliedschaft", "mo", "mo.", "monatlich", @@ -1163,6 +1461,7 @@ "mrz.", "mtl", "mtl.", + "must", "mwst", "mwst.", "m\u00e4r", @@ -1174,32 +1473,42 @@ "n.r", "n.y.", "n.y.c.", + "nach", "nachvermietungsstrategie", "name", "nat", "nat.", + "navigate", "nc.", + "nce", + "nch", "ncl", "nd.", "nde", "nds", "ne", + "ned", "nem", "nen", "ner", + "nes", + "netherlands", "ng.", "ngl", "ngs", "niederlande", "niv", "nkl", + "no", "nor", "nov", "nov.", "nr", "nr.", "nsb", + "nse", "nts", + "nuremberg", "o", "o.", "o.0", @@ -1216,24 +1525,45 @@ "o_0", "o_O", "o_o", + "objectives", "of", "of.", + "ofQin", "offen", "offener", + "offers", + "office", + "offices", + "ofqin", "og.", + "ois", "okt", "okt.", "ol.", + "ome", + "on", "ond", + "ons", "ood", + "oom", + "opci", "opco", "ope", + "open", + "open-ended", + "operators", + "opportunities", + "or", + "order", "ore", "orig", "orig.", "original", "orm", + "ors", + "outside", "ov.", + "ove", "over", "p", "p.", @@ -1242,51 +1572,88 @@ "p.s", "p.s.", "pCo", + "pan-European", + "pan-european", + "parformanceabh\u00e4ngige", "paris", + "pci", "pco", + "pen", "pers", "pers.", "phil", "phil.", + "pid", "pkt", "pkt.", "pl.", "portfolio", + "potential", "pr.", "premium", + "preservation", + "pricey", + "prime", "prof", "prof.", "profile", + "prognostizierte", + "program", "projects", + "providing", "pt.", "pw.", "q", "q.", "q.e.d", "q.e.d.", + "qin", "quality", "r", "r.", "r.i.p.", + "ram", + "rapid", + "rce", + "rd", "rd.", "re", "re+", + "rea", + "real", + "recent", "rechtsform", + "recovery", "red", "red.", + "region", + "regionen", + "remains", "ren", + "rent", + "rents", "rer", "rer.", + "residential", + "respective", + "retail", "rev", + "reversion", + "rid", "rig", + "right", "ris", "risikoprofil", "risk", "rl.", "rm.", + "rms", "rn.", "rof", + "room", + "rotterdam", "rr.", + "rre", "rs.", "rsg", "rte", @@ -1307,8 +1674,13 @@ "sb.", "schweden", "sd.", + "sector", + "sectors", + "sed", + "segment", "sen", "sen.", + "sense", "sep", "sep.", "sept", @@ -1318,16 +1690,22 @@ "sg.", "sie", "sie\u2019s", + "sis", "sitz", "skandinavien", + "small", "so", "so.", "sog", "sog.", + "some", "sp.", "space", + "spain", + "spezial-alf", "spr", "spw", + "sri", "ss.", "st", "st.", @@ -1337,34 +1715,53 @@ "std.", "stellv", "stellv.", + "steuern", "stil", "str", "str.", "strategie", + "strategies", "strategy", "struktur", + "strukturierungsgebu\u0308hr", + "st\u00e4dten", "sw.", "s\u2019", "t", "t.", "tactical", + "tal", + "targeting", + "targets", "tc.", "td.", + "ted", + "tee", + "teflimmobilfe)-", "tel", "tel.", "telefonnummer", "ten", + "tenant", "ter", + "terms", "tes", "th.", "the", + "there", + "this", + "through", + "tic", "tig", "til", "time", "tl.", "to", + "top", + "tor", "tr", "tr.", + "try", "tsd", "tsd.", "tur", @@ -1384,23 +1781,34 @@ "u.u.", "u.v.m", "u.v.m.", + "ufs", "ug.", + "ugh", + "uid", "ul.", + "uld", + "umliegende", "un.", "und", + "under", + "undertaking", "ung", "univ", "univ.", + "unless", "unter", "unter\u2019m", "ur.", "ure", "usf", "usf.", + "uss", + "ust", "usw", "usw.", "uvm", "uvm.", + "u\u0308ber", "u\u2019s", "v", "v.", @@ -1415,9 +1823,14 @@ "v_v", "value", "value-added", + "var", "vel", "ver", "vergleiche", + "vergu\u0308tung", + "verkaufs", + "verkaufs-", + "very", "ves", "vgl", "vgl.", @@ -1430,6 +1843,7 @@ "vol", "vol.", "vom", + "von", "vor", "vor\u2019m", "vs", @@ -1438,9 +1852,11 @@ "w", "w.", "wSt", + "walls", "way", "well-established", "wertstabile", + "why", "wir", "wir\u2019s", "wiss", @@ -1477,24 +1893,34 @@ "xx", "xx.", "xx.x", + "xxXxx", "xxx", + "xxx-Xxxxx", + "xxx-xxxx", "xxx.", "xxxx", + "xxxx)-", "xxxx)/xxxx", "xxxx+", "xxxx+/xxxx", + "xxxx-", "xxxx-xxx", "xxxx-xxxx", "xxxx-xxxx-xxx", "xxxx.", + "xxxx\u0308xx", + "xxxx\u0308xxxx", "xxxx\u2019x", "xxx\u2019x", "xx\u0308xxxx", "xx\u2019x", + "x\u0308xxx", "x\u2019", "x\ufe35x", "y", "y.", + "years", + "yield", "z", "z.", "z.B.", @@ -1513,6 +1939,7 @@ "ziel-netto-irr", "zielanlagestrategie", "zielregionfen)/jand", + "zielrendite", "zielsektoren", "zielvolumen", "ziirraiaein", @@ -1520,6 +1947,7 @@ "zw.", "zzgl", "zzgl.", + "{", "|", "}", "\u00a0", @@ -1554,6 +1982,7 @@ "\u00fcber", "\u00fcbersicht", "\u00fcber\u2019m", + "\u0308hr", "\u0ca0", "\u0ca0_\u0ca0", "\u0ca0\ufe35\u0ca0", @@ -1574,6 +2003,8 @@ "\u2019xxx", "\u2019\u2019", "\u201a", + "\u201c", + "\u201d", "\u20ac", "\u2501", "\u253b", diff --git a/prototypes/fine_tuning_spaCy/requirements.txt b/prototypes/fine_tuning_spaCy/requirements.txt index 675d470..6367b84 100644 --- a/prototypes/fine_tuning_spaCy/requirements.txt +++ b/prototypes/fine_tuning_spaCy/requirements.txt @@ -1,4 +1,4 @@ spacy==3.7.2 spacy-transformers==1.3.3 transformers==4.35.2 -torch==2.1.0 \ No newline at end of file +torch \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/test_model.py b/prototypes/fine_tuning_spaCy/test_model.py index 7286d43..277bd1e 100644 --- a/prototypes/fine_tuning_spaCy/test_model.py +++ b/prototypes/fine_tuning_spaCy/test_model.py @@ -4,7 +4,7 @@ import json from pathlib import Path nlp = spacy.load("output/model-last") -input_pdf = Path("../ocr/output/Pitchbook 1-OCR.pdf") +input_pdf = Path("../ocr/output/Pitchbook 3-OCR.pdf") doc = fitz.open(input_pdf) diff --git a/prototypes/fine_tuning_spaCy/training_data.py b/prototypes/fine_tuning_spaCy/training_data.py index ed2e4d3..246b2d0 100644 --- a/prototypes/fine_tuning_spaCy/training_data.py +++ b/prototypes/fine_tuning_spaCy/training_data.py @@ -1,66 +1,210 @@ TRAINING_DATA = [ ( "Core", - {"entities":[[0,4,"RISIKOPROFIL"]]}, + {"entities": [[0, 4, "RISIKOPROFIL"]]}, ), ( "Core+", - {"entities":[[0,5,"RISIKOPROFIL"]]}, + {"entities": [[0, 5, "RISIKOPROFIL"]]}, ), ( "Core/Core+", - {"entities":[[0,10,"RISIKOPROFIL"]]}, + {"entities": [[0, 10, "RISIKOPROFIL"]]}, ), ( "Value Add", - {"entities":[[0,9,"RISIKOPROFIL"]]}, + {"entities": [[0, 9, "RISIKOPROFIL"]]}, ), - ( + ( "Core/Value Add", - {"entities":[[0,14,"RISIKOPROFIL"]]}, + {"entities": [[0, 14, "RISIKOPROFIL"]]}, ), - ( + ( "Core+/Value Add", - {"entities":[[0,15,"RISIKOPROFIL"]]}, + {"entities": [[0, 15, "RISIKOPROFIL"]]}, ), - ( + ( "Core/Core+/Value Add", - {"entities":[[0,20,"RISIKOPROFIL"]]}, - ), + {"entities": [[0, 20, "RISIKOPROFIL"]]}, + ), ( "The RE portfolio of the fund is a good illustration of Fond expertise in European core/core+ investments .", - {"entities":[[82,92,"RISIKOPROFIL"]]}, + {"entities": [[82, 92, "RISIKOPROFIL"]]}, ), ( "Risk level: Core/Core+", - {"entities":[[12,22,"RISIKOPROFIL"]]}, + {"entities": [[12, 22, "RISIKOPROFIL"]]}, ), - ( + ( "Different risk profile (core, core+, value-added)", - {"entities":[[24,48,"RISIKOPROFIL"]]}, - ), + {"entities": [[24, 48, "RISIKOPROFIL"]]}, + ), ( "Core/Core+ with OpCo premium", - {"entities":[[0,10,"RISIKOPROFIL"]]}, + {"entities": [[0, 10, "RISIKOPROFIL"]]}, ), - ( + ( "Core /Core+ Assets, well-established = Key Gateway Cities in Europe le.g. hotels in the market with minor asset London, Paris, Amsterdam, Berlin] management initiatives", - {"entities":[[0,11,"RISIKOPROFIL"]]}, + {"entities": [[0, 11, "RISIKOPROFIL"]]}, ), ( "Risikoprofil: Core, Core +", - {"entities":[[14,26,"RISIKOPROFIL"]]}, + {"entities": [[14, 26, "RISIKOPROFIL"]]}, ), ( "Name des Fonds Name des Investmentmanagers Allgemeine Informationen Name des Ansprechpartners Telefonnummer des Ansprechpartners E-Mail des Ansprechpartners Art des Anlagevehikels Struktur des Anlagevehikels Sitz des Anlagevehikels Struktur des Antagevehikels vom Manager festgelegter Stil Rechtsform Jahr des ersten Closings Laufzeit Geplantes Jahr der Auflösung Ziel-Netto-IRR / Gesamtrendite* Zielvolumen des Anlagevehikels Ziel-LTY ‚Aktueller LTV Ziirraiaein Maximaler LTV Zielregionfen)/Jand Zielsektoren Zielanlagestrategie INREV Fonds Offen Deutschland Core, Core + Offener Immobilien-Spezialfonds 2022 10 - 12 Jahre 2032 - 2034 7,50%+ 250 Mio. € 20% 0% 20% Führende Metropolregionen Deutschlands und ausgewählte Standorte >50T Einw. Wohnimmobilien Wertstabile Wohnimmobilien (mit Bestandsentwicklungen)", - {"entities":[[560,572,"RISIKOPROFIL"]]}, + {"entities": [[560, 572, "RISIKOPROFIL"]]}, ), ( "Core/Core+ strategy, with tactical exposure to development projects aiming at enhancing the quality of the portfolio over time", - {"entities":[[0,10,"RISIKOPROFIL"]]}, + {"entities": [[0, 10, "RISIKOPROFIL"]]}, ), ( "Strategie - Übersicht Risikoprofil Core+ Halten-Strategie Kaufen — Halten (langfristig) — Exit 1. Nachvermietungsstrategie Anlagestrategien 2. Standortaufwertungsstrategie 3. Strategie der Aufwertung der Immobilien Niederlande (max. 35 %) Länderallokation Frankreich (max. 35 %) (in % vom Zielvolumen) Skandinavien (Schweden, Dänemark) (max. 35 %) Deutschland (<= 10 %)", - {"entities":[[35,40,"RISIKOPROFIL"]]}, + {"entities": [[35, 40, "RISIKOPROFIL"]]}, + ), + ( + "Core and Core+", + {"entities": [[0, 14, "RISIKOPROFIL"]]}, + ), + ( + "core, core+, value-added", + {"entities": [[0, 24, "RISIKOPROFIL"]]}, + ), + ( + "Manage to Core: max 20%", + {"entities": [[10, 14, "RISIKOPROFIL"]]}, + ), + ( + "Benefits of the core/ core+ segment", + {"entities": [[16, 27, "RISIKOPROFIL"]]}, + ), + ( + "Drawbacks of the core/ core+ segment", + {"entities": [[17, 28, "RISIKOPROFIL"]]}, + ), + ( + "Why a Core / Core + investment program?", + {"entities": [[6, 19, "RISIKOPROFIL"]]}, + ), + ( + "Different risk profile (core, core+, value-added)", + {"entities": [[24, 48, "RISIKOPROFIL"]]}, + ), + ( + "INK MGallery Hotel Area: Amsterdam Core Tenant: Closed in 2018", + {"entities": [[35, 39, "RISIKOPROFIL"]]}, + ), + ( + "A strategy targeting high quality Core and Core+ buildings, with defined SRI objectives, in order to extract value through an active asset management.", + {"entities": [[34, 48, "RISIKOPROFIL"]]}, + ), + ( + "Navigate the diversity of the Core/Core+ investment opportunities in European Prime Cities", + {"entities": [[30, 40, "RISIKOPROFIL"]]}, + ), + ( + "GEDis an open-ended Lux-based fund providing an attractive core/core+ real estate exposure, leveraging GRRE expertise in European RE markets. It offers diversification in terms of pan-European geographies and sectors: Offices, Retail and Hotels.", + {"entities": [[59, 69, "RISIKOPROFIL"]]}, + ), + ( + "Core assets leave less room for active asset management value creation", + {"entities": [[0, 4, "RISIKOPROFIL"]]}, + ), + ( + "capital preservation is defined here as a characteristic of core/core+ investments. There is no guarantee of capital.", + {"entities": [[60, 70, "RISIKOPROFIL"]]}, + ), + ( + "Country / city BELGIUM Brussels BELGIUM Brussels SPAIN Madrid FRANCE Levallois FRANCE Paris 14 BELGIUM Brussels NETHERLANDS Rotterdam NETHERLANDS Rotterdam Sector Offices Offices Offices Offices Offices Offices Offices Logistics Risk Core", + {"entities": [[234, 238, "RISIKOPROFIL"]]}, + ), + ( + "GERD(a balanced pan-European open ended retail fund — under the form of a French collective undertaking for Real Estate investments “OPCI”) is the flagship ofQin France and combines RE and listed assets (respective targets of 60% and 40%) with max. 40% leverage. The RE portfolio of the fund is a good illustration Of expertise in European core/core+ investments.", + {"entities": [[340, 350, "RISIKOPROFIL"]]}, + ), + ( + "Prime office assets in Prime markets are very pricey unless rent reversion is real. Risk premium remains attractive on a leveraged basis. Manage to core or build to core can make sense as a LT investor in main cities. Residential is also attractive", + {"entities": [[148, 152, "RISIKOPROFIL"]]}, + ), + ( + "Paris region is a deep and liquid market. Rents have some potential to improve. Considering current low yield and fierce competition, office right outside CBD for Core + assets can be considered. Manage to core strategies could make sense.", + {"entities": [[163, 169, "RISIKOPROFIL"]]}, + ), + ( + "Lisbon is a small market but it experienced a rapid economic recovery in recent years and is interesting for Core Offices, quality Retail assetor Hotel walls with top operators. Limited liquidity of this market means investment must be small", + {"entities": [[109, 113, "RISIKOPROFIL"]]}, + ), + ( + "4,0 %", + {"entities": [[0, 5, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Prognostizierte jährliche Ausschüttung von 4,0%", + {"entities": [[44, 48, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "20% über einer @ Ausschüttungsrendite von 4,0%", + {"entities": [[44, 48, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Prognostizierte Ausschüttungsrandite* Mindestanlage Mitgliedschaft Im Anlagesusschuss Ankaufs- / Verkaufs- / Verkaufs(Teflimmobilfe)- / Baumanagementgebahr (inkl. USt.) Parformanceabhängige Vergütung Einmalige Strukturierungsgebühr Laufzeit / Investtionszeltraum Ausschüttungsintervalle Deutsche Metropolregianen und umliegende Regionen mit Städten >50T Einwohner Artikel 8 Wohnimmobilien Deutschland ‚Aktive Bestandsentwicklung Offener Spezial-AlF mit festen Anlagebedingungen rd. 200 Mio. € / max. 20% rd. 250 Mio. € 7,5 % (nach Kosten & Gebühren, vor Steuern) 8 4,0 % {nach Kosten & Gebühren, var Steuern}", + {"entities": [[570, 575, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "5,00-5,25 % Ausschüttungsrendite", + {"entities": [[0, 11, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Zielrendite 5,00-5,25 % Ausschüttungsrendite", + {"entities": [[12, 23, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 4,9% 5,3%", + {"entities": [[21, 25, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 4,9% 5,3%", + {"entities": [[26, 30, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschittungsrendite 3,8% 5,7%", + {"entities": [[20, 24, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschittungsrendite 3,8% 5,7%", + {"entities": [[25, 29, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 4,5% 4,6%", + {"entities": [[21, 25, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 4,5% 4,6%", + {"entities": [[26, 30, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 5,0% 4,7%", + {"entities": [[26, 30, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite 5,0% 4,7%", + {"entities": [[21, 25, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite “eons a Nuremberg aha 5,0 % 4,8 %", + {"entities": [[43, 48, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "Auschüttungsrendite “eons a Nuremberg aha 5,0 % 4,8 %", + {"entities": [[49, 54, "AUSSCHÜTTUNGSRENDITE"]]}, + ), + ( + "3-4% dividend yield", + {"entities": [[0, 4, "AUSSCHÜTTUNGSRENDITE"]]}, ) -] \ No newline at end of file +] + + + +