diff --git a/lego/lego_graph_rebrickable.ipynb b/lego/lego_graph.ipynb similarity index 77% rename from lego/lego_graph_rebrickable.ipynb rename to lego/lego_graph.ipynb index 60c5ce4..4ae387a 100644 --- a/lego/lego_graph_rebrickable.ipynb +++ b/lego/lego_graph.ipynb @@ -17,8 +17,7 @@ "source": [ "from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal\n", "import pandas as pd\n", - "from datetime import datetime\n", - "import os" + "from datetime import datetime" ] }, { @@ -88,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "ae505704", "metadata": {}, "outputs": [], @@ -96,6 +95,7 @@ "for color in re_colors.itertuples(index=False):\n", " color_ref = thm[f\"color/{color.id}\"]\n", "\n", + " g.add((color_ref, RDF.type, THM.Color ))\n", " g.add((color_ref, RDFS.label, Literal(color.name, lang=\"en\")))\n", " g.add((color_ref, THM.color, Literal(color.rgb)))\n", " g.add((color_ref, THM.is_transparent, Literal(color.is_trans, datatype=XSD.boolean)))\n", @@ -126,6 +126,7 @@ "for part_category in re_part_categories.itertuples(index=False):\n", " part_category_ref = thm[f\"part_category/{part_category.id}\"]\n", "\n", + " g.add((part_category_ref, RDF.type, THM.PartCategory ))\n", " g.add((part_category_ref, RDFS.label, Literal(part_category_ref, lang=\"en\")))" ] }, @@ -147,6 +148,7 @@ "for part in re_parts.itertuples(index=False):\n", " part_ref = thm[f\"part/{part.part_num}\"]\n", "\n", + " g.add((part_ref, RDF.type, THM.Part))\n", " g.add((part_ref, RDFS.label, Literal(part.name, lang=\"en\")))\n", " g.add((part_ref, THM.part_category, thm[f\"part_category/{part.part_cat_id}\"]))\n", " g.add((part_ref, THM.part_material, Literal(part.part_material)))" @@ -162,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "579b1d67", "metadata": {}, "outputs": [], @@ -171,6 +173,7 @@ " part_ref = thm[f\"part/{element.part_num}\"]\n", " color_ref = 
thm[f\"color/{element.color_id}\"]\n", "\n", + " g.add((part_ref, RDF.type, THM.Element))\n", " g.add((part_ref, THM.has_color, color_ref))" ] }, @@ -193,7 +196,8 @@ " part_ref_parent = thm[f\"part/{part_relationship.parent_part_num}\"]\n", " part_ref_child = thm[f\"part/{part_relationship.child_part_num}\"]\n", "\n", - " g.add((part_ref_parent, THM.has_child, part_ref_child))" + " g.add((part_ref_parent, THM.has_child, part_ref_child))\n", + " g.add((part_ref_child, THM.has_parent, part_ref_parent))" ] }, { @@ -214,6 +218,7 @@ "for theme in re_themes.itertuples(index=False):\n", " theme_ref = thm[f\"theme/{int(theme.id)}\"]\n", "\n", + " g.add((theme_ref, RDF.type, THM.Theme))\n", " g.add((theme_ref, RDFS.label, Literal(theme.name, lang=\"en\")))\n", "\n", " if not pd.isna(theme.parent_id):\n", @@ -238,8 +243,9 @@ "for lego_set in re_sets.itertuples(index=False):\n", " set_ref = thm[f\"set/lego/{lego_set.set_num}\"]\n", "\n", + " g.add((set_ref, RDF.type, THM.Set))\n", " g.add((set_ref, RDFS.label, Literal(lego_set.name, lang=\"en\")))\n", - " g.add((set_ref, THM.year, Literal(datetime(int(lego_set.year), 1, 1))))\n", + " g.add((set_ref, THM.year, Literal(int(lego_set.year), datatype=XSD.integer)))\n", " g.add((set_ref, THM.theme, thm[f\"theme/{int(lego_set.theme_id)}\"]))\n", " g.add((set_ref, THM.num_parts, Literal(int(lego_set.num_parts), datatype=XSD.integer)))\n", " g.add((set_ref, THM.brand, Literal(\"Lego\")))" @@ -263,8 +269,9 @@ "for minifig in re_minifigs.itertuples(index=False):\n", " minifig_ref = thm[f\"minifig/{minifig.fig_num}\"]\n", "\n", - " g.add((set_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n", - " g.add((set_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))" + " g.add((minifig_ref, RDF.type, THM.Minifig))\n", + " g.add((minifig_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n", + " g.add((minifig_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))" ] }, { @@ -277,7 +284,7 @@ 
}, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "0c97dc4d", "metadata": {}, "outputs": [], @@ -285,6 +292,7 @@ "for inventory in re_inventories.itertuples(index=False):\n", " inventory_ref = thm[f\"inventory/{inventory.id}\"]\n", "\n", + " g.add((inventory_ref, RDF.type, THM.Inventory))\n", " g.add((inventory_ref, THM.set, thm[f\"set/lego/{inventory.set_num}\"]))" ] }, @@ -302,21 +310,39 @@ "execution_count": 13, "id": "dc2ba03e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nfor inventory_part in re_inventory_parts.itertuples(index=False):\\n inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\\n\\n inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\\n part_ref = thm[f\"part/{inventory_part.part_num}\"]\\n\\n g.add((inventory_part_ref, RDF.type, THM.PartInv))\\n g.add((inventory_part_ref, RDF.type, RDF.Property))\\n\\n g.add((inventory_part_ref, RDFS.domain, THM.Inventory))\\n g.add((inventory_part_ref, RDFS.range, THM.Part))\\n\\n g.add((inventory_ref, THM.contains, inventory_part_ref))\\n g.add((part_ref, THM.belongs, inventory_part_ref))\\n\\n g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\\n g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\\n g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))\\n'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "\"\"\"\n", "for inventory_part in re_inventory_parts.itertuples(index=False):\n", " inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\n", " \n", " inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\n", " part_ref = thm[f\"part/{inventory_part.part_num}\"]\n", "\n", - " g.add((inventory_part_ref, RDFS.domain, 
inventory_ref))\n", - " g.add((inventory_part_ref, RDFS.range, part_ref))\n", + " g.add((inventory_part_ref, RDF.type, THM.PartInv))\n", " g.add((inventory_part_ref, RDF.type, RDF.Property))\n", + "\n", + " g.add((inventory_part_ref, RDFS.domain, THM.Inventory))\n", + " g.add((inventory_part_ref, RDFS.range, THM.Part))\n", " \n", + " g.add((inventory_ref, THM.contains, inventory_part_ref))\n", + " g.add((part_ref, THM.belongs, inventory_part_ref))\n", + "\n", " g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\n", " g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\n", - " g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))" + " g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))\n", + "\"\"\"" ] }, { @@ -332,10 +358,15 @@ " inventory_ref = thm[f\"inventory/{inventory_set.inventory_id}\"]\n", " set_ref = thm[f\"set/lego/{inventory_set.set_num}\"]\n", "\n", - " g.add((inventory_set_ref, RDFS.domain, inventory_ref))\n", - " g.add((inventory_set_ref, RDFS.range, set_ref))\n", + " g.add((inventory_set_ref, RDF.type, THM.SetInv))\n", " g.add((inventory_set_ref, RDF.type, RDF.Property))\n", "\n", + " g.add((inventory_set_ref, RDFS.domain, THM.Inventory))\n", + " g.add((inventory_set_ref, RDFS.range, THM.Set))\n", + "\n", + " g.add((inventory_ref, THM.contains, inventory_set_ref))\n", + " g.add((set_ref, THM.belongs, inventory_set_ref))\n", + " \n", " g.add((inventory_set_ref, THM.quantity, Literal(int(inventory_set.quantity), datatype=XSD.integer)))" ] }, @@ -350,12 +381,17 @@ " inventory_minifig_ref = thm[f\"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}\"]\n", "\n", " inventory_ref = thm[f\"inventory/{inventory_minifig.inventory_id}\"]\n", - " minifig_ref = thm[f\"minifig/lego/{inventory_minifig.fig_num}\"]\n", + " minifig_ref = thm[f\"minifig/{inventory_minifig.fig_num}\"]\n", 
"\n", - " g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))\n", - " g.add((inventory_minifig_ref, RDFS.range, minifig_ref))\n", + " g.add((inventory_minifig_ref, RDF.type, THM.MinifigInv))\n", " g.add((inventory_minifig_ref, RDF.type, RDF.Property))\n", "\n", + " g.add((inventory_minifig_ref, RDFS.domain, THM.Inventory))\n", + " g.add((inventory_minifig_ref, RDFS.range, THM.Minifig))\n", + "\n", + " g.add((inventory_ref, THM.contains, inventory_minifig_ref))\n", + " g.add((minifig_ref, THM.belongs, inventory_minifig_ref))\n", + " \n", " g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))" ] }, @@ -464,14 +500,27 @@ "execution_count": 21, "id": "ef52582e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nfor bl_part in bl_parts.itertuples(index=False):\\n part_ref = thm[f\"part/{bl_part.part_id}\"]\\n\\n if not (part_ref, None, None) in g:\\n additional_entries += 1\\n g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))\\n'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "\"\"\"\n", "for bl_part in bl_parts.itertuples(index=False):\n", " part_ref = thm[f\"part/{bl_part.part_id}\"]\n", "\n", " if not (part_ref, None, None) in g:\n", " additional_entries += 1\n", - " g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))" + " g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))\n", + "\"\"\"" ] }, { @@ -479,14 +528,27 @@ "execution_count": 22, "id": "8bf0ffeb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nfor bl_minifig in bl_minifigs.itertuples(index=False):\\n minifig_ref = thm[f\"minfig/{bl_minifig.minifig_id}\"]\\n\\n if not (minifig_ref, None, None) in g:\\n additional_entries += 1\\n g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))\\n'" + ] + }, + "execution_count": 22, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "\"\"\"\n", "for bl_minifig in bl_minifigs.itertuples(index=False):\n", " minifig_ref = thm[f\"minfig/{bl_minifig.minifig_id}\"]\n", "\n", " if not (minifig_ref, None, None) in g:\n", " additional_entries += 1\n", - " g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))" + " g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))\n", + "\"\"\"" ] }, { @@ -499,7 +561,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Added 107748 items\n" + "Added 4131 items\n" ] } ], @@ -548,7 +610,7 @@ { "data": { "text/plain": [ - ")>" + ")>" ] }, "execution_count": 24, @@ -557,15 +619,15 @@ } ], "source": [ - "g.bind(\"thmont\", THM)\n", + "g.bind(\"thm\", THM)\n", "\n", - "g.serialize(\"lego_graph_rebrickable.ttl\", format=\"turtle\")" + "g.serialize(\"lego_graph.ttl\", format=\"turtle\")" ] } ], "metadata": { "kernelspec": { - "display_name": "venv (3.14.4)", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/lego/paper/KGR_paper1_lego.tex b/lego/paper/KGR_paper1_lego.tex index 541db14..00bcbb3 100644 --- a/lego/paper/KGR_paper1_lego.tex +++ b/lego/paper/KGR_paper1_lego.tex @@ -30,7 +30,7 @@ a4paper,margin=25mm } -\title{\huge{Knowledgegraphen - Lego}} +\title{\huge{Knowledge Graph - Lego}} \date{\today} \author{ \begin{tabular}{ccc} @@ -127,7 +127,7 @@ \toprule & Brickset \\ \midrule URL & \url{https://brickset.com/}\\ - Beschaffung & Webscraping/CSV-Download \\ + Beschaffung & CSV-Download \\ Lizenz & nicht spezifiziert \\ Erhalt & 23.04.2026 \\ \bottomrule \end{tabularx} @@ -184,9 +184,11 @@ \begin{verbatim} https://thm.de/set/{brand}/{id} \end{verbatim} + Um die Dateigrösse des Graph zu reduzieren wurde \texttt{thm}, statt \texttt{th-mannheim} verwendet. 
\begin{figure}[H] - \includegraphics[width=\columnwidth]{bilder/example_part_number.png} + \centering + \includegraphics[width=0.8\columnwidth]{bilder/example_part_number.png} \caption{Lego Stein mit Teile-Nummer (Design-ID) 41769 \cite{cunninghamSellLEGOBricklink2018}} \label{fig:lego_example_part_number} \end{figure} @@ -199,14 +201,41 @@ \subsection{Pipeline} + Die Datensätze von \textit{Bricklink} und \textit{Merlins Steine} wurden durch Webscraping erhoben. Durch Ausnahmefälle entstandene Fehler mussten manuell bereinigt werden. Demnach ist dieser Teil nicht automatisierbar. Abbildung \ref{fig:pipeline} zeigt die Pipeline zur Erstellung des Knowledge Graph. + + \begin{figure}[H] + \includegraphics[width=\columnwidth]{./bilder/kgr_pipeline1.drawio.png} + \caption{Pipeline zur Erstellung des Knowledge Graph} + \label{fig:pipeline} + \end{figure} + + \section{Evaluation} \subsection{Ergebnis} Das Projekt kann unter der URL: \url{https://gitty.informatik.hs-mannheim.de/2211275/kgr} betrachtet werden. - + Der resultierende Knowledge Graph ist über 300 MB groß. \subsection{Beispiel-Queries} + Erhalten der Gesamtheit aller Lego Star Wars Minifiguren: + + \begin{verbatim} +SELECT DISTINCT ?name +WHERE { + ?set thm:theme ?theme. + ?theme rdf:type thm:Theme. + ?set rdf:type thm:Set. + ?theme rdfs:label "Star Wars"@en. + ?inventory thm:set ?set. + ?inventory rdf:type thm:Inventory. + ?inventory thm:contains ?minifig_inv. + ?minifig_inv rdf:type thm:MinifigInv. + ?minifig thm:belongs ?minifig_inv. + ?minifig rdfs:label ?name. 
+} + \end{verbatim} + \subsection{Abdeckung} \subsection{Konsistenz} diff --git a/lego/paper/bilder/kgr_pipeline1.drawio b/lego/paper/bilder/kgr_pipeline1.drawio new file mode 100644 index 0000000..3e24c56 --- /dev/null +++ b/lego/paper/bilder/kgr_pipeline1.drawio @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lego/paper/bilder/kgr_pipeline1.drawio.png b/lego/paper/bilder/kgr_pipeline1.drawio.png new file mode 100644 index 0000000..2bd7c78 Binary files /dev/null and b/lego/paper/bilder/kgr_pipeline1.drawio.png differ