knowledge graph construction with rebrickable data

pull/1/head
Roman Schöne 2026-04-25 21:27:22 +02:00
parent ca8f1f55a6
commit 383493245b
12 changed files with 310 additions and 3584 deletions

1
lego/.gitignore vendored 100644
View File

@ -0,0 +1 @@
*.ttl

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,308 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "747b245f",
"metadata": {},
"source": [
"Build the Lego Knowledge Graph using the sources in `data/`."
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "90209948",
"metadata": {},
"outputs": [],
"source": [
"from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal\n",
"import pandas as pd\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"id": "fe91fa67",
"metadata": {},
"source": [
"Set up the graph and the namespaces needed to build the knowledge graph."
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "8e573135",
"metadata": {},
"outputs": [],
"source": [
"g = Graph()\n",
"thm = Namespace(\"https://th-mannheim.de/\")\n",
"THM = Namespace(\"https://th-mannheim.de/ont/\")"
]
},
{
"cell_type": "markdown",
"id": "d56199d5",
"metadata": {},
"source": [
"# Rebrickable"
]
},
{
"cell_type": "markdown",
"id": "d1e1abb0",
"metadata": {},
"source": [
"![Rebrickable database schema](data/rebrickable/downloads_schema_v3.png)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d8a1fe84",
"metadata": {},
"outputs": [],
"source": [
"re_colors = pd.read_csv(\"data/rebrickable/colors.csv\")\n",
"re_elements = pd.read_csv(\"data/rebrickable/elements.csv\")\n",
"re_inventories = pd.read_csv(\"data/rebrickable/inventories.csv\")\n",
"re_inventory_minifigs = pd.read_csv(\"data/rebrickable/inventory_minifigs.csv\")\n",
"re_inventory_parts = pd.read_csv(\"data/rebrickable/inventory_parts.csv\")\n",
"re_inventory_sets = pd.read_csv(\"data/rebrickable/inventory_sets.csv\")\n",
"re_minifigs = pd.read_csv(\"data/rebrickable/minifigs.csv\")\n",
"re_part_categories = pd.read_csv(\"data/rebrickable/part_categories.csv\")\n",
"re_part_relationships = pd.read_csv(\"data/rebrickable/part_relationships.csv\")\n",
"re_parts = pd.read_csv(\"data/rebrickable/parts.csv\")\n",
"re_sets = pd.read_csv(\"data/rebrickable/sets.csv\")\n",
"re_themes = pd.read_csv(\"data/rebrickable/themes.csv\")"
]
},
{
"cell_type": "markdown",
"id": "f3677416",
"metadata": {},
"source": [
"Colors"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "ae505704",
"metadata": {},
"outputs": [],
"source": [
"# Describe every Rebrickable color: label, RGB value, transparency flag and\n",
"# the first/last year in which the color appeared.\n",
"for color in re_colors.itertuples(index=False):\n",
"    color_ref = thm[f\"colors/{color.id}\"]\n",
"\n",
"    g.add((color_ref, RDFS.label, Literal(color.name, lang=\"en\")))\n",
"    g.add((color_ref, THM.color, Literal(color.rgb)))\n",
"    g.add((color_ref, THM.is_transparent, Literal(color.is_trans, datatype=XSD.boolean)))\n",
"\n",
"    # y1/y2 can be missing (NaN); only emit a triple when a year is present.\n",
"    # The source only provides a year, so store it as xsd:gYear instead of an\n",
"    # xsd:dateTime with a made-up month, day and time.\n",
"    if not pd.isna(color.y1):\n",
"        # First appearance\n",
"        g.add((color_ref, THM.first_year, Literal(f\"{int(color.y1):04d}\", datatype=XSD.gYear)))\n",
"    if not pd.isna(color.y2):\n",
"        # Last appearance\n",
"        g.add((color_ref, THM.last_year, Literal(f\"{int(color.y2):04d}\", datatype=XSD.gYear)))\n"
]
},
{
"cell_type": "markdown",
"id": "e27b2bc4",
"metadata": {},
"source": [
"Part Categories"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "fb9e17d6",
"metadata": {},
"outputs": [],
"source": [
"# Give every part category a human-readable English label.\n",
"for part_category in re_part_categories.itertuples(index=False):\n",
"    part_category_ref = thm[f\"part_category/{part_category.id}\"]\n",
"\n",
"    # Label with the category name, not with the URI of the node itself.\n",
"    # Assumes part_categories.csv has a `name` column (as in the Rebrickable schema).\n",
"    g.add((part_category_ref, RDFS.label, Literal(part_category.name, lang=\"en\")))"
]
},
{
"cell_type": "markdown",
"id": "ea32849b",
"metadata": {},
"source": [
"Parts"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "8fdb080e",
"metadata": {},
"outputs": [],
"source": [
"# Add label, category link and material for each row of the parts table.\n",
"for row in re_parts.itertuples(index=False):\n",
"    subject = thm[f\"part/{row.part_num}\"]\n",
"    category_ref = thm[f\"part_category/{row.part_cat_id}\"]\n",
"\n",
"    part_statements = (\n",
"        (RDFS.label, Literal(row.name, lang=\"en\")),\n",
"        (THM.part_category, category_ref),\n",
"        (THM.part_material, Literal(row.part_material)),\n",
"    )\n",
"    for predicate, value in part_statements:\n",
"        g.add((subject, predicate, value))"
]
},
{
"cell_type": "markdown",
"id": "fcaadd84",
"metadata": {},
"source": [
"Elements"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "579b1d67",
"metadata": {},
"outputs": [],
"source": [
"# Link each part to every color it is listed with in the elements table.\n",
"for row in re_elements.itertuples(index=False):\n",
"    triple = (\n",
"        thm[f\"part/{row.part_num}\"],\n",
"        THM.has_color,\n",
"        thm[f\"colors/{row.color_id}\"],\n",
"    )\n",
"    g.add(triple)"
]
},
{
"cell_type": "markdown",
"id": "44dae336",
"metadata": {},
"source": [
"Part Relationships"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "00db079a",
"metadata": {},
"outputs": [],
"source": [
"# Connect parent and child parts from the part relationships table.\n",
"# NOTE(review): only the parent/child part numbers are read here; every row\n",
"# becomes a THM.has_child statement regardless of any other column.\n",
"for relation in re_part_relationships.itertuples(index=False):\n",
"    parent_ref = thm[f\"part/{relation.parent_part_num}\"]\n",
"    child_ref = thm[f\"part/{relation.child_part_num}\"]\n",
"    g.add((parent_ref, THM.has_child, child_ref))"
]
},
{
"cell_type": "markdown",
"id": "19dc64b8",
"metadata": {},
"source": [
"Themes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a529aae",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "f-string: unmatched ')' (1024367582.py, line 2)",
"output_type": "error",
"traceback": [
" \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[48]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[31m \u001b[39m\u001b[31mtheme_ref = thm[f\"theme/{int(theme.id))}\"]\u001b[39m\n ^\n\u001b[31mSyntaxError\u001b[39m\u001b[31m:\u001b[39m f-string: unmatched ')'\n"
]
}
],
"source": [
"# Add every theme with its label and, where present, a link to its parent theme.\n",
"for theme in re_themes.itertuples(index=False):\n",
"    theme_ref = thm[f\"theme/{int(theme.id)}\"]\n",
"\n",
"    # Tag the label as English, consistent with the color and part labels.\n",
"    g.add((theme_ref, RDFS.label, Literal(theme.name, lang=\"en\")))\n",
"\n",
"    # Themes without a parent have a missing (NaN) parent_id; the int() cast keeps\n",
"    # the parent URI free of a float suffix such as `.0`.\n",
"    if not pd.isna(theme.parent_id):\n",
"        g.add((theme_ref, THM.parent_theme, thm[f\"theme/{int(theme.parent_id)}\"]))"
]
},
{
"cell_type": "markdown",
"id": "bfab0c73",
"metadata": {},
"source": [
"Serialize the graph in turtle format"
]
},
{
"cell_type": "markdown",
"id": "2abd6894",
"metadata": {},
"source": [
"```\n",
" ___-------___\n",
" _-~~ ~~-_\n",
" _-~ /~-_\n",
" /^\\__/^\\ /~ \\ / \\\n",
" /| O|| O| / \\_______________/ \\\n",
"| |___||__| / / \\ \\\n",
"| \\ / / \\ \\\n",
"| (_______) /______/ \\_________ \\\n",
"| / / \\ / \\\n",
" \\ \\^\\\\ \\ / \\ /\n",
" \\ || \\______________/ _-_ //\\__//\n",
" \\ ||------_-~~-_ ------------- \\ --/~ ~\\ || __/\n",
" ~-----||====/~ |==================| |/~~~~~\n",
" (_(__/ ./ / \\_\\ \\.\n",
" (_(___/ \\_____)_)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a30bff8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nf661b2e682c043188ddd822a6bca246c (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g.bind(\"thmont\", THM)\n",
"\n",
"g.serialize(\"lego_graph.ttl\", format=\"turtle\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -142,7 +142,7 @@
\subsection{Merlins Steine}
\textit{Merlins Steine} ist eine Website, die sich auf die Berichterstattung von Reviews zu Klemmbausteinen spezialisiert hat. Um auf entsprechende Sets zu referenzieren, beinhaltet \textit{Merlins Steine} eine Datenbank an Klemmbausteinen, die über 30000 Sets unterschiedlicher Marken enthält. Merlins Steine finanziert sich über Affiliate Marketing \cite{SetDatenbankSetDB}.\\
\textit{Merlins Steine} wurde ausgewählt, da Sets von anderen Marken im Vergleich zu vorherig genannten Quellen aufgelistet werden.
\textit{Merlins Steine} wurde ausgewählt, da im Vergleich zu den zuvor genannten Quellen auch Sets anderer Marken aufgelistet werden. Der Datensatz enthält Informationen zu \ac{UVP}, Teileanzahl, Release-Jahr, Thema, Modelldimensionen und Hersteller.
\begin{table}[H]
\begin{tabularx}{\columnwidth}{@{}ll@{}}