kgr/lego/lego_graph_rebrickable.ipynb

439 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "747b245f",
"metadata": {},
"source": [
"Build the Lego knowledge graph from the CSV sources in `data/`."
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "90209948",
"metadata": {},
"outputs": [],
"source": [
"from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal\n",
"import pandas as pd\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"id": "fe91fa67",
"metadata": {},
"source": [
"Set up the graph and the namespaces needed to build the knowledge graph."
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "8e573135",
"metadata": {},
"outputs": [],
"source": [
"# Two namespaces that differ only in case:\n",
"#   thm -> IRIs of the individual resources (colors, parts, sets, ...)\n",
"#   THM -> the predicates used to describe them, under /ont/\n",
"thm, THM = Namespace(\"https://thm.de/\"), Namespace(\"https://thm.de/ont/\")\n",
"\n",
"# Single in-memory graph that the cells below add their triples to.\n",
"g = Graph()"
]
},
{
"cell_type": "markdown",
"id": "d56199d5",
"metadata": {},
"source": [
"# Rebrickable"
]
},
{
"cell_type": "markdown",
"id": "d1e1abb0",
"metadata": {},
"source": [
"![Rebrickable](data/rebrickable/downloads_schema_v3.png)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "d8a1fe84",
"metadata": {},
"outputs": [],
"source": [
"def _read_rebrickable(table):\n",
"    \"\"\"Read data/rebrickable/<table>.csv into a DataFrame.\"\"\"\n",
"    return pd.read_csv(f\"data/rebrickable/{table}.csv\")\n",
"\n",
"\n",
"# One DataFrame per Rebrickable CSV dump; each frame is named re_<table>.\n",
"re_colors = _read_rebrickable(\"colors\")\n",
"re_elements = _read_rebrickable(\"elements\")\n",
"re_inventories = _read_rebrickable(\"inventories\")\n",
"re_inventory_minifigs = _read_rebrickable(\"inventory_minifigs\")\n",
"re_inventory_parts = _read_rebrickable(\"inventory_parts\")\n",
"re_inventory_sets = _read_rebrickable(\"inventory_sets\")\n",
"re_minifigs = _read_rebrickable(\"minifigs\")\n",
"re_part_categories = _read_rebrickable(\"part_categories\")\n",
"re_part_relationships = _read_rebrickable(\"part_relationships\")\n",
"re_parts = _read_rebrickable(\"parts\")\n",
"re_sets = _read_rebrickable(\"sets\")\n",
"re_themes = _read_rebrickable(\"themes\")"
]
},
{
"cell_type": "markdown",
"id": "f3677416",
"metadata": {},
"source": [
"Colors"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "ae505704",
"metadata": {},
"outputs": [],
"source": [
"# One node per color: label, RGB value, transparency flag and, when present,\n",
"# the years of first and last appearance.\n",
"for row in re_colors.itertuples(index=False):\n",
"    color_node = thm[f\"colors/{row.id}\"]\n",
"\n",
"    g.add((color_node, RDFS.label, Literal(row.name, lang=\"en\")))\n",
"    g.add((color_node, THM.color, Literal(row.rgb)))\n",
"    g.add((color_node, THM.is_transparent, Literal(row.is_trans, datatype=XSD.boolean)))\n",
"\n",
"    # y1 is the first appearance, y2 the last; a missing year adds no triple.\n",
"    for year_value, predicate in ((row.y1, THM.first_year), (row.y2, THM.last_year)):\n",
"        if pd.isna(year_value):\n",
"            continue\n",
"        # The year is stored as January 1st of that year.\n",
"        g.add((color_node, predicate, Literal(datetime(year=int(year_value), month=1, day=1))))\n"
]
},
{
"cell_type": "markdown",
"id": "e27b2bc4",
"metadata": {},
"source": [
"Part Categories"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "fb9e17d6",
"metadata": {},
"outputs": [],
"source": [
"# One node per part category, labelled with the category's name.\n",
"for part_category in re_part_categories.itertuples(index=False):\n",
"    part_category_ref = thm[f\"part_category/{part_category.id}\"]\n",
"\n",
"    # The label is the human-readable name, not the node's own IRI.\n",
"    g.add((part_category_ref, RDFS.label, Literal(part_category.name, lang=\"en\")))"
]
},
{
"cell_type": "markdown",
"id": "ea32849b",
"metadata": {},
"source": [
"Parts"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "8fdb080e",
"metadata": {},
"outputs": [],
"source": [
"# One node per part: label, link to its part category, and material.\n",
"for row in re_parts.itertuples(index=False):\n",
"    part_node = thm[f\"part/{row.part_num}\"]\n",
"    category_node = thm[f\"part_category/{row.part_cat_id}\"]\n",
"\n",
"    g.add((part_node, RDFS.label, Literal(row.name, lang=\"en\")))\n",
"    g.add((part_node, THM.part_category, category_node))\n",
"    g.add((part_node, THM.part_material, Literal(row.part_material)))"
]
},
{
"cell_type": "markdown",
"id": "fcaadd84",
"metadata": {},
"source": [
"Elements"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "579b1d67",
"metadata": {},
"outputs": [],
"source": [
"# Each element row pairs a part with a color; record that pairing on the part.\n",
"for row in re_elements.itertuples(index=False):\n",
"    g.add((\n",
"        thm[f\"part/{row.part_num}\"],\n",
"        THM.has_color,\n",
"        thm[f\"colors/{row.color_id}\"],\n",
"    ))"
]
},
{
"cell_type": "markdown",
"id": "44dae336",
"metadata": {},
"source": [
"Part Relationships"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "00db079a",
"metadata": {},
"outputs": [],
"source": [
"# Link every parent part to its child part.\n",
"for row in re_part_relationships.itertuples(index=False):\n",
"    parent_node = thm[f\"part/{row.parent_part_num}\"]\n",
"    child_node = thm[f\"part/{row.child_part_num}\"]\n",
"    g.add((parent_node, THM.has_child, child_node))"
]
},
{
"cell_type": "markdown",
"id": "19dc64b8",
"metadata": {},
"source": [
"Themes"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "1a529aae",
"metadata": {},
"outputs": [],
"source": [
"# One node per theme, plus a link to the parent theme where one is recorded.\n",
"for row in re_themes.itertuples(index=False):\n",
"    theme_node = thm[f\"theme/{int(row.id)}\"]\n",
"    g.add((theme_node, RDFS.label, Literal(row.name, lang=\"en\")))\n",
"\n",
"    # A missing parent_id adds no parent link.\n",
"    if pd.isna(row.parent_id):\n",
"        continue\n",
"\n",
"    parent_node = thm[f\"theme/{int(row.parent_id)}\"]\n",
"    g.add((theme_node, THM.parent_theme, parent_node))"
]
},
{
"cell_type": "markdown",
"id": "3f72c2e9",
"metadata": {},
"source": [
"Sets"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "29b357ef",
"metadata": {},
"outputs": [],
"source": [
"# One node per set: label, release year, theme, part count and brand.\n",
"for lego_set in re_sets.itertuples(index=False):\n",
"    set_ref = thm[f\"set/lego/{lego_set.set_num}\"]\n",
"\n",
"    # The year is stored as January 1st of that year.\n",
"    release_date = datetime(year=int(lego_set.year), month=1, day=1)\n",
"    theme_node = thm[f\"theme/{int(lego_set.theme_id)}\"]\n",
"    part_count = Literal(int(lego_set.num_parts), datatype=XSD.integer)\n",
"\n",
"    g.add((set_ref, RDFS.label, Literal(lego_set.name, lang=\"en\")))\n",
"    g.add((set_ref, THM.year, Literal(release_date)))\n",
"    g.add((set_ref, THM.theme, theme_node))\n",
"    g.add((set_ref, THM.num_parts, part_count))\n",
"    g.add((set_ref, THM.brand, Literal(\"Lego\")))"
]
},
{
"cell_type": "markdown",
"id": "d2616476",
"metadata": {},
"source": [
"Minifigures"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "a67b3e70",
"metadata": {},
"outputs": [],
"source": [
"# One node per minifigure: label and part count.\n",
"for minifig in re_minifigs.itertuples(index=False):\n",
"    minifig_ref = thm[f\"minifig/{minifig.fig_num}\"]\n",
"\n",
"    # Attach the triples to the minifigure node itself, not to a set node.\n",
"    g.add((minifig_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n",
"    g.add((minifig_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))"
]
},
{
"cell_type": "markdown",
"id": "2e9baff1",
"metadata": {},
"source": [
"Now the ugly part: Inventories"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0c97dc4d",
"metadata": {},
"outputs": [],
"source": [
"# One node per inventory, linked to the set it belongs to.\n",
"for inventory in re_inventories.itertuples(index=False):\n",
"    inventory_ref = thm[f\"inventory/{inventory.id}\"]\n",
"\n",
"    # Sets are minted under \"set/lego/\"; use the same prefix so the link\n",
"    # resolves to an existing set node.\n",
"    g.add((inventory_ref, THM.set, thm[f\"set/lego/{inventory.set_num}\"]))"
]
},
{
"cell_type": "markdown",
"id": "7c962cf0",
"metadata": {},
"source": [
"Inventories relate sets, minifigures and parts to each other, creating a kind of \"top level set\" \n",
"(this takes a lot of time)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "dc2ba03e",
"metadata": {},
"outputs": [],
"source": [
"# Link each inventory to the parts it contains, with quantity, spare flag and color.\n",
"# NOTE(review): the link IRI is keyed only by inventory id and part number, so rows\n",
"# that differ only in color or spare flag would share one node - confirm against the data.\n",
"for inventory_part in re_inventory_parts.itertuples(index=False):\n",
"    inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\n",
"\n",
"    inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\n",
"    part_ref = thm[f\"part/{inventory_part.part_num}\"]\n",
"\n",
"    # The link node is typed as an rdf:Property from the inventory to the part.\n",
"    g.add((inventory_part_ref, RDFS.domain, inventory_ref))\n",
"    g.add((inventory_part_ref, RDFS.range, part_ref))\n",
"    g.add((inventory_part_ref, RDF.type, RDF.Property))\n",
"\n",
"    g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\n",
"    g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\n",
"    # Colors are minted under \"colors/\"; use the same prefix so the link\n",
"    # resolves to an existing color node.\n",
"    g.add((inventory_part_ref, THM.color, thm[f\"colors/{inventory_part.color_id}\"]))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "8715a1cf",
"metadata": {},
"outputs": [],
"source": [
"# Link each inventory to the sets it contains, with the contained quantity.\n",
"for row in re_inventory_sets.itertuples(index=False):\n",
"    link_node = thm[f\"inventory_set/{row.inventory_id}/{row.set_num}\"]\n",
"\n",
"    # The link node is typed as an rdf:Property from the inventory to the set.\n",
"    g.add((link_node, RDF.type, RDF.Property))\n",
"    g.add((link_node, RDFS.domain, thm[f\"inventory/{row.inventory_id}\"]))\n",
"    g.add((link_node, RDFS.range, thm[f\"set/lego/{row.set_num}\"]))\n",
"\n",
"    contained = Literal(int(row.quantity), datatype=XSD.integer)\n",
"    g.add((link_node, THM.quantity, contained))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "08c2c580",
"metadata": {},
"outputs": [],
"source": [
"# Link each inventory to the minifigures it contains, with the contained quantity.\n",
"for inventory_minifig in re_inventory_minifigs.itertuples(index=False):\n",
"    inventory_minifig_ref = thm[f\"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}\"]\n",
"\n",
"    inventory_ref = thm[f\"inventory/{inventory_minifig.inventory_id}\"]\n",
"    # Minifigures are minted under \"minifig/<fig_num>\"; use the same pattern so\n",
"    # the link resolves to an existing minifigure node.\n",
"    minifig_ref = thm[f\"minifig/{inventory_minifig.fig_num}\"]\n",
"\n",
"    # The link node is typed as an rdf:Property from the inventory to the minifigure.\n",
"    g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))\n",
"    g.add((inventory_minifig_ref, RDFS.range, minifig_ref))\n",
"    g.add((inventory_minifig_ref, RDF.type, RDF.Property))\n",
"\n",
"    g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))"
]
},
{
"cell_type": "markdown",
"id": "bfab0c73",
"metadata": {},
"source": [
"Serialize the graph in Turtle format"
]
},
{
"cell_type": "markdown",
"id": "2abd6894",
"metadata": {},
"source": [
"```\n",
" ___-------___\n",
" _-~~ ~~-_\n",
" _-~ /~-_\n",
" /^\\__/^\\ /~ \\ / \\\n",
" /| O|| O| / \\_______________/ \\\n",
"| |___||__| / / \\ \\\n",
"| \\ / / \\ \\\n",
"| (_______) /______/ \\_________ \\\n",
"| / / \\ / \\\n",
" \\ \\^\\\\ \\ / \\ /\n",
" \\ || \\______________/ _-_ //\\__//\n",
" \\ ||------_-~~-_ ------------- \\ --/~ ~\\ || __/\n",
" ~-----||====/~ |==================| |/~~~~~\n",
" (_(__/ ./ / \\_\\ \\.\n",
" (_(___/ \\_____)_)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a30bff8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nd0322d7d995f458896746825ba0ca42f (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Register the \"thmont\" prefix for the ontology namespace so the Turtle\n",
"# output can abbreviate those IRIs.\n",
"g.bind(\"thmont\", THM)\n",
"\n",
"# Write the whole graph to a Turtle file in the working directory.\n",
"g.serialize(destination=\"lego_graph_rebrickable.ttl\", format=\"turtle\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}