{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "747b245f",
   "metadata": {},
   "source": [
    "Build the Lego Knowledge Graph using the CSV sources in `data/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "90209948",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe91fa67",
   "metadata": {},
   "source": [
    "Set up the graph and the namespaces used to build the knowledge graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8e573135",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The single graph that every cell below adds triples to.\n",
    "g = Graph()\n",
    "# NOTE: `thm` and `THM` differ only by case.\n",
    "# `thm` mints the IRIs of the resources (colors, parts, sets, ...).\n",
    "thm = Namespace(\"https://thm.de/\")\n",
    "# `THM` holds the ontology terms used as predicates (THM.color, THM.year, ...).\n",
    "THM = Namespace(\"https://thm.de/ont/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d56199d5",
   "metadata": {},
   "source": [
    "# Rebrickable"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1e1abb0",
   "metadata": {},
   "source": [
    "![Rebrickable](data/rebrickable/downloads_schema_v3.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d8a1fe84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One DataFrame per Rebrickable CSV file; paths are relative to the notebook's working directory.\n",
    "re_colors = pd.read_csv(\"data/rebrickable/colors.csv\")\n",
    "re_elements = pd.read_csv(\"data/rebrickable/elements.csv\")\n",
    "re_inventories = pd.read_csv(\"data/rebrickable/inventories.csv\")\n",
    "re_inventory_minifigs = pd.read_csv(\"data/rebrickable/inventory_minifigs.csv\")\n",
    "re_inventory_parts = pd.read_csv(\"data/rebrickable/inventory_parts.csv\")\n",
    "re_inventory_sets = pd.read_csv(\"data/rebrickable/inventory_sets.csv\")\n",
    "re_minifigs = pd.read_csv(\"data/rebrickable/minifigs.csv\")\n",
    "re_part_categories = pd.read_csv(\"data/rebrickable/part_categories.csv\")\n",
    "re_part_relationships = pd.read_csv(\"data/rebrickable/part_relationships.csv\")\n",
    "re_parts = pd.read_csv(\"data/rebrickable/parts.csv\")\n",
    "re_sets = pd.read_csv(\"data/rebrickable/sets.csv\")\n",
    "re_themes = pd.read_csv(\"data/rebrickable/themes.csv\")"
   ]
  },
  {
   "cell_type": "markdown", 
"id": "f3677416", "metadata": {}, "source": [ "Colors" ] }, { "cell_type": "code", "execution_count": 12, "id": "ae505704", "metadata": {}, "outputs": [], "source": [ "for color in re_colors.itertuples(index=False):\n", " color_ref = thm[f\"colors/{color.id}\"]\n", "\n", " g.add((color_ref, RDFS.label, Literal(color.name, lang=\"en\")))\n", " g.add((color_ref, THM.color, Literal(color.rgb)))\n", " g.add((color_ref, THM.is_transparent, Literal(color.is_trans, datatype=XSD.boolean)))\n", " \n", " if not pd.isna(color.y1):\n", " # First appearance\n", " g.add((color_ref, THM.first_year, Literal(datetime(year = int(color.y1), month=1, day=1))))\n", " if not pd.isna(color.y2):\n", " # Last appearance\n", " g.add((color_ref, THM.last_year, Literal(datetime(year = int(color.y2), month=1, day=1))))\n" ] }, { "cell_type": "markdown", "id": "e27b2bc4", "metadata": {}, "source": [ "Part Categories" ] }, { "cell_type": "code", "execution_count": 13, "id": "fb9e17d6", "metadata": {}, "outputs": [], "source": [ "for part_category in re_part_categories.itertuples(index=False):\n", " part_category_ref = thm[f\"part_category/{part_category.id}\"]\n", "\n", " g.add((part_category_ref, RDFS.label, Literal(part_category_ref, lang=\"en\")))" ] }, { "cell_type": "markdown", "id": "ea32849b", "metadata": {}, "source": [ "Parts" ] }, { "cell_type": "code", "execution_count": 14, "id": "8fdb080e", "metadata": {}, "outputs": [], "source": [ "for part in re_parts.itertuples(index=False):\n", " part_ref = thm[f\"part/{part.part_num}\"]\n", "\n", " g.add((part_ref, RDFS.label, Literal(part.name, lang=\"en\")))\n", " g.add((part_ref, THM.part_category, thm[f\"part_category/{part.part_cat_id}\"]))\n", " g.add((part_ref, THM.part_material, Literal(part.part_material)))" ] }, { "cell_type": "markdown", "id": "fcaadd84", "metadata": {}, "source": [ "Elements" ] }, { "cell_type": "code", "execution_count": 15, "id": "579b1d67", "metadata": {}, "outputs": [], "source": [ "for element in 
re_elements.itertuples(index=False):\n",
    "    # Link the part of each element row to that row's color.\n",
    "    part_ref = thm[f\"part/{element.part_num}\"]\n",
    "    color_ref = thm[f\"colors/{element.color_id}\"]\n",
    "    g.add((part_ref, THM.has_color, color_ref))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44dae336",
   "metadata": {},
   "source": [
    "Part Relationships"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "00db079a",
   "metadata": {},
   "outputs": [],
   "source": [
    "for part_relationship in re_part_relationships.itertuples(index=False):\n",
    "    # Only the parent/child pair is used; every row becomes one THM.has_child edge.\n",
    "    part_ref_parent = thm[f\"part/{part_relationship.parent_part_num}\"]\n",
    "    part_ref_child = thm[f\"part/{part_relationship.child_part_num}\"]\n",
    "    child_edge = (part_ref_parent, THM.has_child, part_ref_child)\n",
    "    g.add(child_edge)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19dc64b8",
   "metadata": {},
   "source": [
    "Themes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1a529aae",
   "metadata": {},
   "outputs": [],
   "source": [
    "for theme in re_themes.itertuples(index=False):\n",
    "    theme_ref = thm[f\"theme/{int(theme.id)}\"]\n",
    "    g.add((theme_ref, RDFS.label, Literal(theme.name, lang=\"en\")))\n",
    "\n",
    "    # Rows without a parent_id get no parent link.\n",
    "    if pd.notna(theme.parent_id):\n",
    "        parent_theme_ref = thm[f\"theme/{int(theme.parent_id)}\"]\n",
    "        g.add((theme_ref, THM.parent_theme, parent_theme_ref))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f72c2e9",
   "metadata": {},
   "source": [
    "Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "29b357ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "for lego_set in re_sets.itertuples(index=False):\n",
    "    set_ref = thm[f\"set/lego/{lego_set.set_num}\"]\n",
    "\n",
    "    # Collect the description of the set first, then add it to the graph in one pass.\n",
    "    # The year is stored as a datetime on 1 January of that year.\n",
    "    set_description = (\n",
    "        (RDFS.label, Literal(lego_set.name, lang=\"en\")),\n",
    "        (THM.year, Literal(datetime(int(lego_set.year), 1, 1))),\n",
    "        (THM.theme, thm[f\"theme/{int(lego_set.theme_id)}\"]),\n",
    "        (THM.num_parts, Literal(int(lego_set.num_parts), datatype=XSD.integer)),\n",
    "        (THM.brand, Literal(\"Lego\")),\n",
    "    )\n",
    "    for set_predicate, set_object in set_description:\n",
    "        g.add((set_ref, set_predicate, set_object))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2616476",
   "metadata": {},
   "source": [
    "Minifigures"
   ]
  },
  {
   "cell_type": 
"code", "execution_count": 19, "id": "a67b3e70", "metadata": {}, "outputs": [], "source": [ "for minifig in re_minifigs.itertuples(index=False):\n", " minifig_ref = thm[f\"minifig/{minifig.fig_num}\"]\n", "\n", " g.add((set_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n", " g.add((set_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))" ] }, { "cell_type": "markdown", "id": "2e9baff1", "metadata": {}, "source": [ "Now the ugly part: Inventories" ] }, { "cell_type": "code", "execution_count": 20, "id": "0c97dc4d", "metadata": {}, "outputs": [], "source": [ "for inventory in re_inventories.itertuples(index=False):\n", " inventory_ref = thm[f\"inventory/{inventory.id}\"]\n", "\n", " g.add((inventory_ref, THM.set, thm[f\"sets/lego/{inventory.set_num}\"]))" ] }, { "cell_type": "markdown", "id": "7c962cf0", "metadata": {}, "source": [ "Inventories relate sets, minifigures and parts to each other, creating a kind of \"top level set\" \n", "(this takes a lot of time)" ] }, { "cell_type": "code", "execution_count": 21, "id": "dc2ba03e", "metadata": {}, "outputs": [], "source": [ "for inventory_part in re_inventory_parts.itertuples(index=False):\n", " inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\n", " \n", " inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\n", " part_ref = thm[f\"part/{inventory_part.part_num}\"]\n", "\n", " g.add((inventory_part_ref, RDFS.domain, inventory_ref))\n", " g.add((inventory_part_ref, RDFS.range, part_ref))\n", " g.add((inventory_part_ref, RDF.type, RDF.Property))\n", " \n", " g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\n", " g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\n", " g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))" ] }, { "cell_type": "code", "execution_count": 22, "id": "8715a1cf", 
"metadata": {}, "outputs": [], "source": [ "for inventory_set in re_inventory_sets.itertuples(index=False):\n", " inventory_set_ref = thm[f\"inventory_set/{inventory_set.inventory_id}/{inventory_set.set_num}\"]\n", "\n", " inventory_ref = thm[f\"inventory/{inventory_set.inventory_id}\"]\n", " set_ref = thm[f\"set/lego/{inventory_set.set_num}\"]\n", "\n", " g.add((inventory_set_ref, RDFS.domain, inventory_ref))\n", " g.add((inventory_set_ref, RDFS.range, set_ref))\n", " g.add((inventory_set_ref, RDF.type, RDF.Property))\n", "\n", " g.add((inventory_set_ref, THM.quantity, Literal(int(inventory_set.quantity), datatype=XSD.integer)))" ] }, { "cell_type": "code", "execution_count": 23, "id": "08c2c580", "metadata": {}, "outputs": [], "source": [ "for inventory_minifig in re_inventory_minifigs.itertuples(index=False):\n", " inventory_minifig_ref = thm[f\"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}\"]\n", "\n", " inventory_ref = thm[f\"inventory/{inventory_minifig.inventory_id}\"]\n", " minifig_ref = thm[f\"minifig/lego/{inventory_minifig.fig_num}\"]\n", "\n", " g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))\n", " g.add((inventory_minifig_ref, RDFS.range, minifig_ref))\n", " g.add((inventory_minifig_ref, RDF.type, RDF.Property))\n", "\n", " g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))" ] }, { "cell_type": "markdown", "id": "dcbab237", "metadata": {}, "source": [ "# Brickset" ] }, { "cell_type": "markdown", "id": "d8fb5374", "metadata": {}, "source": [ "add for prices" ] }, { "cell_type": "code", "execution_count": 45, "id": "1e0ac437", "metadata": {}, "outputs": [], "source": [ "bs_sets = pd.read_csv(\"./data/brickset/sets.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "fd944ccb", "metadata": {}, "outputs": [], "source": [ "\n", "for bs_set in bs_sets.itertuples(index=False):\n", " num = f\"{str(bs_set.Number).strip()}-{str(bs_set.Variant)}\" 
#Error for Set 853357\n", " set_ref = thm[f\"set/lego/{num}\"]\n", "\n", " " ] }, { "cell_type": "markdown", "id": "bfab0c73", "metadata": {}, "source": [ "Serialize the graph in turtle format" ] }, { "cell_type": "markdown", "id": "2abd6894", "metadata": {}, "source": [ "```\n", " ___-------___\n", " _-~~ ~~-_\n", " _-~ /~-_\n", " /^\\__/^\\ /~ \\ / \\\n", " /| O|| O| / \\_______________/ \\\n", "| |___||__| / / \\ \\\n", "| \\ / / \\ \\\n", "| (_______) /______/ \\_________ \\\n", "| / / \\ / \\\n", " \\ \\^\\\\ \\ / \\ /\n", " \\ || \\______________/ _-_ //\\__//\n", " \\ ||------_-~~-_ ------------- \\ --/~ ~\\ || __/\n", " ~-----||====/~ |==================| |/~~~~~\n", " (_(__/ ./ / \\_\\ \\.\n", " (_(___/ \\_____)_)\n", "```" ] }, { "cell_type": "code", "execution_count": 26, "id": "1a30bff8", "metadata": {}, "outputs": [], "source": [ "g.bind(\"thmont\", THM)\n", "\n", "#g.serialize(\"lego_graph_rebrickable.ttl\", format=\"turtle\")" ] } ], "metadata": { "kernelspec": { "display_name": "venv (3.14.4)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.4" } }, "nbformat": 4, "nbformat_minor": 5 }