kgr/lego/graph.py

213 lines
7.4 KiB
Python

# %% [markdown]
# Build the Lego Knowledge Graph using the sources in `/data`.
# %%
from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal
import pandas as pd
from datetime import datetime
# %% [markdown]
# Set up the requirements for building a knowledge graph
# %%
# Namespace for the predicates (vocabulary terms) used throughout this script.
THM = Namespace("https://thm.de/ont/")
# Namespace for the resources themselves (colors, parts, sets, ...).
thm = Namespace("https://thm.de/")
# The graph that every cell below adds its triples to.
g = Graph()
# %% [markdown]
# # Rebrickable
# %% [markdown]
# ![Rebrickable](data/rebrickable/downloads_schema_v3.png)
# %%
def _read_rebrickable(table):
    """Load one Rebrickable CSV export from ``data/rebrickable`` as a DataFrame."""
    return pd.read_csv(f"data/rebrickable/{table}.csv")


# One DataFrame per CSV file; each variable is named ``re_<file name>``.
re_colors = _read_rebrickable("colors")
re_elements = _read_rebrickable("elements")
re_inventories = _read_rebrickable("inventories")
re_inventory_minifigs = _read_rebrickable("inventory_minifigs")
re_inventory_parts = _read_rebrickable("inventory_parts")
re_inventory_sets = _read_rebrickable("inventory_sets")
re_minifigs = _read_rebrickable("minifigs")
re_part_categories = _read_rebrickable("part_categories")
re_part_relationships = _read_rebrickable("part_relationships")
re_parts = _read_rebrickable("parts")
re_sets = _read_rebrickable("sets")
re_themes = _read_rebrickable("themes")
# %% [markdown]
# Colors
# %%
# Describe every color: label, RGB value, transparency flag and, when known,
# the first and last year it appeared.
for row in re_colors.itertuples(index=False):
    subject = thm[f"colors/{row.id}"]
    g.add((subject, RDFS.label, Literal(row.name, lang="en")))
    g.add((subject, THM.color, Literal(row.rgb)))
    g.add((subject, THM.is_transparent, Literal(row.is_trans, datatype=XSD.boolean)))
    # y1 / y2 hold the first / last appearance year and may be missing.
    for predicate, year in ((THM.first_year, row.y1), (THM.last_year, row.y2)):
        if pd.isna(year):
            continue
        # A year is stored as January 1st of that year.
        g.add((subject, predicate, Literal(datetime(year=int(year), month=1, day=1))))
# %% [markdown]
# Part Categories
# %%
# Give every part category a node carrying its English label.
for part_category in re_part_categories.itertuples(index=False):
    part_category_ref = thm[f"part_category/{part_category.id}"]
    # Label the node with the category's name. The previous code passed the
    # node's own IRI as the label text.
    # NOTE(review): assumes part_categories.csv has a `name` column like the
    # other tables read in this script - confirm against the CSV header.
    g.add((part_category_ref, RDFS.label, Literal(part_category.name, lang="en")))
# %% [markdown]
# Parts
# %%
# Describe every part: label, link to its category and its material.
for row in re_parts.itertuples(index=False):
    subject = thm[f"part/{row.part_num}"]
    category = thm[f"part_category/{row.part_cat_id}"]
    for triple in (
        (subject, RDFS.label, Literal(row.name, lang="en")),
        (subject, THM.part_category, category),
        (subject, THM.part_material, Literal(row.part_material)),
    ):
        g.add(triple)
# %% [markdown]
# Elements
# %%
# An element row pairs a part with a color; record that pairing on the part.
for part_num, color_id in zip(re_elements["part_num"], re_elements["color_id"]):
    g.add((thm[f"part/{part_num}"], THM.has_color, thm[f"colors/{color_id}"]))
# %% [markdown]
# Part Relationships
# %%
# Link each parent part to its child part. Only the two part numbers are
# read from the table; any other column is ignored.
for parent_num, child_num in zip(
    re_part_relationships["parent_part_num"],
    re_part_relationships["child_part_num"],
):
    parent = thm[f"part/{parent_num}"]
    child = thm[f"part/{child_num}"]
    g.add((parent, THM.has_child, child))
# %% [markdown]
# Themes
# %%
# Describe every theme and, where one is given, link it to its parent theme.
for row in re_themes.itertuples(index=False):
    subject = thm[f"theme/{int(row.id)}"]
    g.add((subject, RDFS.label, Literal(row.name, lang="en")))
    if pd.isna(row.parent_id):
        # No parent recorded for this theme; nothing more to add.
        continue
    parent = thm[f"theme/{int(row.parent_id)}"]
    g.add((subject, THM.parent_theme, parent))
# %% [markdown]
# Sets
# %%
# Every set in this table is tagged with the same brand literal.
lego_brand = Literal("Lego")

# Describe every set: label, release year, theme, part count and brand.
for lego_set in re_sets.itertuples(index=False):
    set_ref = thm[f"set/lego/{lego_set.set_num}"]
    theme_ref = thm[f"theme/{int(lego_set.theme_id)}"]
    # The release year is stored as January 1st of that year.
    released = datetime(int(lego_set.year), 1, 1)
    part_count = Literal(int(lego_set.num_parts), datatype=XSD.integer)
    g.add((set_ref, RDFS.label, Literal(lego_set.name, lang="en")))
    g.add((set_ref, THM.year, Literal(released)))
    g.add((set_ref, THM.theme, theme_ref))
    g.add((set_ref, THM.num_parts, part_count))
    g.add((set_ref, THM.brand, lego_brand))
# %% [markdown]
# Minifigures
# %%
# Describe every minifigure: label and part count.
for minifig in re_minifigs.itertuples(index=False):
    minifig_ref = thm[f"minifig/{minifig.fig_num}"]
    # Attach the triples to the minifigure itself. The previous code added them
    # to `set_ref`, a leftover from the sets loop, so every minifigure's label
    # and part count ended up on the last set instead.
    g.add((minifig_ref, RDFS.label, Literal(minifig.name, lang="en")))
    g.add((minifig_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))
# %% [markdown]
# Now the ugly part: Inventories
# %%
# Link every inventory to the set it belongs to.
for inventory in re_inventories.itertuples(index=False):
    inventory_ref = thm[f"inventory/{inventory.id}"]
    # Sets are minted under "set/lego/..." everywhere else in this script; the
    # previous "sets/lego/..." prefix pointed at nodes that are never described.
    g.add((inventory_ref, THM.set, thm[f"set/lego/{inventory.set_num}"]))
# %% [markdown]
# Inventories relate sets, minifigures and parts to each other, creating a kind of "top level set"
# (this takes a lot of time)
# %%
# Model each inventory/part row as its own property-like node that points from
# the inventory (domain) to the part (range) and carries quantity, spare flag
# and color.
for inventory_part in re_inventory_parts.itertuples(index=False):
    # NOTE(review): the node IRI is built from inventory_id and part_num only,
    # so rows that share both (e.g. the same part in another color, or a spare)
    # land on the same node and accumulate several quantity/color values.
    inventory_part_ref = thm[f"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}"]
    inventory_ref = thm[f"inventory/{inventory_part.inventory_id}"]
    part_ref = thm[f"part/{inventory_part.part_num}"]
    g.add((inventory_part_ref, RDFS.domain, inventory_ref))
    g.add((inventory_part_ref, RDFS.range, part_ref))
    g.add((inventory_part_ref, RDF.type, RDF.Property))
    g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))
    g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))
    # Colors are minted under "colors/..." elsewhere in this script; the
    # previous "color/..." prefix pointed at nodes that are never described.
    g.add((inventory_part_ref, THM.color, thm[f"colors/{inventory_part.color_id}"]))
# %%
# Model each inventory/set row as its own property-like node that points from
# the inventory (domain) to the contained set (range) and carries the quantity.
for row in re_inventory_sets.itertuples(index=False):
    link = thm[f"inventory_set/{row.inventory_id}/{row.set_num}"]
    owner = thm[f"inventory/{row.inventory_id}"]
    contained_set = thm[f"set/lego/{row.set_num}"]
    amount = Literal(int(row.quantity), datatype=XSD.integer)
    for predicate, value in (
        (RDFS.domain, owner),
        (RDFS.range, contained_set),
        (RDF.type, RDF.Property),
        (THM.quantity, amount),
    ):
        g.add((link, predicate, value))
# %%
# Model each inventory/minifigure row as its own property-like node that points
# from the inventory (domain) to the minifigure (range) and carries the quantity.
for inventory_minifig in re_inventory_minifigs.itertuples(index=False):
    inventory_minifig_ref = thm[f"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}"]
    inventory_ref = thm[f"inventory/{inventory_minifig.inventory_id}"]
    # Minifigures are minted under "minifig/<fig_num>" by the minifigures cell;
    # the previous "minifig/lego/..." prefix pointed at nodes that are never
    # described.
    minifig_ref = thm[f"minifig/{inventory_minifig.fig_num}"]
    g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))
    g.add((inventory_minifig_ref, RDFS.range, minifig_ref))
    g.add((inventory_minifig_ref, RDF.type, RDF.Property))
    g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))
# %% [markdown]
# Serialize the graph in turtle format
# %% [markdown]
# ```
# ___-------___
# _-~~ ~~-_
# _-~ /~-_
# /^\__/^\ /~ \ / \
# /| O|| O| / \_______________/ \
# | |___||__| / / \ \
# | \ / / \ \
# | (_______) /______/ \_________ \
# | / / \ / \
# \ \^\\ \ / \ /
# \ || \______________/ _-_ //\__//
# \ ||------_-~~-_ ------------- \ --/~ ~\ || __/
# ~-----||====/~ |==================| |/~~~~~
# (_(__/ ./ / \_\ \.
# (_(___/ \_____)_)
# ```
# %%
# Register the "thmont" prefix for the ontology namespace, then write the whole
# graph to disk as Turtle.
g.bind(prefix="thmont", namespace=THM)
g.serialize(destination="lego_graph_rebrickable.ttl", format="turtle")