example-queries #3
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
212
lego/graph.py
212
lego/graph.py
|
|
@ -1,212 +0,0 @@
|
|||
# %% [markdown]
|
||||
# Build the Lego Knowledge Graph using the sources in `/data`.
|
||||
|
||||
# %%
|
||||
from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
# %% [markdown]
|
||||
# Set up the requirements for building a knowledge graph
|
||||
|
||||
# %%
|
||||
# Two namespaces are used throughout the script:
#   thm - base for the URIs of the described entities (colors, parts, sets, ...)
#   THM - ontology terms (the predicates used in the triples)
thm = Namespace("https://thm.de/")
THM = Namespace("https://thm.de/ont/")

# The single in-memory graph that all following cells add triples to.
g = Graph()
|
||||
|
||||
# %% [markdown]
|
||||
# # Rebrickable
|
||||
|
||||
# %% [markdown]
|
||||
# 
|
||||
|
||||
# %%
|
||||
# Read every Rebrickable CSV dump into its own DataFrame. The targets on the
# left and the paths on the right are listed in the same order.
(
    re_colors,
    re_elements,
    re_inventories,
    re_inventory_minifigs,
    re_inventory_parts,
    re_inventory_sets,
    re_minifigs,
    re_part_categories,
    re_part_relationships,
    re_parts,
    re_sets,
    re_themes,
) = map(
    pd.read_csv,
    (
        "data/rebrickable/colors.csv",
        "data/rebrickable/elements.csv",
        "data/rebrickable/inventories.csv",
        "data/rebrickable/inventory_minifigs.csv",
        "data/rebrickable/inventory_parts.csv",
        "data/rebrickable/inventory_sets.csv",
        "data/rebrickable/minifigs.csv",
        "data/rebrickable/part_categories.csv",
        "data/rebrickable/part_relationships.csv",
        "data/rebrickable/parts.csv",
        "data/rebrickable/sets.csv",
        "data/rebrickable/themes.csv",
    ),
)
|
||||
|
||||
# %% [markdown]
|
||||
# Colors
|
||||
|
||||
# %%
|
||||
# Describe each color row: label, RGB value, transparency flag and, where the
# CSV provides them, the first and last year of appearance.
for color in re_colors.itertuples(index=False):
    color_ref = thm[f"colors/{color.id}"]

    for triple in (
        (color_ref, RDFS.label, Literal(color.name, lang="en")),
        (color_ref, THM.color, Literal(color.rgb)),
        (color_ref, THM.is_transparent, Literal(color.is_trans, datatype=XSD.boolean)),
    ):
        g.add(triple)

    # y1 / y2 can be missing; a year is stored as 1 January of that year.
    if pd.notna(color.y1):
        # First appearance
        g.add((color_ref, THM.first_year, Literal(datetime(int(color.y1), 1, 1))))
    if pd.notna(color.y2):
        # Last appearance
        g.add((color_ref, THM.last_year, Literal(datetime(int(color.y2), 1, 1))))
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# Part Categories
|
||||
|
||||
# %%
|
||||
# Add one labelled node per part category.
for part_category in re_part_categories.itertuples(index=False):
    part_category_ref = thm[f"part_category/{part_category.id}"]

    # The label must be the human-readable category name. The previous code
    # passed `part_category_ref`, which stored the node's own URI as its label.
    # NOTE(review): assumes part_categories.csv has a `name` column, like the
    # other Rebrickable tables used in this script - confirm.
    g.add((part_category_ref, RDFS.label, Literal(part_category.name, lang="en")))
|
||||
|
||||
# %% [markdown]
|
||||
# Parts
|
||||
|
||||
# %%
|
||||
# Add every part with its label, its category link and its material.
for part in re_parts.itertuples(index=False):
    part_ref = thm[f"part/{part.part_num}"]
    category_ref = thm[f"part_category/{part.part_cat_id}"]

    for triple in (
        (part_ref, RDFS.label, Literal(part.name, lang="en")),
        (part_ref, THM.part_category, category_ref),
        (part_ref, THM.part_material, Literal(part.part_material)),
    ):
        g.add(triple)
|
||||
|
||||
# %% [markdown]
|
||||
# Elements
|
||||
|
||||
# %%
|
||||
# Each element row pairs a part with a color; record that pairing directly
# on the part node.
for element in re_elements.itertuples(index=False):
    part_ref, color_ref = (
        thm[f"part/{element.part_num}"],
        thm[f"colors/{element.color_id}"],
    )
    triple = (part_ref, THM.has_color, color_ref)
    g.add(triple)
|
||||
|
||||
# %% [markdown]
|
||||
# Part Relationships
|
||||
|
||||
# %%
|
||||
# Link each parent part to its child part (parent --has_child--> child).
for part_relationship in re_part_relationships.itertuples(index=False):
    part_ref_parent, part_ref_child = (
        thm[f"part/{part_relationship.parent_part_num}"],
        thm[f"part/{part_relationship.child_part_num}"],
    )
    triple = (part_ref_parent, THM.has_child, part_ref_child)
    g.add(triple)
|
||||
|
||||
# %% [markdown]
|
||||
# Themes
|
||||
|
||||
# %%
|
||||
# Add every theme with its label; themes that have a parent id also get a
# link to that parent theme.
for theme in re_themes.itertuples(index=False):
    theme_ref = thm[f"theme/{int(theme.id)}"]
    label = Literal(theme.name, lang="en")
    g.add((theme_ref, RDFS.label, label))

    # parent_id is missing for themes without a parent
    if pd.notna(theme.parent_id):
        parent_ref = thm[f"theme/{int(theme.parent_id)}"]
        g.add((theme_ref, THM.parent_theme, parent_ref))
|
||||
|
||||
# %% [markdown]
|
||||
# Sets
|
||||
|
||||
# %%
|
||||
# Add every Rebrickable set: label, release year (stored as 1 January of that
# year), theme link, part count and the fixed brand "Lego".
for lego_set in re_sets.itertuples(index=False):
    set_ref = thm[f"set/lego/{lego_set.set_num}"]

    release = datetime(int(lego_set.year), 1, 1)
    theme_ref = thm[f"theme/{int(lego_set.theme_id)}"]
    part_count = Literal(int(lego_set.num_parts), datatype=XSD.integer)

    for triple in (
        (set_ref, RDFS.label, Literal(lego_set.name, lang="en")),
        (set_ref, THM.year, Literal(release)),
        (set_ref, THM.theme, theme_ref),
        (set_ref, THM.num_parts, part_count),
        (set_ref, THM.brand, Literal("Lego")),
    ):
        g.add(triple)
|
||||
|
||||
# %% [markdown]
|
||||
# Minifigures
|
||||
|
||||
# %%
|
||||
# Add every minifigure with its label and part count.
for minifig in re_minifigs.itertuples(index=False):
    minifig_ref = thm[f"minifig/{minifig.fig_num}"]

    # The subject must be the minifigure itself. The previous code used
    # `set_ref`, a leftover variable from the sets loop, so every minifig
    # label and part count was attached to the last processed set and the
    # minifig nodes were never described.
    g.add((minifig_ref, RDFS.label, Literal(minifig.name, lang="en")))
    g.add((minifig_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))
|
||||
|
||||
# %% [markdown]
|
||||
# Now the ugly part: Inventories
|
||||
|
||||
# %%
|
||||
# Connect every inventory to the set it belongs to.
for inventory in re_inventories.itertuples(index=False):
    inventory_ref = thm[f"inventory/{inventory.id}"]

    # Set nodes are created under "set/lego/<set_num>" in the sets loop; the
    # previous "sets/lego/..." path pointed at nodes that do not exist.
    g.add((inventory_ref, THM.set, thm[f"set/lego/{inventory.set_num}"]))
|
||||
|
||||
# %% [markdown]
|
||||
# Inventories relate sets, minifigures and parts to each other, creating a kind of "top level set"
|
||||
# (this takes a lot of time)
|
||||
|
||||
# %%
|
||||
# Model every "inventory contains part" row as its own node that carries the
# quantity, the spare flag and the color of that entry.
# NOTE(review): the node URI is built from inventory id and part number only,
# so rows that differ just in color or spare flag would share one node -
# confirm whether that is intended.
for inventory_part in re_inventory_parts.itertuples(index=False):
    inventory_part_ref = thm[f"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}"]

    inventory_ref = thm[f"inventory/{inventory_part.inventory_id}"]
    part_ref = thm[f"part/{inventory_part.part_num}"]

    g.add((inventory_part_ref, RDFS.domain, inventory_ref))
    g.add((inventory_part_ref, RDFS.range, part_ref))
    g.add((inventory_part_ref, RDF.type, RDF.Property))

    g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))
    g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))
    # Color nodes are created under "colors/<id>" in the colors loop (and
    # referenced that way by the elements loop); the previous "color/..." path
    # pointed at nodes that do not exist.
    g.add((inventory_part_ref, THM.color, thm[f"colors/{inventory_part.color_id}"]))
|
||||
|
||||
# %%
|
||||
# Model every "inventory contains set" row as its own property node between
# the inventory (domain) and the contained set (range), with a quantity.
for inventory_set in re_inventory_sets.itertuples(index=False):
    inventory_set_ref = thm[f"inventory_set/{inventory_set.inventory_id}/{inventory_set.set_num}"]

    inventory_ref, set_ref = (
        thm[f"inventory/{inventory_set.inventory_id}"],
        thm[f"set/lego/{inventory_set.set_num}"],
    )
    quantity = Literal(int(inventory_set.quantity), datatype=XSD.integer)

    for triple in (
        (inventory_set_ref, RDFS.domain, inventory_ref),
        (inventory_set_ref, RDFS.range, set_ref),
        (inventory_set_ref, RDF.type, RDF.Property),
        (inventory_set_ref, THM.quantity, quantity),
    ):
        g.add(triple)
|
||||
|
||||
# %%
|
||||
# Model every "inventory contains minifigure" row as its own property node
# between the inventory (domain) and the minifigure (range), with a quantity.
for inventory_minifig in re_inventory_minifigs.itertuples(index=False):
    inventory_minifig_ref = thm[f"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}"]

    inventory_ref = thm[f"inventory/{inventory_minifig.inventory_id}"]
    # Minifig nodes are created under "minifig/<fig_num>" in the minifigs
    # loop; the previous "minifig/lego/..." path pointed at nodes that do
    # not exist.
    minifig_ref = thm[f"minifig/{inventory_minifig.fig_num}"]

    g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))
    g.add((inventory_minifig_ref, RDFS.range, minifig_ref))
    g.add((inventory_minifig_ref, RDF.type, RDF.Property))

    g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))
|
||||
|
||||
# %% [markdown]
|
||||
# Serialize the graph in turtle format
|
||||
|
||||
# %% [markdown]
|
||||
# ```
|
||||
# ___-------___
|
||||
# _-~~ ~~-_
|
||||
# _-~ /~-_
|
||||
# /^\__/^\ /~ \ / \
|
||||
# /| O|| O| / \_______________/ \
|
||||
# | |___||__| / / \ \
|
||||
# | \ / / \ \
|
||||
# | (_______) /______/ \_________ \
|
||||
# | / / \ / \
|
||||
# \ \^\\ \ / \ /
|
||||
# \ || \______________/ _-_ //\__//
|
||||
# \ ||------_-~~-_ ------------- \ --/~ ~\ || __/
|
||||
# ~-----||====/~ |==================| |/~~~~~
|
||||
# (_(__/ ./ / \_\ \.
|
||||
# (_(___/ \_____)_)
|
||||
# ```
|
||||
|
||||
# %%
|
||||
# Register the "thmont" prefix for the ontology namespace so the Turtle output
# can abbreviate those URIs, then write the whole graph to disk.
g.bind("thmont", THM)
g.serialize(destination="lego_graph_rebrickable.ttl", format="turtle")
|
||||
|
||||
|
||||
|
|
@ -10,15 +10,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 257,
|
||||
"id": "90209948",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rdflib import Graph, Namespace, XSD, OWL, RDF, RDFS, SKOS, URIRef, Literal\n",
|
||||
"import pandas as pd\n",
|
||||
"from datetime import datetime\n",
|
||||
"import os"
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -31,7 +30,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 258,
|
||||
"id": "8e573135",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -59,7 +58,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 259,
|
||||
"id": "d8a1fe84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -88,7 +87,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 260,
|
||||
"id": "ae505704",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -96,16 +95,17 @@
|
|||
"for color in re_colors.itertuples(index=False):\n",
|
||||
" color_ref = thm[f\"color/{color.id}\"]\n",
|
||||
"\n",
|
||||
" g.add((color_ref, RDF.type, THM.Color ))\n",
|
||||
" g.add((color_ref, RDFS.label, Literal(color.name, lang=\"en\")))\n",
|
||||
" g.add((color_ref, THM.color, Literal(color.rgb)))\n",
|
||||
" g.add((color_ref, THM.rgbcolor, Literal(color.rgb)))\n",
|
||||
" g.add((color_ref, THM.is_transparent, Literal(color.is_trans, datatype=XSD.boolean)))\n",
|
||||
" \n",
|
||||
" if not pd.isna(color.y1):\n",
|
||||
" # First appearance\n",
|
||||
" g.add((color_ref, THM.first_year, Literal(datetime(year = int(color.y1), month=1, day=1))))\n",
|
||||
" g.add((color_ref, THM.first_year, Literal(int(color.y1), datatype=XSD.integer)))\n",
|
||||
" if not pd.isna(color.y2):\n",
|
||||
" # Last appearance\n",
|
||||
" g.add((color_ref, THM.last_year, Literal(datetime(year = int(color.y2), month=1, day=1))))\n"
|
||||
" g.add((color_ref, THM.last_year, Literal(int(color.y2), datatype=XSD.integer)))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -118,7 +118,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 261,
|
||||
"id": "fb9e17d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -126,6 +126,7 @@
|
|||
"for part_category in re_part_categories.itertuples(index=False):\n",
|
||||
" part_category_ref = thm[f\"part_category/{part_category.id}\"]\n",
|
||||
"\n",
|
||||
" g.add((part_category_ref, RDF.type, THM.PartCategory ))\n",
|
||||
" g.add((part_category_ref, RDFS.label, Literal(part_category_ref, lang=\"en\")))"
|
||||
]
|
||||
},
|
||||
|
|
@ -139,7 +140,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 262,
|
||||
"id": "8fdb080e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -147,6 +148,7 @@
|
|||
"for part in re_parts.itertuples(index=False):\n",
|
||||
" part_ref = thm[f\"part/{part.part_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((part_ref, RDF.type, THM.Part))\n",
|
||||
" g.add((part_ref, RDFS.label, Literal(part.name, lang=\"en\")))\n",
|
||||
" g.add((part_ref, THM.part_category, thm[f\"part_category/{part.part_cat_id}\"]))\n",
|
||||
" g.add((part_ref, THM.part_material, Literal(part.part_material)))"
|
||||
|
|
@ -162,16 +164,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 263,
|
||||
"id": "579b1d67",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for element in re_elements.itertuples(index=False):\n",
|
||||
" element_ref = thm[f\"element/{element.element_id}\"]\n",
|
||||
" part_ref = thm[f\"part/{element.part_num}\"]\n",
|
||||
" color_ref = thm[f\"color/{element.color_id}\"]\n",
|
||||
"\n",
|
||||
" g.add((part_ref, THM.has_color, color_ref))"
|
||||
" g.add((element_ref, RDF.type, RDF.Property))\n",
|
||||
" g.add((element_ref, RDF.type, THM.Element))\n",
|
||||
"\n",
|
||||
" g.add((element_ref, RDFS.domain, THM.Part))\n",
|
||||
" g.add((element_ref, RDFS.range, THM.Color))\n",
|
||||
"\n",
|
||||
" g.add((element_ref, THM.color, color_ref))\n",
|
||||
" g.add((part_ref, THM.part, part_ref))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -184,7 +194,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 264,
|
||||
"id": "00db079a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -193,7 +203,8 @@
|
|||
" part_ref_parent = thm[f\"part/{part_relationship.parent_part_num}\"]\n",
|
||||
" part_ref_child = thm[f\"part/{part_relationship.child_part_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((part_ref_parent, THM.has_child, part_ref_child))"
|
||||
" g.add((part_ref_parent, THM.has_child, part_ref_child))\n",
|
||||
" g.add((part_ref_child, THM.has_parent, part_ref_parent))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -206,7 +217,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 265,
|
||||
"id": "1a529aae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -214,6 +225,7 @@
|
|||
"for theme in re_themes.itertuples(index=False):\n",
|
||||
" theme_ref = thm[f\"theme/{int(theme.id)}\"]\n",
|
||||
"\n",
|
||||
" g.add((theme_ref, RDF.type, THM.Theme))\n",
|
||||
" g.add((theme_ref, RDFS.label, Literal(theme.name, lang=\"en\")))\n",
|
||||
"\n",
|
||||
" if not pd.isna(theme.parent_id):\n",
|
||||
|
|
@ -230,7 +242,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 266,
|
||||
"id": "29b357ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -238,8 +250,9 @@
|
|||
"for lego_set in re_sets.itertuples(index=False):\n",
|
||||
" set_ref = thm[f\"set/lego/{lego_set.set_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((set_ref, RDF.type, THM.Set))\n",
|
||||
" g.add((set_ref, RDFS.label, Literal(lego_set.name, lang=\"en\")))\n",
|
||||
" g.add((set_ref, THM.year, Literal(datetime(int(lego_set.year), 1, 1))))\n",
|
||||
" g.add((set_ref, THM.year, Literal(int(lego_set.year), datatype=XSD.integer)))\n",
|
||||
" g.add((set_ref, THM.theme, thm[f\"theme/{int(lego_set.theme_id)}\"]))\n",
|
||||
" g.add((set_ref, THM.num_parts, Literal(int(lego_set.num_parts), datatype=XSD.integer)))\n",
|
||||
" g.add((set_ref, THM.brand, Literal(\"Lego\")))"
|
||||
|
|
@ -255,7 +268,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 267,
|
||||
"id": "a67b3e70",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -263,8 +276,9 @@
|
|||
"for minifig in re_minifigs.itertuples(index=False):\n",
|
||||
" minifig_ref = thm[f\"minifig/{minifig.fig_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((set_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n",
|
||||
" g.add((set_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))"
|
||||
" g.add((minifig_ref, RDF.type, THM.Minifig))\n",
|
||||
" g.add((minifig_ref, RDFS.label, Literal(minifig.name, lang=\"en\")))\n",
|
||||
" g.add((minifig_ref, THM.num_parts, Literal(int(minifig.num_parts), datatype=XSD.integer)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -277,7 +291,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 268,
|
||||
"id": "0c97dc4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -285,6 +299,7 @@
|
|||
"for inventory in re_inventories.itertuples(index=False):\n",
|
||||
" inventory_ref = thm[f\"inventory/{inventory.id}\"]\n",
|
||||
"\n",
|
||||
" g.add((inventory_ref, RDF.type, THM.Inventory))\n",
|
||||
" g.add((inventory_ref, THM.set, thm[f\"set/lego/{inventory.set_num}\"]))"
|
||||
]
|
||||
},
|
||||
|
|
@ -299,29 +314,47 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 269,
|
||||
"id": "dc2ba03e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\nfor inventory_part in re_inventory_parts.itertuples(index=False):\\n inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\\n\\n inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\\n part_ref = thm[f\"part/{inventory_part.part_num}\"]\\n\\n g.add((inventory_part_ref, RDF.type, THM.PartInv))\\n g.add((inventory_part_ref, RDF.type, RDF.Property))\\n\\n g.add((inventory_part_ref, RDFS.domain, THM.Inventory))\\n g.add((inventory_part_ref, RDFS.range, THM.Part))\\n\\n g.add((inventory_ref, THM.contains, inventory_part_ref))\\n g.add((part_ref, THM.belongs, inventory_part_ref))\\n\\n g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\\n g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\\n g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 269,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"for inventory_part in re_inventory_parts.itertuples(index=False):\n",
|
||||
" inventory_part_ref = thm[f\"inventory_part/{inventory_part.inventory_id}/{inventory_part.part_num}\"]\n",
|
||||
" \n",
|
||||
" inventory_ref = thm[f\"inventory/{inventory_part.inventory_id}\"]\n",
|
||||
" part_ref = thm[f\"part/{inventory_part.part_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((inventory_part_ref, RDFS.domain, inventory_ref))\n",
|
||||
" g.add((inventory_part_ref, RDFS.range, part_ref))\n",
|
||||
" g.add((inventory_part_ref, RDF.type, THM.PartInv))\n",
|
||||
" g.add((inventory_part_ref, RDF.type, RDF.Property))\n",
|
||||
"\n",
|
||||
" g.add((inventory_part_ref, RDFS.domain, THM.Inventory))\n",
|
||||
" g.add((inventory_part_ref, RDFS.range, THM.Part))\n",
|
||||
" \n",
|
||||
" g.add((inventory_ref, THM.contains, inventory_part_ref))\n",
|
||||
" g.add((part_ref, THM.belongs, inventory_part_ref))\n",
|
||||
"\n",
|
||||
" g.add((inventory_part_ref, THM.quantity, Literal(int(inventory_part.quantity), datatype=XSD.integer)))\n",
|
||||
" g.add((inventory_part_ref, THM.is_spare, Literal(inventory_part.is_spare, datatype=XSD.boolean)))\n",
|
||||
" g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))"
|
||||
" g.add((inventory_part_ref, THM.color, thm[f\"color/{inventory_part.color_id}\"]))\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 270,
|
||||
"id": "8715a1cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -332,16 +365,21 @@
|
|||
" inventory_ref = thm[f\"inventory/{inventory_set.inventory_id}\"]\n",
|
||||
" set_ref = thm[f\"set/lego/{inventory_set.set_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((inventory_set_ref, RDFS.domain, inventory_ref))\n",
|
||||
" g.add((inventory_set_ref, RDFS.range, set_ref))\n",
|
||||
" g.add((inventory_set_ref, RDF.type, THM.SetInv))\n",
|
||||
" g.add((inventory_set_ref, RDF.type, RDF.Property))\n",
|
||||
"\n",
|
||||
" g.add((inventory_set_ref, RDFS.domain, THM.Inventory))\n",
|
||||
" g.add((inventory_set_ref, RDFS.range, THM.Set))\n",
|
||||
"\n",
|
||||
" g.add((inventory_ref, THM.contains, inventory_set_ref))\n",
|
||||
" g.add((set_ref, THM.belongs, inventory_set_ref))\n",
|
||||
" \n",
|
||||
" g.add((inventory_set_ref, THM.quantity, Literal(int(inventory_set.quantity), datatype=XSD.integer)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 271,
|
||||
"id": "08c2c580",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -350,12 +388,17 @@
|
|||
" inventory_minifig_ref = thm[f\"inventory_minifig/{inventory_minifig.inventory_id}/{inventory_minifig.fig_num}\"]\n",
|
||||
"\n",
|
||||
" inventory_ref = thm[f\"inventory/{inventory_minifig.inventory_id}\"]\n",
|
||||
" minifig_ref = thm[f\"minifig/lego/{inventory_minifig.fig_num}\"]\n",
|
||||
" minifig_ref = thm[f\"minifig/{inventory_minifig.fig_num}\"]\n",
|
||||
"\n",
|
||||
" g.add((inventory_minifig_ref, RDFS.domain, inventory_ref))\n",
|
||||
" g.add((inventory_minifig_ref, RDFS.range, minifig_ref))\n",
|
||||
" g.add((inventory_minifig_ref, RDF.type, THM.MinifigInv))\n",
|
||||
" g.add((inventory_minifig_ref, RDF.type, RDF.Property))\n",
|
||||
"\n",
|
||||
" g.add((inventory_minifig_ref, RDFS.domain, THM.Inventory))\n",
|
||||
" g.add((inventory_minifig_ref, RDFS.range, THM.Minifig))\n",
|
||||
"\n",
|
||||
" g.add((inventory_ref, THM.contains, inventory_minifig_ref))\n",
|
||||
" g.add((minifig_ref, THM.belongs, inventory_minifig_ref))\n",
|
||||
" \n",
|
||||
" g.add((inventory_minifig_ref, THM.quantity, Literal(int(inventory_minifig.quantity), datatype=XSD.integer)))"
|
||||
]
|
||||
},
|
||||
|
|
@ -377,17 +420,26 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 272,
|
||||
"id": "1e0ac437",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"bs_sets = pd.read_csv(\"./data/brickset/sets.csv\")"
|
||||
"bs_sets = pd.read_csv(\"./data/brickset/sets.csv\")\n",
|
||||
"bs_parts = pd.read_csv(\"./data/brickset/parts.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d120c079",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Add Set prices"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 273,
|
||||
"id": "fd944ccb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -397,14 +449,40 @@
|
|||
" set_ref = thm[f\"set/lego/{num}\"]\n",
|
||||
"\n",
|
||||
" if (set_ref, None, None) in g:\n",
|
||||
" if not pd.isna(bs_set.USRetailPrice):\n",
|
||||
" g.add((set_ref, THM.us_retail_price, Literal(bs_set.USRetailPrice, datatype=XSD.float)))\n",
|
||||
" if not pd.isna(bs_set.DERetailPrice):\n",
|
||||
" g.add((set_ref, THM.de_retail_price, Literal(bs_set.DERetailPrice, datatype=XSD.float)))\n",
|
||||
" if not pd.isna(bs_set.UKRetailPrice):\n",
|
||||
" g.add((set_ref, THM.us_retail_price, Literal(bs_set.UKRetailPrice, datatype=XSD.float)))\n",
|
||||
" if not pd.isna(bs_set.CARetailPrice):\n",
|
||||
" g.add((set_ref, THM.ca_retail_price, Literal(bs_set.CARetailPrice, datatype=XSD.float)))"
|
||||
" #brickset prices already in euro\n",
|
||||
" #choose the cheapest price since the usual customer wont choose the highest price\n",
|
||||
" options = [bs_set.USRetailPrice, bs_set.DERetailPrice, bs_set.UKRetailPrice, bs_set.CARetailPrice]\n",
|
||||
" options = [int(opt) for opt in options if not pd.isna(opt)]\n",
|
||||
"\n",
|
||||
" if len(options) >= 1:\n",
|
||||
" cheapest = min(options)\n",
|
||||
" g.add((set_ref, THM.price_new, Literal(cheapest, datatype=XSD.float)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cbd69fa6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Only concrete elements (parts considering their shape, color and print) can have prices"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 274,
|
||||
"id": "307a3210",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"for bs_element in bs_parts.itertuples(index=False):\n",
|
||||
" element_ref = thm[f\"element/{bs_element.ElementID}\"]\n",
|
||||
" \n",
|
||||
" if (element_ref, None, None) in g:\n",
|
||||
" if not pd.isna(bs_element.BrickLinkSoldPriceNew):\n",
|
||||
" g.add((element_ref, THM.price_new, Literal(bs_element.BrickLinkSoldPriceNew, datatype=XSD.float)))\n",
|
||||
" if not pd.isna(bs_element.BrickLinkSoldPriceUsed):\n",
|
||||
" g.add((element_ref, THM.price_used, Literal(bs_element.BrickLinkSoldPriceUsed, datatype=XSD.float)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -419,7 +497,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 275,
|
||||
"id": "a8beb593",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -431,17 +509,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "b14e6e89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"additional_entries = 0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 276,
|
||||
"id": "bbf5462b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -450,10 +518,9 @@
|
|||
" set_ref = thm[f\"set/lego/{bl_set.set_id}\"]\n",
|
||||
"\n",
|
||||
" if not (set_ref, None, None) in g:\n",
|
||||
" additional_entries += 1\n",
|
||||
" g.add((set_ref, RDFS.label, Literal(lego_set.name, lang=\"en\")))\n",
|
||||
" if not pd.isna(bl_set.year) and str(bl_set.year).isnumeric():\n",
|
||||
" g.add((set_ref, THM.first_year, Literal(datetime(int(bl_set.year), 1, 1))))\n",
|
||||
" g.add((set_ref, THM.year, Literal(int(bl_set.year))))\n",
|
||||
" if not pd.isna(bl_set.parts) and str(bl_set.parts).isnumeric():\n",
|
||||
" g.add((set_ref, THM.num_parts, Literal(int(bl_set.parts), datatype=XSD.integer)))\n",
|
||||
" g.add((set_ref, THM.brand, Literal(\"Lego\")))"
|
||||
|
|
@ -461,50 +528,139 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 277,
|
||||
"id": "ef52582e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\nfor bl_part in bl_parts.itertuples(index=False):\\n part_ref = thm[f\"part/{bl_part.part_id}\"]\\n\\n if not (part_ref, None, None) in g:\\n additional_entries += 1\\n g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 277,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"for bl_part in bl_parts.itertuples(index=False):\n",
|
||||
" part_ref = thm[f\"part/{bl_part.part_id}\"]\n",
|
||||
"\n",
|
||||
" if not (part_ref, None, None) in g:\n",
|
||||
" additional_entries += 1\n",
|
||||
" g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))"
|
||||
" g.add((part_ref, RDFS.label, Literal(bl_part.part_name, lang=\"en\")))\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 278,
|
||||
"id": "8bf0ffeb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\nfor bl_minifig in bl_minifigs.itertuples(index=False):\\n minifig_ref = thm[f\"minfig/{bl_minifig.minifig_id}\"]\\n\\n if not (minifig_ref, None, None) in g:\\n g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 278,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"for bl_minifig in bl_minifigs.itertuples(index=False):\n",
|
||||
" minifig_ref = thm[f\"minfig/{bl_minifig.minifig_id}\"]\n",
|
||||
"\n",
|
||||
" if not (minifig_ref, None, None) in g:\n",
|
||||
" additional_entries += 1\n",
|
||||
" g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))"
|
||||
" g.add((minifig_ref, RDFS.label, Literal(bl_minifig.minifig_name, lang=\"en\")))\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e73471b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Merlins Steine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "3491b032",
|
||||
"execution_count": 279,
|
||||
"id": "ab1ec488",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"me_sets = pd.read_csv(\"./data/merlin/others.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 293,
|
||||
"id": "9bcd2956",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t = me_sets[me_sets[\"brand\"] == \"Pantasy\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 294,
|
||||
"id": "9ab21460",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t[\"ratio\"] = t[\"price_eur\"] / t[\"num_parts\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 295,
|
||||
"id": "459c3a4c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Added 107748 items\n"
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"np.float64(0.43016261640379705)"
|
||||
]
|
||||
},
|
||||
"execution_count": 295,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(f\"Added {additional_entries} items\")"
|
||||
"t[\"ratio\"].mean()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 282,
|
||||
"id": "bfcf2840",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for me_set in me_sets.itertuples(index=False):\n",
|
||||
" if not pd.isna(me_set.brand) and not pd.isna(me_set.id):\n",
|
||||
" set_ref = thm[f\"set/{me_set.brand}/{me_set.id}\"]\n",
|
||||
"\n",
|
||||
" g.add((set_ref, RDF.type, THM.Set))\n",
|
||||
" g.add((set_ref, RDFS.label, Literal(me_set.name, lang=\"en\")))\n",
|
||||
" if not pd.isna(me_set.release):\n",
|
||||
" g.add((set_ref, THM.year, Literal(int(me_set.release), datatype=XSD.integer)))\n",
|
||||
"\n",
|
||||
" if not pd.isna(me_set.num_parts):\n",
|
||||
" g.add((set_ref, THM.num_parts, Literal(int(me_set.num_parts), datatype=XSD.integer)))\n",
|
||||
" g.add((set_ref, THM.brand, Literal(me_set.brand)))\n",
|
||||
" if not pd.isna(me_set.price_eur):\n",
|
||||
" g.add((set_ref, THM.price_new, Literal(me_set.price_eur, datatype=XSD.float)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -541,25 +697,25 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 283,
|
||||
"id": "1a30bff8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<Graph identifier=N30c6d515851c45f1af93153d75c76ea9 (<class 'rdflib.graph.Graph'>)>"
|
||||
"<Graph identifier=Naee4bab906a6444290a3659ffe0fbd45 (<class 'rdflib.graph.Graph'>)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"execution_count": 283,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"g.bind(\"thmont\", THM)\n",
|
||||
"g.bind(\"thm\", THM)\n",
|
||||
"\n",
|
||||
"g.serialize(\"lego_graph_rebrickable.ttl\", format=\"turtle\")"
|
||||
"g.serialize(\"lego_graph.ttl\", format=\"turtle\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -30,7 +30,7 @@
|
|||
a4paper,margin=25mm
|
||||
}
|
||||
|
||||
\title{\huge{Knowledgegraphen - Lego}}
|
||||
\title{\huge{Knowledge Graph - Lego}}
|
||||
\date{\today}
|
||||
\author{
|
||||
\begin{tabular}{ccc}
|
||||
|
|
@ -49,7 +49,7 @@
|
|||
%... then configure it.
|
||||
\fancyhead{} % clear all header fields
|
||||
\fancyhead[L]{Lego}
|
||||
\fancyhead[R]{KGR - Knowledgegraphen}
|
||||
\fancyhead[R]{KGR - Knowledge Graphen}
|
||||
\fancyfoot{} % clear all footer fields
|
||||
\fancyfoot[LE,RO]{\thepage}
|
||||
|
||||
|
|
@ -74,16 +74,16 @@
|
|||
\begin{enumerate}
|
||||
\item \label{item:min_set_count} Was ist die minimale Anzahl an Sets, die benötigt wird um ein anderes Set zusammenzubauen?
|
||||
\item \label{item:min_set_price} Was ist der geringste Preis einer Auswahl an Sets um ein anderes Set zusammenzubauen?
|
||||
\item \label{item:set_span} Welche anderen Sets, können mit Sets, die sich schon im eigenen Besitz befinden zusammengebaut werden?
|
||||
\item Sind Sets von anderen Herstellern im Vergleich zu Lego Sets, auf den durchschnittlichen Teilepreis betrachtet billiger?
|
||||
\item Haben neuere Sets im Vergleich zu älteren Sets eine geringere Teileanzahl, da auf eine grössere Anzahl an speziell angefertigten Teilen zugegriffen werden kann?
|
||||
\item Haben Sets mit höherer Teileanzahl eine höhere Anzahl an Minifiguren?
|
||||
\item \label{item:set_span} Welche anderen Sets, können mit Sets, die sich schon im eigenen Besitz befinden zusammengebaut werden?
|
||||
\item \label{item:equivalent_part} Welche Lego-Teile besitzen äquivalente Teile von anderen Marken?
|
||||
\end{enumerate}
|
||||
|
||||
\subsection{Nutzen}
|
||||
|
||||
Wird ein Set an Klemmbausteinen nicht mehr vertrieben und man möchte das Set dennoch haben, so ergeben sich mehrere Möglichkeiten:
|
||||
Wird ein Set an Klemmbausteinen nicht mehr vertrieben, nach welchem dennoch Nachfrage besteht, existieren folgende Möglichkeiten:
|
||||
\begin{itemize}
|
||||
\item Man kauft das Set von einem Zweitanbieter
|
||||
\item Man stellt sich die benötigten Teile des Sets selbst zusammen. Dies geschieht entweder indem die Teile einzeln von Zweitanbietern gekauft werden oder durch den Erwerb von anderen Sets, welche die benötigten Teile enthalten. Siehe Fragen: \ref{item:min_set_count} und \ref{item:min_set_price}.
|
||||
|
|
@ -114,6 +114,7 @@
|
|||
\begin{figure}[H]
|
||||
\includegraphics[width=\columnwidth]{bilder/downloads_schema_v3.png}
|
||||
\caption{Datenbankschema \textit{Rebrickable} \cite{FreeLEGOCatalog}}
|
||||
\label{fig:rebrickable_scheme}
|
||||
\end{figure}
|
||||
|
||||
Der Datensatz konnte über die URL \url{https://rebrickable.com/downloads/} erhalten werden.
|
||||
|
|
@ -127,7 +128,7 @@
|
|||
\toprule
|
||||
& Brickset \\ \midrule
|
||||
URL & \url{https://brickset.com/}\\
|
||||
Beschaffung & Webscraping/CSV-Download \\
|
||||
Beschaffung & CSV-Download \\
|
||||
Lizenz & nicht spezifiziert \\
|
||||
Erhalt & 23.04.2026 \\ \bottomrule
|
||||
\end{tabularx}
|
||||
|
|
@ -179,14 +180,17 @@
|
|||
|
||||
\subsection{Integrationsprozess}
|
||||
|
||||
Jedes von Lego veröffentlichte Teil besitzt der Form zugrunde eine eindeutige Teile-Nummer, auch Design-ID genannt. Die Teilenummer wird nur aufgrund der Form eines Legosteins vergeben und kann auf dem Lego-Stein abgelesen werden. Üblicherweise besitzt eine Design-ID 4-5 Stellen. Abhängig von der Form, Farbe und des Drucks besitzt jeder Lego-Stein eine 6-7 stellige Element-ID. Diese Element-ID lässt sich Lego-Sets besitzen ebenfalls eine Set-Nummer. Allerdings gibt es spezielle Lego-Sets, welche in Teil-Sets aufgegliedert werden oder mehrere Bauvarianten besitzen \cite{FreeLEGOCatalog}. Diese sind in der Modellierung dieser Arbeit nicht weiter berücksichtigt. Stattdessen wird das zugrundeliegende Hauptset betrachtet. Lego-Minifiguren erhalten durch Lego keine eindeutige Identifikationsnummer. Zur eindeutigen Identifikation von Minifiguren wird die von \textit{Rebrickable} vergebene ID verwendet.\\
|
||||
Da die einzige Quelle für andere Hersteller nur \textit{Merlins Steine} ist und diese nur Sets enthält, wird der Hersteller in der IRI miteinbezogen.
|
||||
Jedes von Lego veröffentlichte Teil besitzt auf Grundlage seiner Form eine eindeutige Teile-Nummer, auch Design-ID genannt. Die Teilenummer wird nur aufgrund der Form eines Legosteins vergeben und kann auf dem Lego-Stein abgelesen werden. Üblicherweise besitzt eine Design-ID 4-5 Stellen. Abhängig von Form, Farbe und Druck besitzt jeder Lego-Stein eine 6- bis 7-stellige Element-ID. Die Element-IDs von Teilen eines Lego-Sets befinden sich als Auflistung aller Teile in der Bauanleitung eines Lego-Sets.\\
|
||||
Sets besitzen ebenfalls eine Set-Nummer. Einige Set-Nummern werden mit einem Suffix, bspw. \textit{-1}, angegeben. Dieses Suffix gibt Aufschluss über die Version des Sets. Allerdings gibt es spezielle Lego-Sets, welche in Teil-Sets aufgegliedert werden oder mehrere Bauvarianten besitzen \cite{FreeLEGOCatalog}. Diese Art von Sets wird mithilfe von Inventaren modelliert (siehe Abbildung \ref{fig:rebrickable_scheme}). Ein Inventar kann als übergeordnetes Set verstanden werden. Ein Inventar kann somit Set-, Minifiguren- und Teile-Inventare besitzen, die angeben, in welcher Stückzahl ein Teil-Set, eine Minifigur oder ein Teil vorhanden ist.\\
|
||||
Lego-Minifiguren erhalten durch Lego keine eindeutige Identifikationsnummer. Zur eindeutigen Identifikation von Minifiguren wird die von \textit{Rebrickable} vergebene ID verwendet. Auf Minifiguren-Seiten von \textit{Rebrickable} sind Referenzen zu IDs derselben Minifigur auf anderen Seiten enthalten. Diese Referenzen sind innerhalb des bereitgestellten Datensatzes nicht abgebildet.\\
|
||||
Da die einzige Quelle für andere Hersteller nur \textit{Merlins Steine} ist und diese nur Sets enthält, wird der Hersteller in der IRI nur für Sets miteinbezogen.
|
||||
\begin{verbatim}
|
||||
https://thm.de/set/{brand}/{id}
|
||||
https://thm.de/set/{brand}/{id}
|
||||
\end{verbatim}
|
||||
|
||||
Um die Dateigrösse des Graphen zu reduzieren, wurde \texttt{thm} statt \texttt{th-mannheim} verwendet.
|
||||
\begin{figure}[H]
|
||||
\includegraphics[width=\columnwidth]{bilder/example_part_number.png}
|
||||
\centering
|
||||
\includegraphics[width=0.8\columnwidth]{bilder/example_part_number.png}
|
||||
\caption{Lego Stein mit Teile-Nummer (Design-ID) 41769 \cite{cunninghamSellLEGOBricklink2018}}
|
||||
\label{fig:lego_example_part_number}
|
||||
\end{figure}
|
||||
|
|
@ -199,22 +203,170 @@
|
|||
|
||||
\subsection{Pipeline}
|
||||
|
||||
Die Datensätze von \textit{Bricklink} und \textit{Merlins Steine} wurden durch Webscraping erhoben. Entstandene Fehler durch Ausnahmefälle mussten manuell bereinigt werden. Demnach ist dieser Teil nicht automatisierbar. Abbildung \ref{fig:pipeline} zeigt die Pipeline zur Erstellung des Knowledge Graph.
|
||||
|
||||
\begin{figure}[H]
|
||||
\includegraphics[width=\columnwidth]{./bilder/kgr_pipeline1.drawio.png}
|
||||
\caption{Pipeline Erstellung Knowledge Graph}
|
||||
\label{fig:pipeline}
|
||||
\end{figure}
|
||||
|
||||
\section{Evaluation}
|
||||
|
||||
\subsection{Ergebnis}
|
||||
|
||||
Das Projekt kann unter der URL: \url{https://gitty.informatik.hs-mannheim.de/2211275/kgr} betrachtet werden.
|
||||
|
||||
Der resultierende Knowledge-Graph ist über 300 MB gross. Die Dateigrösse lässt sich auf die Zuordnungen von Teilen zu Inventaren zurückführen.
|
||||
\subsection{Beispiel-Queries}
|
||||
Erhalten der Gesamtheit aller Lego Star Wars Minifiguren:
|
||||
\begin{verbatim}
|
||||
SELECT DISTINCT ?name
|
||||
WHERE {
|
||||
?set thmont:theme ?theme.
|
||||
?theme rdf:type thmont:Theme.
|
||||
?set rdf:type thmont:Set.
|
||||
?theme rdfs:label "Star Wars"@en.
|
||||
?inventory thmont:set ?set.
|
||||
?inventory rdf:type thmont:Inventory.
|
||||
?inventory thmont:contains ?minifig_inv.
|
||||
?minifig_inv rdf:type thmont:MinifigInv.
|
||||
?minifig thmont:belongs ?minifig_inv.
|
||||
?minifig rdfs:label ?name.
|
||||
}
|
||||
\end{verbatim}
|
||||
Anzahl aller Minifiguren enthalten in allen Lego-Sets gruppiert nach Figur.
|
||||
\begin{verbatim}
|
||||
SELECT
|
||||
(SUM(?quantity) as ?sum) ?minifig ?name
|
||||
WHERE {
|
||||
?minifig rdf:type thm:Minifig.
|
||||
?minifig_inv rdf:type thm:MinifigInv.
|
||||
?minifig thm:belongs ?minifig_inv.
|
||||
?minifig_inv thm:quantity ?quantity.
|
||||
?minifig rdfs:label ?name.
|
||||
}
|
||||
GROUP BY ?minifig ?name
|
||||
ORDER BY DESC(?sum)
|
||||
\end{verbatim}
|
||||
Durchschnittliche Anzahl an Teilen je Set gruppiert nach Jahren.
|
||||
\begin{verbatim}
|
||||
SELECT ?year (AVG(?part_count) as ?avgp)
|
||||
WHERE {
|
||||
?set thm:year ?year.
|
||||
?set thm:num_parts ?part_count.
|
||||
}
|
||||
GROUP BY ?year
|
||||
ORDER BY DESC(?avgp)
|
||||
\end{verbatim}
|
||||
Durchschnittlicher Teilepreis gruppiert nach Marken. \label{verb:ppp_query}
|
||||
\begin{verbatim}
|
||||
SELECT ?brand (AVG(?price)/AVG(?num) as ?t)
|
||||
WHERE {
|
||||
?set thm:num_parts ?num.
|
||||
?set rdfs:label ?name.
|
||||
?set rdf:type thm:Set.
|
||||
?set thm:brand ?brand.
|
||||
?set thm:price_new ?price.
|
||||
FILTER (?num > 0)
|
||||
}
|
||||
GROUP BY ?brand
|
||||
ORDER BY DESC(?t)
|
||||
\end{verbatim}
|
||||
|
||||
\subsection{Abdeckung}
|
||||
|
||||
\subsection{Konsistenz}
|
||||
Tabelle \ref{tab:coverage} zeigt einen Überblick, welche der Prädikate (Graph, Name, Kategorie, Preise, Jahr) der Knowledge Graph für Minifiguren (Figs), Teile und Sets abdeckt (X = enthalten, - = fehlt). Die Zeile \textit{Graph} spiegelt wider, ob die jeweilige Entität überhaupt im Graphen vorhanden ist.
|
||||
\begin{table}[H]
|
||||
\centering
|
||||
\begin{tabular}{@{}lllllll@{}}
|
||||
\toprule
|
||||
& \multicolumn{3}{l}{Lego} & \multicolumn{3}{l}{Andere Marken} \\ \midrule
|
||||
& Figs & Teile & Sets & Figs & Teile & Sets \\ \midrule
|
||||
Graph & X & X & X & - & - & X \\
|
||||
Name & X & X & X & - & - & X\\
|
||||
Kateg. & - & X & X & - & - & - \\
|
||||
Preise & - & X & X & - & - & X \\
|
||||
Jahr & - & - & X & - & - & X \\ \bottomrule
|
||||
\end{tabular}
|
||||
\caption{Abdeckung des Graphen für Lego und weitere Klemmbausteinmarken}
|
||||
\label{tab:coverage}
|
||||
\end{table}
|
||||
Da Lego keine IDs für Minifiguren vergibt, ist das Erkennen zweier gleichartiger Figuren schwieriger. Die Preise von \textit{Brickset} konnten nicht den Minifiguren aus \textit{Rebrickable} zugeordnet werden. Da \textit{Rebrickable} die Zuordnung von Minifiguren zu Sets liefert, wurde die Entscheidung getroffen, auf die Preiszuordnung zu verzichten. Für Figuren und Teile weiterer Marken war eine Zuordnung nur schwer bis gar nicht möglich. Diese Zuordnung wäre beispielsweise durch Bilderkennung anhand vom Hersteller bereitgestellter Bauanleitungen in Form von PDF-Dateien möglich.
|
||||
|
||||
\subsection{Qualität}
|
||||
|
||||
\section{Ausblick}
|
||||
Es wird betrachtet, ob die ursprünglichen Fragestellungen mithilfe des Knowledge Graphen beantwortet werden können.
|
||||
|
||||
\begin{enumerate}
|
||||
\item Was ist die minimale Anzahl an Sets, die benötigt wird um ein anderes Set zusammenzubauen?
|
||||
\item Was ist der geringste Preis einer Auswahl an Sets um ein anderes Set zusammenzubauen?
|
||||
\item Welche anderen Sets, können mit Sets, die sich schon im eigenen Besitz befinden zusammengebaut werden?\\
|
||||
\textit{Der Knowledge Graph beinhaltet die Daten, sodass die Fragestellungen 1., 2., 3. beantwortet werden können. Eine effiziente Softwarelösung besitzt eine höhere geschätzte Komplexität und liegt ausserhalb des Rahmens dieser Arbeit.}
|
||||
\item Sind Sets von anderen Herstellern im Vergleich zu Lego Sets, auf den durchschnittlichen Teilepreis betrachtet billiger?\\
|
||||
\textit{Diese Frage kann mithilfe der letzten Beispiel-Query \ref{verb:ppp_query} beantwortet werden. Abbildung \ref{fig:ppp} zeigt die Teile-Preise je Hersteller. Lego hat den höchsten Teilepreis mit 0.097 \texteuro{} und BlueBrixx den niedrigsten mit 0.0437 \texteuro.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{./bilder/diagram_avg_part_price_brand.png}
|
||||
\caption{Hersteller sortiert nach durchschnittlichem Teile-Preis in \texteuro}
|
||||
\label{fig:ppp}
|
||||
\end{figure}
|
||||
}
|
||||
\item Haben neuere Sets im Vergleich zu älteren Sets eine geringere Teileanzahl, da auf eine grössere Anzahl an speziell angefertigten Teilen zugegriffen werden kann?\\
|
||||
\textit{Es ist das Gegenteil zu erkennen. Eine Erklärung dafür könnte sein, dass die Klemmbaustein-Marken mehrheitlich Sets für \ac{AFOL} herausbringen im Vergleich zu vorherigen Jahren. Diese Sets sind meistens komplexer und besitzen demnach eine höhere Teileanzahl.}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{./bilder/diagram_avg_parts_per_year.png}
|
||||
\caption{Durchschnittliche Teileanzahl nach Jahren aller Klemmbausteinmarken}
|
||||
\label{fig:avg_parts_per_year}
|
||||
\end{figure}
|
||||
\begin{verbatim}
|
||||
SELECT (AVG(?parts) as ?total) ?year
|
||||
WHERE {
|
||||
?set rdf:type thm:Set.
|
||||
?set thm:brand ?brand.
|
||||
?set thm:year ?year.
|
||||
?set rdfs:label ?name.
|
||||
?set thm:num_parts ?parts.
|
||||
}
|
||||
GROUP BY ?year
|
||||
ORDER BY ASC(?total)
|
||||
\end{verbatim}
|
||||
\item Haben Sets mit höherer Teileanzahl eine höhere Anzahl an Minifiguren?\\
|
||||
\textit{Zwischen der Anzahl Minifiguren $M$ und der Anzahl an Teilen $T$ eines Sets kann eine mittelstarke Korrelation $\rho_{M,T} \approx 0.5926$ festgestellt werden. Hier gab es mehrere interessante Ausreisser: \href{https://rebrickable.com/sets/BIGBOX-1/the-ultimate-battle-for-chima}{BIGBOX-1,The Ultimate Battle for Chima}, \href{https://rebrickable.com/sets/2000409-2/window-exploration-bag/}{2000409-2, Window Exploration Bag 2} und \href{https://rebrickable.com/sets/2000409-1/window-exploration-bag/}{2000409-1, Window Exploration Bag 1}.}
|
||||
\textit{
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{./bilder/diagram_correlation_parts_figs.png}
|
||||
\caption{Anzahl Minifiguren und Teile}
|
||||
\label{fig:scatter_parts_figs}
|
||||
\end{figure}
|
||||
}
|
||||
Der Datensatz konnte mithilfe folgender SPARQL-Query erhalten werden. Hier werden Sets mit 0 Teilen herausgefiltert.
|
||||
\begin{verbatim}
|
||||
SELECT ?part_num (SUM(?qty) as ?total)
|
||||
WHERE {
|
||||
?set rdf:type thm:Set.
|
||||
?set thm:brand ?brand.
|
||||
?set rdfs:label ?name.
|
||||
?set thm:num_parts ?part_num.
|
||||
?inv thm:set ?set.
|
||||
?inv rdf:type thm:Inventory.
|
||||
?inv thm:contains ?fig_inv.
|
||||
?fig_inv thm:quantity ?qty.
|
||||
FILTER(?part_num > 0).
|
||||
}
|
||||
GROUP BY ?set ?part_num ?name ?inv
|
||||
\end{verbatim}
|
||||
\item Welche Lego-Teile besitzen äquivalente Teile von anderen Marken?\\
|
||||
\textit{Der Knowledge Graph bietet keine Möglichkeit, dies zu beantworten, da keine Datensätze über Teile, die nicht von Lego produziert worden sind, in die Erstellung des Graphen eingeflossen sind.}
|
||||
\end{enumerate}
|
||||
|
||||
\section{Ausblick}
|
||||
\begin{itemize}
|
||||
\item Es fehlen tiefergehende Daten zu anderen Klemmbausteinmarken neben Lego. Ein Erhalten der Daten wäre nur durch direkte Anfrage beim Hersteller oder durch aufwändige Methodik wie Bilderkennung möglich.
|
||||
\item Identische Minifiguren könnten mit höherem Aufwand einander zugeordnet werden. So würde eine Verbindung zwischen der Inventarisierung durch \textit{Rebrickable} und den Preisen von \textit{Brickset} entstehen.
|
||||
\item Zur Beantwortung der Fragestellungen 1, 2 und 3 ist eine eigene Softwarelösung und eine Erweiterung des Graphen vonnöten. \textit{Rebrickable} bietet selber Werkzeuge (\url{https://rebrickable.com/help/build-engine/}) an, um diese Fragestellungen zu beantworten.
|
||||
\end{itemize}
|
||||
\section*{Abkürzungsverzeichnis}
|
||||
\begin{acronym}[Abkürzungsverzeichnis]
|
||||
\acro{MOC}{My Own Creation}
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 12 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
|
|
@ -0,0 +1,31 @@
|
|||
<mxfile host="app.diagrams.net">
|
||||
<diagram name="Seite-1" id="_iVW848-2TJ0zfREs3N2">
|
||||
<mxGraphModel dx="808" dy="425" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-11" edge="1" parent="1" source="Hu6JBMkSkJOX-7uFVOw6-2" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="Hu6JBMkSkJOX-7uFVOw6-3">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-2" parent="1" style="rounded=1;whiteSpace=wrap;html=1;" value="Bricklink,&nbsp;<span style="background-color: transparent; color: light-dark(rgb(0, 0, 0), rgb(255, 255, 255));">Brickset</span><div>Rebrickable,&nbsp;<span style="background-color: transparent; color: light-dark(rgb(0, 0, 0), rgb(255, 255, 255));">Merlin</span></div>" vertex="1">
|
||||
<mxGeometry height="50" width="110" x="50" y="150" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-12" edge="1" parent="1" source="Hu6JBMkSkJOX-7uFVOw6-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="Hu6JBMkSkJOX-7uFVOw6-8">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-3" parent="1" style="rounded=1;whiteSpace=wrap;html=1;" value="Datensätze als .CSV" vertex="1">
|
||||
<mxGeometry height="50" width="120" x="200" y="150" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;rounded=0;" value="Manuelle Abfrage &amp; Transformation" vertex="1">
|
||||
<mxGeometry height="30" width="100" x="130" y="210" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-8" parent="1" style="rounded=1;whiteSpace=wrap;html=1;" value="lego_graph.ttl" vertex="1">
|
||||
<mxGeometry height="50" width="120" x="360" y="150" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="Hu6JBMkSkJOX-7uFVOw6-10" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;rounded=0;" value="Jupyter Notebook" vertex="1">
|
||||
<mxGeometry height="30" width="100" x="290" y="210" as="geometry" />
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 14 KiB |
|
|
@ -0,0 +1 @@
|
|||
,Lambda/Roman,Lambda,03.05.2026 11:29,file:///C:/Users/Roman/AppData/Roaming/LibreOffice/4;
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
"brand","t"
|
||||
"Lego","0.09687904"
|
||||
"Cobi","0.07586302"
|
||||
"Pantasy","0.05728256"
|
||||
"MouldKing","0.05323224"
|
||||
"BlueBrixx","0.043733075"
|
||||
|
Binary file not shown.
|
|
@ -0,0 +1,78 @@
|
|||
"total","year"
|
||||
"0.0","2027"
|
||||
"1.0","1950"
|
||||
"100.263888888888888888888889","1973"
|
||||
"101.025641025641025641025641","1972"
|
||||
"103.288135593220338983050847","1985"
|
||||
"103.794871794871794871794872","1974"
|
||||
"104.323741007194244604316547","1992"
|
||||
"104.347058823529411764705882","1994"
|
||||
"105.636363636363636363636364","1980"
|
||||
"106.891608391608391608391608","1997"
|
||||
"113.566666666666666666666667","1967"
|
||||
"12.435897435897435897435897","1955"
|
||||
"12.5","1953"
|
||||
"122.129411764705882352941176","1988"
|
||||
"123.9125295508274231678487","1998"
|
||||
"124.003865979381443298969072","2012"
|
||||
"124.326375711574952561669829","2014"
|
||||
"125.840579710144927536231884","1993"
|
||||
"126.522070015220700152207002","2011"
|
||||
"127.65641025641025641025641","1995"
|
||||
"129.071428571428571428571429","1968"
|
||||
"131.587837837837837837837838","1991"
|
||||
"132.5","2013"
|
||||
"134.275735294117647058823529","2003"
|
||||
"134.780487804878048780487805","1975"
|
||||
"139.038461538461538461538462","2004"
|
||||
"14.25","1959"
|
||||
"140.74025974025974025974026","1978"
|
||||
"142.662337662337662337662338","1976"
|
||||
"147.16195121951219512195122","2015"
|
||||
"148.229656419529837251356239","2009"
|
||||
"159.889","2016"
|
||||
"16.0","1956"
|
||||
"161.840396753832281334535618","2017"
|
||||
"163.051724137931034482758621","1990"
|
||||
"165.224489795918367346938776","1996"
|
||||
"165.62577962577962577962578","2007"
|
||||
"166.35746606334841628959276","2006"
|
||||
"174.373493975903614457831325","2005"
|
||||
"175.333333333333333333333333","1960"
|
||||
"179.64218455743879472693032","2010"
|
||||
"18.015151515151515151515152","1958"
|
||||
"183.630669546436285097192225","2008"
|
||||
"204.192013593882752761257434","2018"
|
||||
"277.083064516129032258064516","2019"
|
||||
"34.291666666666666666666667","1957"
|
||||
"341.934300993124522536287242","2021"
|
||||
"391.574585635359116022099448","2020"
|
||||
"40.868421052631578947368421","1964"
|
||||
"403.468056489576328177538668","2022"
|
||||
"43.819672131147540983606557","1966"
|
||||
"475.79951690821256038647343","2023"
|
||||
"48.964285714285714285714286","1965"
|
||||
"500.052801724137931034482759","2026"
|
||||
"593.916924664602683178534572","2024"
|
||||
"612.001367365542388331814038","2025"
|
||||
"64.955056179775280898876404","1982"
|
||||
"65.820833333333333333333333","1987"
|
||||
"67.295081967213114754098361","1963"
|
||||
"71.92","1969"
|
||||
"75.561224489795918367346939","1984"
|
||||
"77.122807017543859649122807","1977"
|
||||
"8.357142857142857142857143","1954"
|
||||
"80.690476190476190476190476","1970"
|
||||
"85.846153846153846153846154","1962"
|
||||
"87.706521739130434782608696","2001"
|
||||
"88.775362318840579710144928","1989"
|
||||
"91.280701754385964912280702","1971"
|
||||
"92.6796875","1999"
|
||||
"93.075980392156862745098039","2000"
|
||||
"93.593939393939393939393939","1986"
|
||||
"93.785714285714285714285714","1981"
|
||||
"93.853333333333333333333333","1983"
|
||||
"94.418524871355060034305317","2002"
|
||||
"94.88","1961"
|
||||
"96.4","1949"
|
||||
"96.913978494623655913978495","1979"
|
||||
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"execution_count": 14,
|
||||
"id": "ad994162",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -14,13 +14,12 @@
|
|||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"import random\n",
|
||||
"import re\n",
|
||||
"import pprint"
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 15,
|
||||
"id": "b5536e8c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -30,44 +29,55 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a5daea73",
|
||||
"execution_count": 16,
|
||||
"id": "6d109e8a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
|
||||
" writer = csv.writer(producerfile)\n",
|
||||
" writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
|
||||
" for producer in producers:\n",
|
||||
" with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
|
||||
" data = json.loads(sourcefile.read())\n",
|
||||
" for row in data[\"data\"]:\n",
|
||||
" _, id, _, name, rating, _, _, size, parts, year, _ = row\n",
|
||||
"id_to_name = dict()\n",
|
||||
"for producer in producers:\n",
|
||||
" with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n",
|
||||
" listings = json.load(prodfile)[\"data\"]\n",
|
||||
"\n",
|
||||
" writer.writerow([id, producer, name, size, parts, year])"
|
||||
" for listing in listings:\n",
|
||||
" name = listing[3] \n",
|
||||
" id = listing[1]\n",
|
||||
"\n",
|
||||
" id_to_name[id] = name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 17,
|
||||
"id": "ab997198",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# uvp preise bestimmen :(\n",
|
||||
"def get_all_ids() -> list[str]:\n",
|
||||
" df = pd.read_csv(\"./data/merlin/others.csv\")\n",
|
||||
" df = pd.read_csv(\"../data/merlin/others.csv\")\n",
|
||||
" return df[\"id\"].to_list()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 18,
|
||||
"id": "32b1fa46",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\\n for idx, id in enumerate(get_all_ids()[3663:]):\\n try:\\n small_id = id.lower()\\n\\n response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\\n soup = bs4.BeautifulSoup(response.text)\\n\\n # Prices\\n price_eur = soup.find(id=\"listprice_eur\")\\n price_usd = soup.find(id=\"listprice_usd\")\\n price_cn = soup.find(id=\"listprice_cn\")\\n bestprice_eur = soup.find(id=\"bestprice_eur\")\\n bestprice_usd = soup.find(id=\"bestprice_usd\")\\n bestprice_cn = soup.find(id=\"bestprice_cn\")\\n\\n all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\\n\\n #categories\\n other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\\n writer = csv.writer(pricefile)\\n\\n all_prices = [p.text if p != None else \"_\" for p in all_prices]\\n writer.writerow([id, *all_prices, *other_dump])\\n time.sleep(random.randint(2, 3))\\n except Exception as e:\\n print(e) '"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
|
||||
"\"\"\" with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
|
||||
" for idx, id in enumerate(get_all_ids()[3663:]):\n",
|
||||
" try:\n",
|
||||
" small_id = id.lower()\n",
|
||||
|
|
@ -93,12 +103,12 @@
|
|||
" writer.writerow([id, *all_prices, *other_dump])\n",
|
||||
" time.sleep(random.randint(2, 3))\n",
|
||||
" except Exception as e:\n",
|
||||
" print(e)"
|
||||
" print(e) \"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 19,
|
||||
"id": "4a10a1e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -118,7 +128,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 20,
|
||||
"id": "9c00f188",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -167,79 +177,82 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 176,
|
||||
"id": "ae53869e",
|
||||
"execution_count": 21,
|
||||
"id": "9b44a0e5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'Listenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)',\n",
|
||||
" 'DetailsVon:': 'BlueBrixx',\n",
|
||||
" 'EAN:': '4060904003671',\n",
|
||||
" 'Steine von:': 'Qunlong',\n",
|
||||
" 'Kategorie:': 'EisenbahnHersteller-',\n",
|
||||
" 'Kategorien:': 'BBSpecial, BRIX',\n",
|
||||
" 'Anleitung': 'Ohne Bauabschnitte',\n",
|
||||
" 'Bewertungen': 'Bewerten',\n",
|
||||
" 'Hersteller-Videos': 'video-1',\n",
|
||||
" 'Inhalt': '205 Teile',\n",
|
||||
" 'Gewicht': ': 190 g',\n",
|
||||
" 'Keine Aufkleber': '',\n",
|
||||
" 'Keine Drucke': '',\n",
|
||||
" 'Farbverteilung': '',\n",
|
||||
" 'TeilelistenBrickLink': 'XMLRebrickable CSVLEGO PaB CSVSetDB CSV',\n",
|
||||
" 'PreiseListenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"details = {\n",
|
||||
" \"id\" : [],\n",
|
||||
" \"listprice_eur\" : [],\n",
|
||||
" \"listprice_cn\" : [],\n",
|
||||
" \"listprice_usd\" : [],\n",
|
||||
" \"bestprice_eur\" : [],\n",
|
||||
" \"bestprice_cn\" : [],\n",
|
||||
" \"bestprice_usd\" : [],\n",
|
||||
" \"brand\" : [],\n",
|
||||
" \"ean\" : [],\n",
|
||||
" \"producer\" : [],\n",
|
||||
" \"release\" : [],\n",
|
||||
" \"scale\" : [],\n",
|
||||
" \"category\" : [],\n",
|
||||
" \"producer_category\" : [],\n",
|
||||
" \"num_parts\" : [],\n",
|
||||
" \"width\" : [],\n",
|
||||
" \"height\" : [],\n",
|
||||
" \"depth\" : [],\n",
|
||||
" \"designer\" : [],\n",
|
||||
" \"weight\" : [],\n",
|
||||
" \"age\" : []\n",
|
||||
"}\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"me_details = pd.DataFrame(details)\n",
|
||||
"\n",
|
||||
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n",
|
||||
" reader = csv.reader(price_file)\n",
|
||||
"\n",
|
||||
" # for row in reader:\n",
|
||||
" # id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n",
|
||||
" \n",
|
||||
" # me_details.loc[-1] = [id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd] + list(range(0, 12))\n",
|
||||
" # me_details.index = me_details.index + 1\n",
|
||||
" id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = [row for row in reader][random.randint(0, 4500)]\n",
|
||||
" other = filter(lambda s: not \"Wikipedia\" in s, other)\n",
|
||||
"\n",
|
||||
" pprint.pp(split_by_keywords(\"\".join(other), keywords))\n"
|
||||
"def rm_epsilon(l : list[str]) ->list[str]:\n",
|
||||
" return list(filter(lambda s : len(s) > 0, l))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "b83aa413",
|
||||
"execution_count": 56,
|
||||
"id": "ae53869e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"me_details = pd.DataFrame({\n",
|
||||
" \"id\" : [],\n",
|
||||
" \"name\" : [],\n",
|
||||
" \"price_eur\" : [],\n",
|
||||
" \"price_cn\" : [],\n",
|
||||
" \"price_us\" : [],\n",
|
||||
" \"brand\" : [],\n",
|
||||
" \"ean\" : [],\n",
|
||||
" \"producer\" : [],\n",
|
||||
" \"release\" : [],\n",
|
||||
" \"category\" : [],\n",
|
||||
" \"producer_category\" : [],\n",
|
||||
" \"num_parts\" : [],\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n",
|
||||
" reader = csv.reader(price_file)\n",
|
||||
"\n",
|
||||
" for row in reader:\n",
|
||||
" id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n",
|
||||
" other = filter(lambda s: not \"Wikipedia\" in s, other)\n",
|
||||
"\n",
|
||||
" retrieved = split_by_keywords(\"\".join(other), keywords)\n",
|
||||
"\n",
|
||||
" brand = retrieved.get(\"DetailsVon:\", \"\").replace(\" \", \"\")\n",
|
||||
" ean = retrieved.get(\"EAN:\", \"\")\n",
|
||||
" producer = retrieved.get(\"Steine von:\", \"\")\n",
|
||||
" age = retrieved.get(\"Altersempfehlung:\", \"\")\n",
|
||||
" release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n",
|
||||
" num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].replace(\"Ein Teil\", \"1\").replace(\"Preise\", \"\").replace(\"Mit Fernsteuerung / Elektrik\", \"1\").replace(\"Eine Minifigur\", \"1\").replace(\"Minifiguren\", \"\").strip()\n",
|
||||
"\n",
|
||||
" category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n",
|
||||
" categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n",
|
||||
" producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n",
|
||||
" producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n",
|
||||
"\n",
|
||||
" if brand == \"\":\n",
|
||||
" continue\n",
|
||||
" me_extra = pd.DataFrame({\n",
|
||||
" \"id\" : [id],\n",
|
||||
" \"name\" : [id_to_name.get(id, \"\")],\n",
|
||||
" \"price_eur\" : [lp_eur.replace(\"_\", \"\")],\n",
|
||||
" \"price_us\" : [lp_usd.replace(\"_\", \"\")],\n",
|
||||
" \"price_cn\" : [lp_cn.replace(\"_\", \"\")],\n",
|
||||
" \"brand\" : [brand],\n",
|
||||
" \"ean\" : [ean],\n",
|
||||
" \"producer\" : [producer],\n",
|
||||
" \"release\" : [release],\n",
|
||||
" \"category\" : [categories],\n",
|
||||
" \"producer_category\" : [producer_categories],\n",
|
||||
" \"num_parts\" : [num_parts],\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" me_details = pd.concat([me_details, me_extra])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "1b5bcea6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -264,49 +277,47 @@
|
|||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>listprice_eur</th>\n",
|
||||
" <th>listprice_cn</th>\n",
|
||||
" <th>listprice_usd</th>\n",
|
||||
" <th>bestprice_eur</th>\n",
|
||||
" <th>bestprice_cn</th>\n",
|
||||
" <th>bestprice_usd</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>price_eur</th>\n",
|
||||
" <th>price_cn</th>\n",
|
||||
" <th>price_us</th>\n",
|
||||
" <th>brand</th>\n",
|
||||
" <th>ean</th>\n",
|
||||
" <th>producer</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>scale</th>\n",
|
||||
" <th>release</th>\n",
|
||||
" <th>category</th>\n",
|
||||
" <th>producer_category</th>\n",
|
||||
" <th>num_parts</th>\n",
|
||||
" <th>width</th>\n",
|
||||
" <th>height</th>\n",
|
||||
" <th>depth</th>\n",
|
||||
" <th>designer</th>\n",
|
||||
" <th>weight</th>\n",
|
||||
" <th>age</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>0 rows × 21 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Empty DataFrame\n",
|
||||
"Columns: [id, listprice_eur, listprice_cn, listprice_usd, bestprice_eur, bestprice_cn, bestprice_usd, brand, ean, producer, release, scale, category, producer_category, num_parts, width, height, depth, designer, weight, age]\n",
|
||||
"Index: []\n",
|
||||
"\n",
|
||||
"[0 rows x 21 columns]"
|
||||
"Columns: [id, name, price_eur, price_cn, price_us, brand, ean, producer, release, category, producer_category, num_parts]\n",
|
||||
"Index: []"
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"me_details"
|
||||
"me_details[\"Mit Fernsteuerung / Elektrik\" == me_details[\"num_parts\"]]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "0fb65dec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"me_details.to_csv(\"../data/merlin/others.csv\", index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
|||
Loading…
Reference in New Issue