added bricklink + merlin

pull/1/head
Roman Schöne 2026-04-24 13:45:24 +02:00
parent c8b37e6a18
commit 16a3c3e480
11 changed files with 137875 additions and 2 deletions

View File

@ -1,3 +1,3 @@
# kgr # kgr
Vorlesung Knowledge Graphen (KGR) SS26 Vorlesung Knowledge Graphen (KGR) SS26

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ee5d9084",
"metadata": {},
"source": [
"## Bricklink"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "6a0f9ac8",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import requests as rq\n",
"from bs4 import BeautifulSoup\n",
"from bs4.element import NavigableString"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c31288a3",
"metadata": {},
"outputs": [],
"source": [
"def process_page(page: int) -> None:\n",
"    \"\"\"Append the parts listed on one saved catalog page to ``parts.csv``.\n",
"\n",
"    Reads ``./data/bricklink/parts_{page}.html``, looks up the catalog table\n",
"    inside the element with id ``ItemEditForm`` and writes one CSV row\n",
"    (``part_id``, ``part_name``, ``categories``) per table row to\n",
"    ``data/bricklink/parts.csv``.\n",
"\n",
"    The CSV is opened in append mode so that several pages accumulate in one\n",
"    file; the header row is written only while the file is still empty.\n",
"    Calling the function again for the same page appends its rows again.\n",
"\n",
"    Rows that cannot be parsed are printed together with the error and skipped.\n",
"\n",
"    Args:\n",
"        page: Number used in the file name of the saved HTML page.\n",
"\n",
"    Raises:\n",
"        OSError: If the HTML page cannot be read or the CSV cannot be opened.\n",
"        AttributeError: If the form or the table body is missing from the page.\n",
"        IndexError: If the form contains no table with the expected classes.\n",
"    \"\"\"\n",
"    with open(f\"./data/bricklink/parts_{page}.html\", mode=\"r\", encoding=\"utf8\") as pagefile:\n",
"        # NOTE(review): no parser is passed, so bs4 picks one from what is installed.\n",
"        s = BeautifulSoup(pagefile.read().replace(\"\\n\", \"\"))\n",
"\n",
"    form = s.find(id=\"ItemEditForm\")\n",
"    table = form.find_all(name=\"table\", class_=\"bg-color--white catalog-list__body-main catalog-list__body-main--alternate-row\")[0].find(name=\"tbody\")\n",
"    with open(\"data/bricklink/parts.csv\", mode=\"a+\", encoding=\"utf8\", newline='') as parts_file:\n",
"        writer = csv.writer(parts_file)\n",
"        # Emit the header once per file instead of once per processed page.\n",
"        parts_file.seek(0, 2)  # whence=2: position at the end of the file\n",
"        if parts_file.tell() == 0:\n",
"            writer.writerow([\"part_id\", \"part_name\", \"categories\"])\n",
"        # The first child of <tbody> is skipped (presumably the table header row).\n",
"        for row in list(table.children)[1:]:\n",
"            # Whitespace-only text nodes between rows are not rows; skip them quietly.\n",
"            if isinstance(row, NavigableString) and not row.strip():\n",
"                continue\n",
"            try:\n",
"                # A row must hold exactly three cells: image, number, description.\n",
"                _img, nr, description = list(row.children)\n",
"\n",
"                part_id = nr.find(name=\"a\").text\n",
"                part_name = description.find(name=\"strong\").text\n",
"\n",
"                links = description.find_all(name=\"a\")\n",
"\n",
"                # Drop the breadcrumb links \"Catalog\" and \"Parts\"; the rest are categories.\n",
"                categories = \", \".join([link.text for link in links if link.text != \"Catalog\" and link.text != \"Parts\"])\n",
"\n",
"                writer.writerow([part_id, part_name, categories])\n",
"            except Exception as e:\n",
"                # Best effort: report the offending row and continue with the next one.\n",
"                print(str(row), e)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "9b58d65d",
"metadata": {},
"outputs": [],
"source": [
"def process_page_minifigs(page: int) -> None:\n",
"    \"\"\"Append the minifigures listed on one saved catalog page to ``minifigs.csv``.\n",
"\n",
"    Reads ``./data/bricklink/minifigs_{page}.html``, looks up the catalog table\n",
"    inside the element with id ``ItemEditForm`` and writes one CSV row\n",
"    (``minifig_id``, ``minifig_name``, ``categories``) per table row to\n",
"    ``data/bricklink/minifigs.csv``.\n",
"\n",
"    The CSV is opened in append mode so that several pages accumulate in one\n",
"    file; the header row is written only while the file is still empty.\n",
"    Calling the function again for the same page appends its rows again.\n",
"\n",
"    Rows that cannot be parsed are printed together with the error and skipped.\n",
"\n",
"    Args:\n",
"        page: Number used in the file name of the saved HTML page.\n",
"\n",
"    Raises:\n",
"        OSError: If the HTML page cannot be read or the CSV cannot be opened.\n",
"        AttributeError: If the form or the table body is missing from the page.\n",
"        IndexError: If the form contains no table with the expected classes.\n",
"    \"\"\"\n",
"    # NOTE(review): only \"Catalog\" and \"Parts\" were excluded before; the section\n",
"    # link on minifigure pages is presumably \"Minifigures\" or \"Minifigs\" - confirm\n",
"    # against the saved HTML. \"Parts\" stays excluded so existing output is unchanged.\n",
"    breadcrumb_labels = {\"Catalog\", \"Parts\", \"Minifigures\", \"Minifigs\"}\n",
"\n",
"    with open(f\"./data/bricklink/minifigs_{page}.html\", mode=\"r\", encoding=\"utf8\") as pagefile:\n",
"        # NOTE(review): no parser is passed, so bs4 picks one from what is installed.\n",
"        s = BeautifulSoup(pagefile.read().replace(\"\\n\", \"\"))\n",
"\n",
"    form = s.find(id=\"ItemEditForm\")\n",
"    table = form.find_all(name=\"table\", class_=\"bg-color--white catalog-list__body-main catalog-list__body-main--alternate-row\")[0].find(name=\"tbody\")\n",
"    with open(\"data/bricklink/minifigs.csv\", mode=\"a+\", encoding=\"utf8\", newline='') as minifigs_file:\n",
"        writer = csv.writer(minifigs_file)\n",
"        # Emit the header once per file instead of once per processed page.\n",
"        minifigs_file.seek(0, 2)  # whence=2: position at the end of the file\n",
"        if minifigs_file.tell() == 0:\n",
"            writer.writerow([\"minifig_id\", \"minifig_name\", \"categories\"])\n",
"        # The first child of <tbody> is skipped (presumably the table header row).\n",
"        for row in list(table.children)[1:]:\n",
"            # Whitespace-only text nodes between rows are not rows; skip them quietly.\n",
"            if isinstance(row, NavigableString) and not row.strip():\n",
"                continue\n",
"            try:\n",
"                # A row must hold exactly three cells: image, number, description.\n",
"                _img, nr, description = list(row.children)\n",
"\n",
"                minifig_id = nr.find(name=\"a\").text\n",
"                minifig_name = description.find(name=\"strong\").text\n",
"\n",
"                links = description.find_all(name=\"a\")\n",
"\n",
"                # Everything that is not a breadcrumb label counts as a category.\n",
"                categories = \", \".join([link.text for link in links if link.text not in breadcrumb_labels])\n",
"\n",
"                writer.writerow([minifig_id, minifig_name, categories])\n",
"            except Exception as e:\n",
"                # Best effort: report the offending row and continue with the next one.\n",
"                print(str(row), e)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "0fc0394d",
"metadata": {},
"outputs": [],
"source": [
"def process_page_sets(page: int) -> None:\n",
"    \"\"\"Append the sets listed on one saved catalog page to ``sets.csv``.\n",
"\n",
"    Reads ``./data/bricklink/sets_{page}.html``, looks up the catalog table\n",
"    inside the element with id ``ItemEditForm`` and writes one CSV row\n",
"    (``set_id``, ``set_name``, ``categories``, ``parts``, ``minifigs``,\n",
"    ``year``) per table row to ``data/bricklink/sets.csv``.\n",
"\n",
"    The part count, minifigure count and year are taken from the bare text of\n",
"    the ``<font class=\"fv\">`` element, split on commas: a fragment containing\n",
"    \"Part\" or \"Minifigure\" contributes its first whitespace-separated token,\n",
"    any other non-empty fragment is treated as the year (text before the\n",
"    first \":\").\n",
"\n",
"    The CSV is opened in append mode so that several pages accumulate in one\n",
"    file; the header row is written only while the file is still empty.\n",
"    Calling the function again for the same page appends its rows again.\n",
"\n",
"    Rows that cannot be parsed are printed together with the error and skipped.\n",
"\n",
"    Args:\n",
"        page: Number used in the file name of the saved HTML page.\n",
"\n",
"    Raises:\n",
"        OSError: If the HTML page cannot be read or the CSV cannot be opened.\n",
"        AttributeError: If the form or the table body is missing from the page.\n",
"        IndexError: If the form contains no table with the expected classes.\n",
"    \"\"\"\n",
"    with open(f\"./data/bricklink/sets_{page}.html\", mode=\"r\", encoding=\"utf8\") as pagefile:\n",
"        # NOTE(review): no parser is passed, so bs4 picks one from what is installed.\n",
"        s = BeautifulSoup(pagefile.read().replace(\"\\n\", \"\"))\n",
"\n",
"    form = s.find(id=\"ItemEditForm\")\n",
"    table = form.find_all(name=\"table\", class_=\"bg-color--white catalog-list__body-main catalog-list__body-main--alternate-row\")[0].find(name=\"tbody\")\n",
"    with open(\"data/bricklink/sets.csv\", mode=\"a+\", encoding=\"utf8\", newline='') as sets_file:\n",
"        writer = csv.writer(sets_file)\n",
"        # Emit the header once per file instead of once per processed page.\n",
"        sets_file.seek(0, 2)  # whence=2: position at the end of the file\n",
"        if sets_file.tell() == 0:\n",
"            writer.writerow([\"set_id\", \"set_name\", \"categories\", \"parts\", \"minifigs\", \"year\"])\n",
"        # The first child of <tbody> is skipped (presumably the table header row).\n",
"        for row in list(table.children)[1:]:\n",
"            # Whitespace-only text nodes between rows are not rows; skip them quietly.\n",
"            if isinstance(row, NavigableString) and not row.strip():\n",
"                continue\n",
"            try:\n",
"                # A row must hold exactly three cells: image, number, description.\n",
"                _img, nr, description = list(row.children)\n",
"\n",
"                set_id = nr.find(name=\"a\").text\n",
"                set_name = description.find(name=\"strong\").text\n",
"\n",
"                links = description.find_all(name=\"a\")\n",
"\n",
"                font = description.find(name=\"font\", class_=\"fv\")\n",
"                # Only the bare text of the element is used; nested tags are ignored.\n",
"                infos = \"\".join(child for child in font.contents if isinstance(child, NavigableString)).strip(\"\\n\").split(\",\")\n",
"\n",
"                extracted = {\n",
"                    \"Part\": \"\",\n",
"                    \"Minifigure\": \"\",\n",
"                    \"Year\": \"\"\n",
"                }\n",
"\n",
"                for info in infos:\n",
"                    fragment = info.strip()\n",
"                    if not fragment:\n",
"                        # An empty fragment must not overwrite a year found earlier.\n",
"                        continue\n",
"                    # elif chain: a parts fragment must not also fall through to the year.\n",
"                    if \"Part\" in fragment:\n",
"                        # split() without arguments ignores leading whitespace.\n",
"                        extracted[\"Part\"] = fragment.split()[0]\n",
"                    elif \"Minifigure\" in fragment:\n",
"                        extracted[\"Minifigure\"] = fragment.split()[0]\n",
"                    else:\n",
"                        extracted[\"Year\"] = fragment.split(\":\")[0].strip()\n",
"\n",
"                # Drop the breadcrumb links \"Catalog\" and \"Sets\"; the rest are categories.\n",
"                categories = \", \".join([link.text for link in links if link.text != \"Catalog\" and link.text != \"Sets\"])\n",
"\n",
"                writer.writerow([set_id, set_name, categories, extracted[\"Part\"], extracted[\"Minifigure\"], extracted[\"Year\"]])\n",
"            except Exception as e:\n",
"                # Best effort: report the offending row and continue with the next one.\n",
"                print(str(row), e)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "3fa694ef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 'NavigableString' object has no attribute 'children'\n",
" 'NavigableString' object has no attribute 'children'\n",
" 'NavigableString' object has no attribute 'children'\n"
]
}
],
"source": [
"# Run the set extraction for pages 1, 2 and 3, in that order.\n",
"for page_number in (1, 2, 3):\n",
"    process_page_sets(page_number)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": null,
"id": "18a3fe83", "id": "18a3fe83",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],