added bricklink + merlin
parent
c8b37e6a18
commit
16a3c3e480
|
|
@ -1,3 +1,3 @@
|
||||||
# kgr
|
# kgr
|
||||||
|
|
||||||
Vorlesung Knowledge Graphen (KGR) SS26
|
Vorlesung Knowledge Graphen (KGR) SS26
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,180 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ee5d9084",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Bricklink"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 83,
|
||||||
|
"id": "6a0f9ac8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import csv\n",
|
||||||
|
"import requests as rq\n",
|
||||||
|
"from bs4 import BeautifulSoup\n",
|
||||||
|
"from bs4.element import NavigableString"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c31288a3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
def process_page(page: int) -> None:
    """Extract the parts listed on one saved BrickLink catalog page into a CSV.

    Reads ``./data/bricklink/parts_{page}.html``, locates the catalog table
    inside the element with id ``ItemEditForm`` and appends one line per
    catalog row to ``data/bricklink/parts.csv`` with the columns
    ``part_id``, ``part_name`` and ``categories``.

    Args:
        page: Number of the saved page; selects the input file name.

    Raises:
        OSError: If the HTML page or the CSV file cannot be opened.
        AttributeError, IndexError: If the expected form/table structure is
            not present in the page (the lookups are not guarded).
    """
    table_classes = (
        "bg-color--white catalog-list__body-main "
        "catalog-list__body-main--alternate-row"
    )
    # Link texts that are dropped from the category list.
    skipped_links = ("Catalog", "Parts")

    with open(f"./data/bricklink/parts_{page}.html", mode="r", encoding="utf8") as pagefile:
        s = BeautifulSoup(pagefile.read().replace("\n", ""))

    form = s.find(id="ItemEditForm")
    table = form.find_all(name="table", class_=table_classes)[0].find(name="tbody")

    with open("data/bricklink/parts.csv", mode="a+", encoding="utf8", newline='') as parts_file:
        writer = csv.writer(parts_file)

        # The file is opened for appending and this function is called once
        # per page, so only emit the header when the file is still empty;
        # otherwise every page would add another header row in the data.
        parts_file.seek(0, 2)  # 2 == SEEK_END
        if parts_file.tell() == 0:
            writer.writerow(["part_id", "part_name", "categories"])

        # The first child of <tbody> is skipped, exactly as before.
        for row in list(table.children)[1:]:
            # Bare text nodes between rows have no cells to unpack; skipping
            # them avoids the "'NavigableString' object has no attribute
            # 'children'" message that was printed for each of them.
            if isinstance(row, NavigableString):
                continue
            try:
                # A row is expected to hold exactly three children:
                # image cell, item-number cell, description cell.
                _img, nr, description = list(row.children)

                part_id = nr.find(name="a").text
                part_name = description.find(name="strong").text

                links = description.find_all(name="a")
                categories = ", ".join(
                    link.text for link in links if link.text not in skipped_links
                )

                writer.writerow([part_id, part_name, categories])
            except Exception as e:
                # Best effort: report the offending row and keep going.
                print(str(row), e)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 63,
|
||||||
|
"id": "9b58d65d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
def process_page_minifigs(page: int) -> None:
    """Extract the minifigures listed on one saved BrickLink page into a CSV.

    Reads ``./data/bricklink/minifigs_{page}.html``, locates the catalog
    table inside the element with id ``ItemEditForm`` and appends one line
    per catalog row to ``data/bricklink/minifigs.csv`` with the columns
    ``minifig_id``, ``minifig_name`` and ``categories``.

    Args:
        page: Number of the saved page; selects the input file name.

    Raises:
        OSError: If the HTML page or the CSV file cannot be opened.
        AttributeError, IndexError: If the expected form/table structure is
            not present in the page (the lookups are not guarded).
    """
    table_classes = (
        "bg-color--white catalog-list__body-main "
        "catalog-list__body-main--alternate-row"
    )
    # Link texts that are dropped from the category list. "Parts" is kept
    # from the original filter. The minifigure entries are added because the
    # filter was copied unchanged from the parts variant.
    # NOTE(review): the exact breadcrumb label on minifig pages ("Minifigures"
    # vs. "Minifigs") is assumed here - confirm against a saved page.
    skipped_links = ("Catalog", "Parts", "Minifigures", "Minifigs")

    with open(f"./data/bricklink/minifigs_{page}.html", mode="r", encoding="utf8") as pagefile:
        s = BeautifulSoup(pagefile.read().replace("\n", ""))

    form = s.find(id="ItemEditForm")
    table = form.find_all(name="table", class_=table_classes)[0].find(name="tbody")

    with open("data/bricklink/minifigs.csv", mode="a+", encoding="utf8", newline='') as minifigs_file:
        writer = csv.writer(minifigs_file)

        # The file is opened for appending and this function is called once
        # per page, so only emit the header when the file is still empty;
        # otherwise every page would add another header row in the data.
        minifigs_file.seek(0, 2)  # 2 == SEEK_END
        if minifigs_file.tell() == 0:
            writer.writerow(["minifig_id", "minifig_name", "categories"])

        # The first child of <tbody> is skipped, exactly as before.
        for row in list(table.children)[1:]:
            # Bare text nodes between rows have no cells to unpack; skipping
            # them avoids the "'NavigableString' object has no attribute
            # 'children'" message that was printed for each of them.
            if isinstance(row, NavigableString):
                continue
            try:
                # A row is expected to hold exactly three children:
                # image cell, item-number cell, description cell.
                _img, nr, description = list(row.children)

                minifig_id = nr.find(name="a").text
                minifig_name = description.find(name="strong").text

                links = description.find_all(name="a")
                categories = ", ".join(
                    link.text for link in links if link.text not in skipped_links
                )

                writer.writerow([minifig_id, minifig_name, categories])
            except Exception as e:
                # Best effort: report the offending row and keep going.
                print(str(row), e)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 98,
|
||||||
|
"id": "0fc0394d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
def process_page_sets(page: int) -> None:
    """Extract the sets listed on one saved BrickLink catalog page into a CSV.

    Reads ``./data/bricklink/sets_{page}.html``, locates the catalog table
    inside the element with id ``ItemEditForm`` and appends one line per
    catalog row to ``data/bricklink/sets.csv`` with the columns ``set_id``,
    ``set_name``, ``categories``, ``parts``, ``minifigs`` and ``year``.

    The part count, minifigure count and year are taken from the direct text
    of the ``<font class="fv">`` element in the description cell, which is
    split on commas into fragments.

    Args:
        page: Number of the saved page; selects the input file name.

    Raises:
        OSError: If the HTML page or the CSV file cannot be opened.
        AttributeError, IndexError: If the expected form/table structure is
            not present in the page (the lookups are not guarded).
    """
    table_classes = (
        "bg-color--white catalog-list__body-main "
        "catalog-list__body-main--alternate-row"
    )
    # Link texts that are dropped from the category list.
    skipped_links = ("Catalog", "Sets")

    with open(f"./data/bricklink/sets_{page}.html", mode="r", encoding="utf8") as pagefile:
        s = BeautifulSoup(pagefile.read().replace("\n", ""))

    form = s.find(id="ItemEditForm")
    table = form.find_all(name="table", class_=table_classes)[0].find(name="tbody")

    with open("data/bricklink/sets.csv", mode="a+", encoding="utf8", newline='') as sets_file:
        writer = csv.writer(sets_file)

        # The file is opened for appending and this function is called once
        # per page, so only emit the header when the file is still empty;
        # otherwise every page would add another header row in the data.
        sets_file.seek(0, 2)  # 2 == SEEK_END
        if sets_file.tell() == 0:
            writer.writerow(["set_id", "set_name", "categories", "parts", "minifigs", "year"])

        # The first child of <tbody> is skipped, exactly as before.
        for row in list(table.children)[1:]:
            # Bare text nodes between rows have no cells to unpack; skipping
            # them avoids the "'NavigableString' object has no attribute
            # 'children'" message that was printed for each of them.
            if isinstance(row, NavigableString):
                continue
            try:
                # A row is expected to hold exactly three children:
                # image cell, item-number cell, description cell.
                _img, nr, description = list(row.children)

                set_id = nr.find(name="a").text
                set_name = description.find(name="strong").text

                links = description.find_all(name="a")

                font = description.find(name="font", class_="fv")
                # Only the direct text of the <font> element is used; text
                # inside nested tags (e.g. the links) is left out.
                infos = "".join(
                    child for child in font.contents if isinstance(child, NavigableString)
                ).strip("\n").split(",")

                extracted = {
                    "Part": "",
                    "Minifigure": "",
                    "Year": "",
                }

                for info in infos:
                    is_part = "Part" in info
                    is_minifig = "Minifigure" in info
                    # Strip before taking the first word: splitting on ","
                    # leaves a leading space on a fragment whenever the
                    # separator was ", ", and then split(" ")[0] would be "".
                    if is_part:
                        extracted["Part"] = info.strip().split(" ")[0]
                    if is_minifig:
                        extracted["Minifigure"] = info.strip().split(" ")[0]
                    # Only a fragment that is neither a part count nor a
                    # minifigure count is treated as the year. Previously the
                    # "else" was attached to the minifigure test alone, so a
                    # part-count fragment also overwrote the year.
                    if not (is_part or is_minifig):
                        extracted["Year"] = info.split(":")[0].strip()

                categories = ", ".join(
                    link.text for link in links if link.text not in skipped_links
                )

                writer.writerow([
                    set_id,
                    set_name,
                    categories,
                    extracted["Part"],
                    extracted["Minifigure"],
                    extracted["Year"],
                ])
            except Exception as e:
                # Best effort: report the offending row and keep going.
                print(str(row), e)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 99,
|
||||||
|
"id": "3fa694ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" 'NavigableString' object has no attribute 'children'\n",
|
||||||
|
" 'NavigableString' object has no attribute 'children'\n",
|
||||||
|
" 'NavigableString' object has no attribute 'children'\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
# Convert the saved set listing pages 1, 2 and 3, one after another.
for page_number in (1, 2, 3):
    process_page_sets(page_number)
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "venv (3.14.4)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.14.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|
@ -48,7 +48,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": null,
|
||||||
"id": "18a3fe83",
|
"id": "18a3fe83",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue