{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ee5d9084",
   "metadata": {},
   "source": [
    "## Bricklink\n",
    "\n",
    "Parse locally saved Bricklink catalog listing pages (`data/bricklink/<kind>_<page>.html`) and append the extracted rows to `data/bricklink/<kind>.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a0f9ac8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import requests as rq\n",
    "from bs4 import BeautifulSoup\n",
    "from bs4.element import NavigableString"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a41f7c3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSS classes identifying the catalog listing table inside the item form\n",
    "CATALOG_TABLE_CLASS = \"bg-color--white catalog-list__body-main catalog-list__body-main--alternate-row\"\n",
    "\n",
    "\n",
    "def _tag_children(node):\n",
    "    \"\"\"Return the direct children of `node`, leaving out bare text nodes.\"\"\"\n",
    "    return [child for child in node.children if not isinstance(child, NavigableString)]\n",
    "\n",
    "\n",
    "def _load_rows(html_path):\n",
    "    \"\"\"Parse a saved catalog page and return the rows of its listing table.\n",
    "\n",
    "    The first child of the table body is skipped, exactly as before. Bare text\n",
    "    nodes between rows are dropped because they have no cells to parse (they\n",
    "    used to surface as \"'NavigableString' object has no attribute 'children'\").\n",
    "    \"\"\"\n",
    "    with open(html_path, mode=\"r\", encoding=\"utf8\") as pagefile:\n",
    "        # NOTE(review): no parser is named, so BeautifulSoup chooses one itself;\n",
    "        # consider pinning it once the saved pages are confirmed to parse identically.\n",
    "        soup = BeautifulSoup(pagefile.read().replace(\"\\n\", \"\"))\n",
    "\n",
    "    form = soup.find(id=\"ItemEditForm\")\n",
    "    body = form.find_all(name=\"table\", class_=CATALOG_TABLE_CLASS)[0].find(name=\"tbody\")\n",
    "    return [row for row in list(body.children)[1:] if not isinstance(row, NavigableString)]\n",
    "\n",
    "\n",
    "def _parse_row(row, excluded_links):\n",
    "    \"\"\"Split one listing row into ``(item_id, item_name, categories, description_cell)``.\n",
    "\n",
    "    Raises ValueError when the row does not have exactly three cells and\n",
    "    AttributeError when an expected link or name element is missing.\n",
    "    \"\"\"\n",
    "    _image, number, description = _tag_children(row)\n",
    "    item_id = number.find(name=\"a\").text\n",
    "    item_name = description.find(name=\"strong\").text\n",
    "    # Every link in the description except the excluded ones is treated as a category.\n",
    "    categories = \", \".join(\n",
    "        link.text for link in description.find_all(name=\"a\") if link.text not in excluded_links\n",
    "    )\n",
    "    return item_id, item_name, categories, description\n",
    "\n",
    "\n",
    "def _parse_set_details(description):\n",
    "    \"\"\"Return ``[parts, minifigs, year]`` read from the ``font.fv`` element of a set row.\"\"\"\n",
    "    font = description.find(name=\"font\", class_=\"fv\")\n",
    "    text = \"\".join(child for child in font.contents if isinstance(child, NavigableString))\n",
    "\n",
    "    details = {\"Part\": \"\", \"Minifigure\": \"\", \"Year\": \"\"}\n",
    "    for info in text.strip(\"\\n\").split(\",\"):\n",
    "        # Fragments may carry surrounding whitespace (e.g. after a comma); strip it\n",
    "        # so the leading token is the count rather than an empty string.\n",
    "        info = info.strip()\n",
    "        if not info:\n",
    "            continue\n",
    "        if \"Part\" in info:\n",
    "            details[\"Part\"] = info.split(\" \")[0]\n",
    "        elif \"Minifigure\" in info:\n",
    "            # `elif`, not `if`: otherwise the `else` below also fires for the\n",
    "            # \"Part\" fragment and overwrites the year with the part count.\n",
    "            details[\"Minifigure\"] = info.split(\" \")[0]\n",
    "        else:\n",
    "            # Anything that is neither a part nor a minifigure count is taken as the year.\n",
    "            details[\"Year\"] = info.split(\":\")[0].strip()\n",
    "    return [details[\"Part\"], details[\"Minifigure\"], details[\"Year\"]]\n",
    "\n",
    "\n",
    "def _append_rows(csv_path, header, records):\n",
    "    \"\"\"Append `records` to `csv_path`, writing `header` only if the file is still empty.\"\"\"\n",
    "    with open(csv_path, mode=\"a+\", encoding=\"utf8\", newline=\"\") as csv_file:\n",
    "        writer = csv.writer(csv_file)\n",
    "        # Jump to the end of the file (whence=2); offset 0 there means nothing\n",
    "        # has been written yet, so the header is emitted once instead of per page.\n",
    "        csv_file.seek(0, 2)\n",
    "        if csv_file.tell() == 0:\n",
    "            writer.writerow(header)\n",
    "        writer.writerows(records)\n",
    "\n",
    "\n",
    "def _process_catalog_page(kind, page, header, excluded_links, extra_columns=None):\n",
    "    \"\"\"Convert ``data/bricklink/{kind}_{page}.html`` into rows of ``data/bricklink/{kind}.csv``.\n",
    "\n",
    "    Rows that cannot be parsed are printed together with the error and skipped.\n",
    "    `extra_columns`, when given, maps the description cell to additional column values.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    for row in _load_rows(f\"./data/bricklink/{kind}_{page}.html\"):\n",
    "        try:\n",
    "            item_id, item_name, categories, description = _parse_row(row, excluded_links)\n",
    "            extras = extra_columns(description) if extra_columns else []\n",
    "            records.append([item_id, item_name, categories, *extras])\n",
    "        except Exception as e:\n",
    "            # Best effort: report the offending row and keep going.\n",
    "            print(str(row), e)\n",
    "    _append_rows(f\"data/bricklink/{kind}.csv\", header, records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c31288a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_page(page: int):\n",
    "    \"\"\"Append the parts listed on saved page `page` to ``data/bricklink/parts.csv``.\n",
    "\n",
    "    Reads ``./data/bricklink/parts_{page}.html``; the CSV header is written only\n",
    "    when the CSV file is still empty.\n",
    "    \"\"\"\n",
    "    _process_catalog_page(\n",
    "        \"parts\",\n",
    "        page,\n",
    "        header=[\"part_id\", \"part_name\", \"categories\"],\n",
    "        excluded_links={\"Catalog\", \"Parts\"},\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b58d65d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_page_minifigs(page: int):\n",
    "    \"\"\"Append the minifigs listed on saved page `page` to ``data/bricklink/minifigs.csv``.\n",
    "\n",
    "    Reads ``./data/bricklink/minifigs_{page}.html``; the CSV header is written only\n",
    "    when the CSV file is still empty.\n",
    "    \"\"\"\n",
    "    _process_catalog_page(\n",
    "        \"minifigs\",\n",
    "        page,\n",
    "        header=[\"minifig_id\", \"minifig_name\", \"categories\"],\n",
    "        # \"Parts\" is kept from the earlier version of this function.\n",
    "        # NOTE(review): presumably the section link on minifig pages reads\n",
    "        # \"Minifigs\" or \"Minifigures\" - verify against the saved HTML.\n",
    "        excluded_links={\"Catalog\", \"Parts\", \"Minifigs\", \"Minifigures\"},\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fc0394d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_page_sets(page: int):\n",
    "    \"\"\"Append the sets listed on saved page `page` to ``data/bricklink/sets.csv``.\n",
    "\n",
    "    Reads ``./data/bricklink/sets_{page}.html``. Besides id, name and categories,\n",
    "    each row carries the part count, minifigure count and year taken from the\n",
    "    row's ``font.fv`` element. The CSV header is written only when the CSV file\n",
    "    is still empty.\n",
    "    \"\"\"\n",
    "    _process_catalog_page(\n",
    "        \"sets\",\n",
    "        page,\n",
    "        header=[\"set_id\", \"set_name\", \"categories\", \"parts\", \"minifigs\", \"year\"],\n",
    "        excluded_links={\"Catalog\", \"Sets\"},\n",
    "        extra_columns=_parse_set_details,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fa694ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in [1,2,3]:\n",
    "    process_page_sets(i)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv (3.14.4)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}