kgr/lego/lego_graph_merlin.ipynb

120 lines
3.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "ad994162",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import json\n",
"import requests as rq\n",
"import bs4\n",
"import pandas as pd\n",
"import time\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5536e8c",
"metadata": {},
"outputs": [],
"source": [
"# Producer keys; each one names a data/merlin/<producer>.json export that is read below.\n",
"producers = [\n",
"    \"bluebrixx\",\n",
"    \"cada\",\n",
"    \"cobi\",\n",
"    \"mouldking\",\n",
"    \"pantasy\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5daea73",
"metadata": {},
"outputs": [],
"source": [
"# Merge the per-producer JSON exports into one CSV (others.csv) with one row per record.\n",
"with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
"    writer = csv.writer(producerfile)\n",
"    writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
"    for producer in producers:\n",
"        # Parse straight from the file handle and close it before writing rows.\n",
"        with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
"            data = json.load(sourcefile)\n",
"\n",
"        for row in data[\"data\"]:\n",
"            # Every record must have exactly 11 positional fields (otherwise the\n",
"            # unpacking raises ValueError); only five of them are exported.\n",
"            # `set_id` is used instead of `id` so the builtin is not shadowed.\n",
"            _, set_id, _, name, _, _, _, size, parts, year, _ = row\n",
"            writer.writerow([set_id, producer, name, size, parts, year])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab997198",
"metadata": {},
"outputs": [],
"source": [
"# Determine the recommended retail (list) prices :(\n",
"def get_all_ids(path: str = \"./data/merlin/others.csv\") -> list[str]:\n",
"    \"\"\"Return the values of the ``id`` column of a CSV file as a list.\n",
"\n",
"    Args:\n",
"        path: CSV file to read; defaults to the merged producer file.\n",
"\n",
"    Returns:\n",
"        The ``id`` column in file order. The column is read as text so that\n",
"        purely numeric IDs are not converted to numbers (callers use string\n",
"        methods such as ``.lower()`` on the values).\n",
"        NOTE(review): empty ``id`` fields presumably still come back as NaN\n",
"        rather than ``str`` -- confirm before relying on the annotation.\n",
"\n",
"    Raises:\n",
"        KeyError: if the file has no ``id`` column.\n",
"    \"\"\"\n",
"    df = pd.read_csv(path, dtype={\"id\": str})\n",
"    return df[\"id\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32b1fa46",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): presumably a manual resume offset from an earlier, interrupted run\n",
"# (the output file is opened in append mode) -- confirm before re-running.\n",
"START_INDEX = 3663\n",
"# Seconds to wait for the server before giving up on a single request.\n",
"REQUEST_TIMEOUT_S = 30\n",
"\n",
"# Element ids of the price fields, in the column order written to prices.csv.\n",
"# The order (eur, cn, usd) is kept as-is so new rows line up with rows already in the file.\n",
"PRICE_ELEMENT_IDS = [\n",
"    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
"    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
"]\n",
"\n",
"# Append one row per set: id, the six price fields, then the text of every \"setpage_ct\" element.\n",
"with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
"    writer = csv.writer(pricefile)  # created once instead of once per request\n",
"    for set_id in get_all_ids()[START_INDEX:]:\n",
"        try:\n",
"            small_id = set_id.lower()\n",
"\n",
"            # Without a timeout a single stalled connection would block the whole run.\n",
"            response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\", timeout=REQUEST_TIMEOUT_S)\n",
"            # Name the parser explicitly so parsing does not depend on which parsers are installed.\n",
"            soup = bs4.BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
"            # Prices: a missing element is recorded as \"_\".\n",
"            price_tags = [soup.find(id=element_id) for element_id in PRICE_ELEMENT_IDS]\n",
"            all_prices = [tag.text if tag is not None else \"_\" for tag in price_tags]\n",
"\n",
"            # Categories\n",
"            other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n",
"\n",
"            writer.writerow([set_id, *all_prices, *other_dump])\n",
"        except Exception as e:\n",
"            # Best effort: report which set failed and continue with the next one.\n",
"            print(f\"{set_id}: {e!r}\")\n",
"        # Pause after every attempt, failed ones included, so errors do not turn into rapid-fire requests.\n",
"        time.sleep(random.randint(2, 3))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}