kgr/lego/util/lego_util_merlin.ipynb

535 lines
18 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ad994162",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import json\n",
"import requests as rq\n",
"import bs4\n",
"import pandas as pd\n",
"import time\n",
"import random\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5536e8c",
"metadata": {},
"outputs": [],
"source": [
"# Producer keys; each one names a JSON dump at ../data/merlin/<producer>.json.\n",
"producers = [\n",
"    \"bluebrixx\",\n",
"    \"cada\",\n",
"    \"cobi\",\n",
"    \"mouldking\",\n",
"    \"pantasy\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "6d109e8a",
"metadata": {},
"outputs": [],
"source": [
"# Build a lookup from set id to set name across all producer dumps.\n",
"# Listings are positional records: index 1 is used as the set id and\n",
"# index 3 as the set name.\n",
"LISTING_ID_IDX = 1\n",
"LISTING_NAME_IDX = 3\n",
"\n",
"id_to_name = {}\n",
"for producer in producers:\n",
"    with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n",
"        listings = json.load(prodfile)[\"data\"]\n",
"\n",
"    # A comprehension keeps the notebook namespace free of a global named\n",
"    # `id`, which would shadow the builtin id() for every later cell.\n",
"    # If the same id appears in several dumps, the last one read wins.\n",
"    id_to_name.update(\n",
"        {listing[LISTING_ID_IDX]: listing[LISTING_NAME_IDX] for listing in listings}\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ab997198",
"metadata": {},
"outputs": [],
"source": [
"# Determine the RRP (list) prices :(\n",
"def get_all_ids(path: str = \"../data/merlin/others.csv\") -> list[str]:\n",
"    \"\"\"Return the ``id`` column of a set-details CSV, in file order.\n",
"\n",
"    Args:\n",
"        path: CSV file to read; it must have an ``id`` column. Defaults to\n",
"            the ``others.csv`` file used throughout this notebook.\n",
"\n",
"    Returns:\n",
"        The ids as strings.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: if ``path`` does not exist.\n",
"        KeyError: if the file has no ``id`` column.\n",
"    \"\"\"\n",
"    # dtype=str keeps purely numeric ids (and their leading zeros) as text;\n",
"    # without it pandas infers integers and callers that do id.lower() fail.\n",
"    details = pd.read_csv(path, dtype={\"id\": str})\n",
"    return details[\"id\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "32b1fa46",
"metadata": {},
"outputs": [],
"source": [
"# Scrape the set page of every known id and append one row per set to\n",
"# prices.csv. Row layout: id, list price eur/cn/usd, best price eur/cn/usd,\n",
"# then one field per \"setpage_ct\" block. A missing price element is\n",
"# written as \"_\".\n",
"START_INDEX = 3663  # offset into get_all_ids(); presumably where an earlier crawl stopped\n",
"REQUEST_TIMEOUT_S = 30  # without a timeout a stalled connection blocks the crawl forever\n",
"PRICE_ELEMENT_IDS = [\n",
"    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
"    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
"]\n",
"\n",
"with open(\"../data/merlin/prices.csv\", mode=\"a\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
"    writer = csv.writer(pricefile)  # one writer for the whole crawl\n",
"\n",
"    for set_id in get_all_ids()[START_INDEX:]:\n",
"        try:\n",
"            response = rq.get(\n",
"                f\"https://www.merlinssteine.de/sets/{set_id.lower()}\",\n",
"                timeout=REQUEST_TIMEOUT_S,\n",
"            )\n",
"            # Error pages (404, 429, 5xx) must not be recorded as set data.\n",
"            response.raise_for_status()\n",
"            # Explicit parser: the result no longer depends on which optional\n",
"            # parsers happen to be installed.\n",
"            soup = bs4.BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
"            # Prices\n",
"            price_tags = [soup.find(id=element_id) for element_id in PRICE_ELEMENT_IDS]\n",
"            all_prices = [tag.text if tag is not None else \"_\" for tag in price_tags]\n",
"\n",
"            # Categories and other free-text detail blocks, flattened to one line each\n",
"            other_dump = [\n",
"                description.text.replace(\"\\n\", \"\")\n",
"                for description in soup.find_all(class_=\"setpage_ct\")\n",
"            ]\n",
"\n",
"            writer.writerow([set_id, *all_prices, *other_dump])\n",
"        except Exception as e:\n",
"            # Best effort: keep crawling, but say which id failed.\n",
"            print(f\"{set_id}: {e!r}\")\n",
"        finally:\n",
"            # Pause after failures too, so errors do not turn into rapid-fire requests.\n",
"            time.sleep(random.randint(2, 3))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4a10a1e3",
"metadata": {},
"outputs": [],
"source": [
"def split_by_keywords(text, keywords):\n",
"    \"\"\"Split ``text`` into ``{keyword: following text}`` pairs.\n",
"\n",
"    Each keyword occurrence starts a field whose value is the text up to\n",
"    the next keyword (or the end of ``text``), stripped of surrounding\n",
"    whitespace. Text before the first keyword is discarded. When a keyword\n",
"    occurs more than once, the last occurrence wins.\n",
"\n",
"    Args:\n",
"        text: the string to split.\n",
"        keywords: iterable of literal (non-regex) keyword strings.\n",
"\n",
"    Returns:\n",
"        dict mapping every keyword found in ``text`` to its value; empty if\n",
"        no keyword occurs or no usable keyword was given.\n",
"    \"\"\"\n",
"    # Longest keyword first: regex alternation takes the first alternative\n",
"    # that matches at a position, so \"Reviews\" listed ahead of\n",
"    # \"ReviewsCommunity\" would otherwise always shadow the longer keyword.\n",
"    # Empty keywords are dropped because they match at every position.\n",
"    ordered = sorted(filter(None, keywords), key=len, reverse=True)\n",
"    if not ordered:\n",
"        return {}\n",
"\n",
"    pattern = \"(\" + \"|\".join(map(re.escape, ordered)) + \")\"\n",
"    parts = re.split(pattern, text)\n",
"\n",
"    # With one capture group, parts is [prefix, kw1, value1, kw2, value2, ...]\n",
"    return {parts[i]: parts[i + 1].strip() for i in range(1, len(parts), 2)}"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9c00f188",
"metadata": {},
"outputs": [],
"source": [
"# Field labels that split_by_keywords() uses to cut the scraped detail text\n",
"# into fields.\n",
"keywords = [\n",
"    \"Listenpreis:\",\n",
"    \"DetailsVon:\",\n",
"    \"EAN:\",\n",
"    \"Altersempfehlung:\",\n",
"    \"Steine von:\",\n",
"    \"Bestpreis:EU:\",\n",
"    \"Bewertungen\",\n",
"    \"Inhalt\",\n",
"    \"PreiseListenpreis:\",\n",
"    \"Hersteller-Kategorie:\",\n",
"    # The plural label is looked up when the details table is built. Without\n",
"    # this entry it is matched as plain \"Kategorien:\" and producer categories\n",
"    # end up in the category column.\n",
"    \"Hersteller-Kategorien:\",\n",
"    \"Designer:\",\n",
"    \"Maße:\",\n",
"    \"Release:\",\n",
"    \"Kategorien:\",\n",
"    \"Hersteller-Videos\",\n",
"    \"EU:\",\n",
"    \"Anleitung\",\n",
"    \"BewertungenCommunity:\",\n",
"    \"Maßstab:\",\n",
"    \"Erweiterung zu:\",\n",
"    \"Reviews\",\n",
"    \"Lizenz:\",\n",
"    \"Farbverteilung\",\n",
"    \"TeilelistenBrickLink\",\n",
"    \"Bild:\",\n",
"    \"ReviewsCommunity\",\n",
"    \"Gewicht\",\n",
"    \"Keine Aufkleber\",\n",
"    \"Verpackungsmaße:\",\n",
"    \"EU:Brickmo\",\n",
"    \"Datenbanken:\",\n",
"    \"Kategorie:\",\n",
"    \"Keine Drucke\",\n",
"    \"TechnikMOC:\",\n",
"    \"Steingröße:\",\n",
"    \"SonstigesMOC:\",\n",
"    \"Variationen:\",\n",
"    \"RebrickableVariation:\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "9b44a0e5",
"metadata": {},
"outputs": [],
"source": [
"def rm_epsilon(l: list[str]) -> list[str]:\n",
"    \"\"\"Return the entries of ``l`` whose length is non-zero, in their original order.\"\"\"\n",
"    return [entry for entry in l if len(entry) > 0]"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "ae53869e",
"metadata": {},
"outputs": [],
"source": [
"# Parse every scraped row of prices.csv into one record and build the\n",
"# me_details table from all records in a single step.\n",
"ME_COLUMNS = [\n",
"    \"id\", \"name\", \"price_eur\", \"price_cn\", \"price_us\", \"brand\", \"ean\",\n",
"    \"producer\", \"release\", \"category\", \"producer_category\", \"num_parts\",\n",
"]\n",
"N_FIXED_FIELDS = 7  # id + three list prices + three best prices\n",
"\n",
"\n",
"def _parse_price_row(row):\n",
"    \"\"\"Build one me_details record (dict keyed by ME_COLUMNS) from a prices.csv row.\n",
"\n",
"    ``row`` must hold at least N_FIXED_FIELDS fields; everything after them\n",
"    is free text that is joined and cut into fields with split_by_keywords().\n",
"    \"\"\"\n",
"    set_id, lp_eur, lp_cn, lp_usd, _bp_eur, _bp_cn, _bp_usd, *other = row\n",
"    # Blocks mentioning \"Wikipedia\" are left out before the keyword split.\n",
"    detail_text = \"\".join(block for block in other if \"Wikipedia\" not in block)\n",
"    retrieved = split_by_keywords(detail_text, keywords)\n",
"\n",
"    # The singular and the plural label are both looked up and merged.\n",
"    category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n",
"    categories = \",\".join(\n",
"        rm_epsilon(retrieved.get(\"Kategorien:\", \"\").split(\",\") + category)\n",
"    ).replace(\"Hersteller\", \"\")\n",
"    producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n",
"    producer_categories = \",\".join(\n",
"        rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category)\n",
"    )\n",
"\n",
"    return {\n",
"        \"id\": set_id,\n",
"        \"name\": id_to_name.get(set_id, \"\"),\n",
"        \"price_eur\": lp_eur,\n",
"        \"price_cn\": lp_cn,\n",
"        \"price_us\": lp_usd,\n",
"        \"brand\": retrieved.get(\"DetailsVon:\", \"\"),\n",
"        \"ean\": retrieved.get(\"EAN:\", \"\"),\n",
"        \"producer\": retrieved.get(\"Steine von:\", \"\"),\n",
"        # Last space-separated token of the release text\n",
"        \"release\": retrieved.get(\"Release:\", \"\").split(\" \")[-1],\n",
"        \"category\": categories,\n",
"        \"producer_category\": producer_categories,\n",
"        # Text in front of the word \"Teile\"\n",
"        \"num_parts\": retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].strip(),\n",
"    }\n",
"\n",
"\n",
"# newline=\"\" lets the csv module do its own newline handling.\n",
"# NOTE(review): prices.csv is appended to on every crawl, so an id can occur\n",
"# more than once; such rows are kept as they are.\n",
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\", newline=\"\") as price_file:\n",
"    records = [\n",
"        _parse_price_row(row)\n",
"        for row in csv.reader(price_file)\n",
"        if len(row) >= N_FIXED_FIELDS  # blank or truncated lines cannot be unpacked\n",
"    ]\n",
"\n",
"# One construction instead of a concat per row: linear time, a clean 0..n-1\n",
"# index, and no empty seed frame being concatenated with string rows.\n",
"me_details = pd.DataFrame(records, columns=ME_COLUMNS)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "1b5bcea6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>price_eur</th>\n",
" <th>price_cn</th>\n",
" <th>price_us</th>\n",
" <th>brand</th>\n",
" <th>ean</th>\n",
" <th>producer</th>\n",
" <th>release</th>\n",
" <th>category</th>\n",
" <th>producer_category</th>\n",
" <th>num_parts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108899</td>\n",
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
" <td>99.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904014783</td>\n",
" <td></td>\n",
" <td>2026</td>\n",
" <td>BBPlay, The Three Investigators</td>\n",
" <td></td>\n",
" <td>1393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108899</td>\n",
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
" <td>99.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904014783</td>\n",
" <td></td>\n",
" <td>2026</td>\n",
" <td>BBPlay, The Three Investigators</td>\n",
" <td></td>\n",
" <td>1393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108569</td>\n",
" <td>Fledermaus</td>\n",
" <td>29.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904023020</td>\n",
" <td>Xingbao</td>\n",
" <td>2026</td>\n",
" <td>Tiere</td>\n",
" <td>BBPro</td>\n",
" <td>579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-109262</td>\n",
" <td>1970er Sport Cabriolet schwarz</td>\n",
" <td>49.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td></td>\n",
" <td>Qunlong</td>\n",
" <td>2026</td>\n",
" <td>Autos, Fahrzeuge</td>\n",
" <td>BBSpecial</td>\n",
" <td>1291</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-109021</td>\n",
" <td>Mittelalterliche Steinbrücke</td>\n",
" <td>59.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904022184</td>\n",
" <td>Qunlong</td>\n",
" <td>2026</td>\n",
" <td>Geschichte, Mittelalter</td>\n",
" <td>BBSpecial</td>\n",
" <td>1654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86219</td>\n",
" <td>My Own Swordsman™ Tavern Gate 武林外传</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2023</td>\n",
" <td>China, Gebäude, Popkultur</td>\n",
" <td>My Own Swordsman</td>\n",
" <td>422</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86220</td>\n",
" <td>My Own Swordsman™ Tong Fu Inn 武林外传</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2023</td>\n",
" <td>China, Gebäude, Popkultur</td>\n",
" <td>My Own Swordsman</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-61008</td>\n",
" <td>Retro 1960s Television</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td>6973817320354</td>\n",
" <td></td>\n",
" <td>2022</td>\n",
" <td>Gegenstände</td>\n",
" <td>Retro Collection</td>\n",
" <td>1173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-15007</td>\n",
" <td>Pink Rose</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td>GoBricks</td>\n",
" <td>2024</td>\n",
" <td>Blumen, Pflanzen</td>\n",
" <td>Botanical World</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86218</td>\n",
" <td>Sherlock Holmes™ 221B Baker Street</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td>6973817320156</td>\n",
" <td></td>\n",
" <td>2022</td>\n",
" <td>Popkultur</td>\n",
" <td>Sherlock Holmes</td>\n",
" <td>1088</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4509 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" id name price_eur \\\n",
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
"0 BB-108569 Fledermaus 29.95 \n",
"0 BB-109262 1970er Sport Cabriolet schwarz 49.95 \n",
"0 BB-109021 Mittelalterliche Steinbrücke 59.95 \n",
".. ... ... ... \n",
"0 PANT-86219 My Own Swordsman™ Tavern Gate 武林外传 _ \n",
"0 PANT-86220 My Own Swordsman™ Tong Fu Inn 武林外传 _ \n",
"0 PANT-61008 Retro 1960s Television _ \n",
"0 PANT-15007 Pink Rose _ \n",
"0 PANT-86218 Sherlock Holmes™ 221B Baker Street _ \n",
"\n",
" price_cn price_us brand ean producer release \\\n",
"0 _ _ BlueBrixx 4060904014783 2026 \n",
"0 _ _ BlueBrixx 4060904014783 2026 \n",
"0 _ _ BlueBrixx 4060904023020 Xingbao 2026 \n",
"0 _ _ BlueBrixx Qunlong 2026 \n",
"0 _ _ BlueBrixx 4060904022184 Qunlong 2026 \n",
".. ... ... ... ... ... ... \n",
"0 _ _ Pantasy 2023 \n",
"0 _ _ Pantasy 2023 \n",
"0 _ _ Pantasy 6973817320354 2022 \n",
"0 _ _ Pantasy GoBricks 2024 \n",
"0 _ _ Pantasy 6973817320156 2022 \n",
"\n",
" category producer_category num_parts \n",
"0 BBPlay, The Three Investigators 1393 \n",
"0 BBPlay, The Three Investigators 1393 \n",
"0 Tiere BBPro 579 \n",
"0 Autos, Fahrzeuge BBSpecial 1291 \n",
"0 Geschichte, Mittelalter BBSpecial 1654 \n",
".. ... ... ... \n",
"0 China, Gebäude, Popkultur My Own Swordsman 422 \n",
"0 China, Gebäude, Popkultur My Own Swordsman 2000 \n",
"0 Gegenstände Retro Collection 1173 \n",
"0 Blumen, Pflanzen Botanical World \n",
"0 Popkultur Sherlock Holmes 1088 \n",
"\n",
"[4509 rows x 12 columns]"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"me_details"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "0fb65dec",
"metadata": {},
"outputs": [],
"source": [
"# Write the details table without its index column; get_all_ids() reads\n",
"# its ids from this same file.\n",
"me_details.to_csv(path_or_buf=\"../data/merlin/others.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.12.3)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}