535 lines
18 KiB
Plaintext
535 lines
18 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ad994162",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import csv\n",
|
||
"import json\n",
|
||
"import requests as rq\n",
|
||
"import bs4\n",
|
||
"import pandas as pd\n",
|
||
"import time\n",
|
||
"import random\n",
|
||
"import re"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b5536e8c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Producers whose listing dumps are read from ../data/merlin/<producer>.json by the id -> name cell.\n",
"producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 103,
|
||
"id": "6d109e8a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Map every set id to its name, across all producer listing dumps.\n",
"# Each dump is a JSON object whose \"data\" entry is a list of listings; the id is\n",
"# read from position 1 and the name from position 3 of each listing.\n",
"id_to_name = {}\n",
"for producer in producers:\n",
"    with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n",
"        listings = json.load(prodfile)[\"data\"]\n",
"\n",
"    # `set_id` rather than `id`: a notebook-global `id` would shadow the builtin\n",
"    # for the rest of the kernel session.\n",
"    for listing in listings:\n",
"        set_id, name = listing[1], listing[3]\n",
"        id_to_name[set_id] = name"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "ab997198",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Determine the recommended retail (UVP) prices :(\n",
"def get_all_ids() -> list[str]:\n",
"    \"\"\"Return the values of the ``id`` column of ``../data/merlin/others.csv`` as a list.\"\"\"\n",
"    others = pd.read_csv(\"../data/merlin/others.csv\")\n",
"    id_column = others[\"id\"]\n",
"    return id_column.to_list()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "32b1fa46",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Scrape the list prices, best prices and free-text detail blocks of every set\n",
"# and append one CSV row per set to prices.csv.\n",
"# Row layout: id, list price EUR/CN/USD, best price EUR/CN/USD, then one field\n",
"# per page element with class \"setpage_ct\".\n",
"RESUME_FROM = 3663  # ids before this position are skipped (presumably scraped in an earlier run)\n",
"REQUEST_TIMEOUT_S = 30  # without a timeout a stalled connection would block the loop forever\n",
"PRICE_ELEMENT_IDS = [\n",
"    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
"    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
"]\n",
"\n",
"with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
"    writer = csv.writer(pricefile)  # one writer for the whole run\n",
"\n",
"    for set_id in get_all_ids()[RESUME_FROM:]:\n",
"        try:\n",
"            response = rq.get(\n",
"                f\"https://www.merlinssteine.de/sets/{set_id.lower()}\",\n",
"                timeout=REQUEST_TIMEOUT_S,\n",
"            )\n",
"            # Fail on HTTP error statuses instead of parsing the error page and\n",
"            # recording whatever placeholders result from it.\n",
"            response.raise_for_status()\n",
"            # Name the parser explicitly so the result does not depend on which\n",
"            # optional parsers happen to be installed.\n",
"            soup = bs4.BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
"            # Prices (\"_\" marks an element that is missing from the page)\n",
"            price_tags = [soup.find(id=element_id) for element_id in PRICE_ELEMENT_IDS]\n",
"            all_prices = [tag.text if tag is not None else \"_\" for tag in price_tags]\n",
"\n",
"            # Categories and other details, with line breaks removed\n",
"            other_dump = [block.text.replace(\"\\n\", \"\") for block in soup.find_all(class_=\"setpage_ct\")]\n",
"\n",
"            writer.writerow([set_id, *all_prices, *other_dump])\n",
"        except Exception as e:\n",
"            # Best effort: report which set failed and carry on with the next one.\n",
"            print(f\"{set_id}: {e!r}\")\n",
"        finally:\n",
"            # Pause after every request, including failed ones.\n",
"            time.sleep(random.randint(2, 3))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "4a10a1e3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def split_by_keywords(text, keywords):\n",
"    \"\"\"Split ``text`` into a ``{keyword: following text}`` mapping.\n",
"\n",
"    Every occurrence of one of ``keywords`` starts a new entry whose value is the\n",
"    text up to the next keyword (or the end of ``text``), stripped of surrounding\n",
"    whitespace. Text before the first keyword is discarded, and when a keyword\n",
"    occurs more than once the last occurrence wins.\n",
"\n",
"    Args:\n",
"        text: The string to split.\n",
"        keywords: Literal (non-regex) delimiter strings; empty strings are ignored.\n",
"\n",
"    Returns:\n",
"        dict mapping each keyword found in ``text`` to the text that follows it.\n",
"    \"\"\"\n",
"    # Regex alternation takes the first alternative that matches, so longer keywords\n",
"    # go first; otherwise \"EU:\" would always pre-empt \"EU:Brickmo\".\n",
"    by_length = sorted((kw for kw in keywords if kw), key=len, reverse=True)\n",
"    if not by_length:\n",
"        # An empty alternation would match the empty string at every position.\n",
"        return {}\n",
"\n",
"    pattern = r'(' + '|'.join(map(re.escape, by_length)) + r')'\n",
"    # With one capture group, re.split alternates text and delimiters:\n",
"    # [prefix, kw1, text1, kw2, text2, ...], so the keywords sit at the odd indices.\n",
"    parts = re.split(pattern, text)\n",
"\n",
"    result = {}\n",
"    for i in range(1, len(parts), 2):\n",
"        result[parts[i]] = parts[i + 1].strip()\n",
"\n",
"    return result"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "9c00f188",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Labels passed to split_by_keywords as delimiters when the concatenated detail\n",
"# text of a set is split into fields.\n",
"keywords = [\n",
"    \"Listenpreis:\",\n",
"    \"DetailsVon:\",\n",
"    \"EAN:\",\n",
"    \"Altersempfehlung:\",\n",
"    \"Steine von:\",\n",
"    \"Bestpreis:EU:\",\n",
"    \"Bewertungen\",\n",
"    \"Inhalt\",\n",
"    \"PreiseListenpreis:\",\n",
"    \"Hersteller-Kategorie:\",\n",
"    # The details cell looks up this plural label, so it has to be a delimiter too.\n",
"    # Without it the text is split at the embedded \"Kategorien:\" instead, and the\n",
"    # producer categories replace the value stored under \"Kategorien:\".\n",
"    \"Hersteller-Kategorien:\",\n",
"    \"Designer:\",\n",
"    \"Maße:\",\n",
"    \"Release:\",\n",
"    \"Kategorien:\",\n",
"    \"Hersteller-Videos\",\n",
"    \"EU:\",\n",
"    \"Anleitung\",\n",
"    \"BewertungenCommunity:\",\n",
"    \"Maßstab:\",\n",
"    \"Erweiterung zu:\",\n",
"    \"Reviews\",\n",
"    \"Lizenz:\",\n",
"    \"Farbverteilung\",\n",
"    \"TeilelistenBrickLink\",\n",
"    \"Bild:\",\n",
"    \"ReviewsCommunity\",\n",
"    \"Gewicht\",\n",
"    \"Keine Aufkleber\",\n",
"    \"Verpackungsmaße:\",\n",
"    \"EU:Brickmo\",\n",
"    \"Datenbanken:\",\n",
"    \"Kategorie:\",\n",
"    \"Keine Drucke\",\n",
"    \"TechnikMOC:\",\n",
"    \"Steingröße:\",\n",
"    \"SonstigesMOC:\",\n",
"    \"Variationen:\",\n",
"    \"RebrickableVariation:\"\n",
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 85,
|
||
"id": "9b44a0e5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def rm_epsilon(l: list[str]) -> list[str]:\n",
"    \"\"\"Return the elements of ``l`` in their original order, leaving out the empty strings.\"\"\"\n",
"    return [s for s in l if len(s) > 0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 110,
|
||
"id": "ae53869e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Build one details row per line of prices.csv.\n",
"# Line layout: id, list price EUR/CN/USD, best price EUR/CN/USD, followed by the\n",
"# free-text detail blocks of the set page.\n",
"DETAIL_COLUMNS = [\n",
"    \"id\", \"name\", \"price_eur\", \"price_cn\", \"price_us\", \"brand\", \"ean\",\n",
"    \"producer\", \"release\", \"category\", \"producer_category\", \"num_parts\",\n",
"]\n",
"N_FIXED_FIELDS = 7  # id + three list prices + three best prices\n",
"\n",
"detail_rows = []\n",
"# newline=\"\" is what the csv module expects for the files it reads.\n",
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\", newline=\"\") as price_file:\n",
"    for row in csv.reader(price_file):\n",
"        if len(row) < N_FIXED_FIELDS:\n",
"            # Blank or truncated line: there is nothing to unpack.\n",
"            continue\n",
"\n",
"        # `set_id` rather than `id` keeps the builtin usable; the best prices are\n",
"        # present in the file but not carried over into the table.\n",
"        set_id, lp_eur, lp_cn, lp_usd, _bp_eur, _bp_cn, _bp_usd, *other = row\n",
"        other = [block for block in other if \"Wikipedia\" not in block]\n",
"\n",
"        retrieved = split_by_keywords(\"\".join(other), keywords)\n",
"\n",
"        # The singular and the plural label are looked up separately and merged.\n",
"        category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n",
"        categories = \",\".join(\n",
"            rm_epsilon(retrieved.get(\"Kategorien:\", \"\").split(\",\") + category)\n",
"        ).replace(\"Hersteller\", \"\")\n",
"        producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n",
"        producer_categories = \",\".join(\n",
"            rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category)\n",
"        )\n",
"\n",
"        detail_rows.append({\n",
"            \"id\": set_id,\n",
"            \"name\": id_to_name.get(set_id, \"\"),\n",
"            \"price_eur\": lp_eur,\n",
"            \"price_cn\": lp_cn,\n",
"            \"price_us\": lp_usd,\n",
"            \"brand\": retrieved.get(\"DetailsVon:\", \"\"),\n",
"            \"ean\": retrieved.get(\"EAN:\", \"\"),\n",
"            \"producer\": retrieved.get(\"Steine von:\", \"\"),\n",
"            # Last space-separated token of the release text.\n",
"            \"release\": retrieved.get(\"Release:\", \"\").split(\" \")[-1],\n",
"            \"category\": categories,\n",
"            \"producer_category\": producer_categories,\n",
"            # Text in front of the word \"Teile\".\n",
"            \"num_parts\": retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].strip(),\n",
"        })\n",
"\n",
"# Build the frame once. Concatenating inside the loop copies the whole frame for\n",
"# every row and leaves each row with index label 0.\n",
"# NOTE(review): the scraper opens prices.csv in append mode, so repeated runs can\n",
"# leave several lines for the same id; they are not deduplicated here.\n",
"me_details = pd.DataFrame(detail_rows, columns=DETAIL_COLUMNS)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 111,
|
||
"id": "1b5bcea6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>price_eur</th>\n",
|
||
" <th>price_cn</th>\n",
|
||
" <th>price_us</th>\n",
|
||
" <th>brand</th>\n",
|
||
" <th>ean</th>\n",
|
||
" <th>producer</th>\n",
|
||
" <th>release</th>\n",
|
||
" <th>category</th>\n",
|
||
" <th>producer_category</th>\n",
|
||
" <th>num_parts</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BB-108899</td>\n",
|
||
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
|
||
" <td>99.95</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>BlueBrixx</td>\n",
|
||
" <td>4060904014783</td>\n",
|
||
" <td></td>\n",
|
||
" <td>2026</td>\n",
|
||
" <td>BBPlay, The Three Investigators</td>\n",
|
||
" <td></td>\n",
|
||
" <td>1393</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BB-108899</td>\n",
|
||
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
|
||
" <td>99.95</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>BlueBrixx</td>\n",
|
||
" <td>4060904014783</td>\n",
|
||
" <td></td>\n",
|
||
" <td>2026</td>\n",
|
||
" <td>BBPlay, The Three Investigators</td>\n",
|
||
" <td></td>\n",
|
||
" <td>1393</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BB-108569</td>\n",
|
||
" <td>Fledermaus</td>\n",
|
||
" <td>29.95</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>BlueBrixx</td>\n",
|
||
" <td>4060904023020</td>\n",
|
||
" <td>Xingbao</td>\n",
|
||
" <td>2026</td>\n",
|
||
" <td>Tiere</td>\n",
|
||
" <td>BBPro</td>\n",
|
||
" <td>579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BB-109262</td>\n",
|
||
" <td>1970er Sport Cabriolet schwarz</td>\n",
|
||
" <td>49.95</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>BlueBrixx</td>\n",
|
||
" <td></td>\n",
|
||
" <td>Qunlong</td>\n",
|
||
" <td>2026</td>\n",
|
||
" <td>Autos, Fahrzeuge</td>\n",
|
||
" <td>BBSpecial</td>\n",
|
||
" <td>1291</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BB-109021</td>\n",
|
||
" <td>Mittelalterliche Steinbrücke</td>\n",
|
||
" <td>59.95</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>BlueBrixx</td>\n",
|
||
" <td>4060904022184</td>\n",
|
||
" <td>Qunlong</td>\n",
|
||
" <td>2026</td>\n",
|
||
" <td>Geschichte, Mittelalter</td>\n",
|
||
" <td>BBSpecial</td>\n",
|
||
" <td>1654</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>PANT-86219</td>\n",
|
||
" <td>My Own Swordsman™ Tavern Gate 武林外传</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>Pantasy</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>China, Gebäude, Popkultur</td>\n",
|
||
" <td>My Own Swordsman</td>\n",
|
||
" <td>422</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>PANT-86220</td>\n",
|
||
" <td>My Own Swordsman™ Tong Fu Inn 武林外传</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>Pantasy</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>China, Gebäude, Popkultur</td>\n",
|
||
" <td>My Own Swordsman</td>\n",
|
||
" <td>2000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>PANT-61008</td>\n",
|
||
" <td>Retro 1960s Television</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>Pantasy</td>\n",
|
||
" <td>6973817320354</td>\n",
|
||
" <td></td>\n",
|
||
" <td>2022</td>\n",
|
||
" <td>Gegenstände</td>\n",
|
||
" <td>Retro Collection</td>\n",
|
||
" <td>1173</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>PANT-15007</td>\n",
|
||
" <td>Pink Rose</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>Pantasy</td>\n",
|
||
" <td></td>\n",
|
||
" <td>GoBricks</td>\n",
|
||
" <td>2024</td>\n",
|
||
" <td>Blumen, Pflanzen</td>\n",
|
||
" <td>Botanical World</td>\n",
|
||
" <td></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>PANT-86218</td>\n",
|
||
" <td>Sherlock Holmes™ 221B Baker Street</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>_</td>\n",
|
||
" <td>Pantasy</td>\n",
|
||
" <td>6973817320156</td>\n",
|
||
" <td></td>\n",
|
||
" <td>2022</td>\n",
|
||
" <td>Popkultur</td>\n",
|
||
" <td>Sherlock Holmes</td>\n",
|
||
" <td>1088</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>4509 rows × 12 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id name price_eur \\\n",
|
||
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
|
||
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
|
||
"0 BB-108569 Fledermaus 29.95 \n",
|
||
"0 BB-109262 1970er Sport Cabriolet schwarz 49.95 \n",
|
||
"0 BB-109021 Mittelalterliche Steinbrücke 59.95 \n",
|
||
".. ... ... ... \n",
|
||
"0 PANT-86219 My Own Swordsman™ Tavern Gate 武林外传 _ \n",
|
||
"0 PANT-86220 My Own Swordsman™ Tong Fu Inn 武林外传 _ \n",
|
||
"0 PANT-61008 Retro 1960s Television _ \n",
|
||
"0 PANT-15007 Pink Rose _ \n",
|
||
"0 PANT-86218 Sherlock Holmes™ 221B Baker Street _ \n",
|
||
"\n",
|
||
" price_cn price_us brand ean producer release \\\n",
|
||
"0 _ _ BlueBrixx 4060904014783 2026 \n",
|
||
"0 _ _ BlueBrixx 4060904014783 2026 \n",
|
||
"0 _ _ BlueBrixx 4060904023020 Xingbao 2026 \n",
|
||
"0 _ _ BlueBrixx Qunlong 2026 \n",
|
||
"0 _ _ BlueBrixx 4060904022184 Qunlong 2026 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"0 _ _ Pantasy 2023 \n",
|
||
"0 _ _ Pantasy 2023 \n",
|
||
"0 _ _ Pantasy 6973817320354 2022 \n",
|
||
"0 _ _ Pantasy GoBricks 2024 \n",
|
||
"0 _ _ Pantasy 6973817320156 2022 \n",
|
||
"\n",
|
||
" category producer_category num_parts \n",
|
||
"0 BBPlay, The Three Investigators 1393 \n",
|
||
"0 BBPlay, The Three Investigators 1393 \n",
|
||
"0 Tiere BBPro 579 \n",
|
||
"0 Autos, Fahrzeuge BBSpecial 1291 \n",
|
||
"0 Geschichte, Mittelalter BBSpecial 1654 \n",
|
||
".. ... ... ... \n",
|
||
"0 China, Gebäude, Popkultur My Own Swordsman 422 \n",
|
||
"0 China, Gebäude, Popkultur My Own Swordsman 2000 \n",
|
||
"0 Gegenstände Retro Collection 1173 \n",
|
||
"0 Blumen, Pflanzen Botanical World \n",
|
||
"0 Popkultur Sherlock Holmes 1088 \n",
|
||
"\n",
|
||
"[4509 rows x 12 columns]"
|
||
]
|
||
},
|
||
"execution_count": 111,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Show the assembled details table.\n",
"me_details"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 112,
|
||
"id": "0fb65dec",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Persist the details table without the index column. get_all_ids() reads the\n",
"# \"id\" column of this same file.\n",
"me_details.to_csv(\"../data/merlin/others.csv\", index=False)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "venv (3.12.3)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|