adjusted merlin

main
Roman Schöne 2026-05-02 12:38:16 +02:00
parent 8720ab9c57
commit 4cee889722
3 changed files with 4578 additions and 18459 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"id": "ad994162",
"metadata": {},
"outputs": [],
@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "b5536e8c",
"metadata": {},
"outputs": [],
@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 16,
"id": "6d109e8a",
"metadata": {},
"outputs": [],
@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 17,
"id": "ab997198",
"metadata": {},
"outputs": [],
@ -61,12 +61,23 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 18,
"id": "32b1fa46",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"' with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\\n for idx, id in enumerate(get_all_ids()[3663:]):\\n try:\\n small_id = id.lower()\\n\\n response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\\n soup = bs4.BeautifulSoup(response.text)\\n\\n # Prices\\n price_eur = soup.find(id=\"listprice_eur\")\\n price_usd = soup.find(id=\"listprice_usd\")\\n price_cn = soup.find(id=\"listprice_cn\")\\n bestprice_eur = soup.find(id=\"bestprice_eur\")\\n bestprice_usd = soup.find(id=\"bestprice_usd\")\\n bestprice_cn = soup.find(id=\"bestprice_cn\")\\n\\n all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\\n\\n #categories\\n other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\\n writer = csv.writer(pricefile)\\n\\n all_prices = [p.text if p != None else \"_\" for p in all_prices]\\n writer.writerow([id, *all_prices, *other_dump])\\n time.sleep(random.randint(2, 3))\\n except Exception as e:\\n print(e) '"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
"\"\"\" with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
" for idx, id in enumerate(get_all_ids()[3663:]):\n",
" try:\n",
" small_id = id.lower()\n",
@ -92,12 +103,12 @@
" writer.writerow([id, *all_prices, *other_dump])\n",
" time.sleep(random.randint(2, 3))\n",
" except Exception as e:\n",
" print(e)"
" print(e) \"\"\""
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 19,
"id": "4a10a1e3",
"metadata": {},
"outputs": [],
@ -117,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 20,
"id": "9c00f188",
"metadata": {},
"outputs": [],
@ -166,7 +177,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 21,
"id": "9b44a0e5",
"metadata": {},
"outputs": [],
@ -177,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 110,
"execution_count": 56,
"id": "ae53869e",
"metadata": {},
"outputs": [],
@ -206,23 +217,26 @@
"\n",
" retrieved = split_by_keywords(\"\".join(other), keywords)\n",
"\n",
" brand = retrieved.get(\"DetailsVon:\", \"\")\n",
" brand = retrieved.get(\"DetailsVon:\", \"\").replace(\" \", \"\")\n",
" ean = retrieved.get(\"EAN:\", \"\")\n",
" producer = retrieved.get(\"Steine von:\", \"\")\n",
" age = retrieved.get(\"Altersempfehlung:\", \"\")\n",
" release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n",
" num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].strip()\n",
" num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].replace(\"Ein Teil\", \"1\").replace(\"Preise\", \"\").replace(\"Mit Fernsteuerung / Elektrik\", \"1\").replace(\"Eine Minifigur\", \"1\").replace(\"Minifiguren\", \"\").strip()\n",
"\n",
" category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n",
" categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n",
" producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n",
" producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n",
"\n",
" if brand == \"\":\n",
" continue\n",
" me_extra = pd.DataFrame({\n",
" \"id\" : [id],\n",
" \"name\" : [id_to_name.get(id, \"\")],\n",
" \"price_eur\" : [lp_eur],\n",
" \"price_us\" : [lp_usd],\n",
" \"price_cn\" : [lp_cn],\n",
" \"price_eur\" : [lp_eur.replace(\"_\", \"\")],\n",
" \"price_us\" : [lp_usd.replace(\"_\", \"\")],\n",
" \"price_cn\" : [lp_cn.replace(\"_\", \"\")],\n",
" \"brand\" : [brand],\n",
" \"ean\" : [ean],\n",
" \"producer\" : [producer],\n",
@ -237,7 +251,7 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 57,
"id": "1b5bcea6",
"metadata": {},
"outputs": [
@ -277,231 +291,28 @@
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108899</td>\n",
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
" <td>99.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904014783</td>\n",
" <td></td>\n",
" <td>2026</td>\n",
" <td>BBPlay, The Three Investigators</td>\n",
" <td></td>\n",
" <td>1393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108899</td>\n",
" <td>Die drei ??? - Kids - Einbruch im Leuchtturm</td>\n",
" <td>99.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904014783</td>\n",
" <td></td>\n",
" <td>2026</td>\n",
" <td>BBPlay, The Three Investigators</td>\n",
" <td></td>\n",
" <td>1393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-108569</td>\n",
" <td>Fledermaus</td>\n",
" <td>29.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904023020</td>\n",
" <td>Xingbao</td>\n",
" <td>2026</td>\n",
" <td>Tiere</td>\n",
" <td>BBPro</td>\n",
" <td>579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-109262</td>\n",
" <td>1970er Sport Cabriolet schwarz</td>\n",
" <td>49.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td></td>\n",
" <td>Qunlong</td>\n",
" <td>2026</td>\n",
" <td>Autos, Fahrzeuge</td>\n",
" <td>BBSpecial</td>\n",
" <td>1291</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BB-109021</td>\n",
" <td>Mittelalterliche Steinbrücke</td>\n",
" <td>59.95</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>BlueBrixx</td>\n",
" <td>4060904022184</td>\n",
" <td>Qunlong</td>\n",
" <td>2026</td>\n",
" <td>Geschichte, Mittelalter</td>\n",
" <td>BBSpecial</td>\n",
" <td>1654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86219</td>\n",
" <td>My Own Swordsman™ Tavern Gate 武林外传</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2023</td>\n",
" <td>China, Gebäude, Popkultur</td>\n",
" <td>My Own Swordsman</td>\n",
" <td>422</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86220</td>\n",
" <td>My Own Swordsman™ Tong Fu Inn 武林外传</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2023</td>\n",
" <td>China, Gebäude, Popkultur</td>\n",
" <td>My Own Swordsman</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-61008</td>\n",
" <td>Retro 1960s Television</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td>6973817320354</td>\n",
" <td></td>\n",
" <td>2022</td>\n",
" <td>Gegenstände</td>\n",
" <td>Retro Collection</td>\n",
" <td>1173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-15007</td>\n",
" <td>Pink Rose</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td></td>\n",
" <td>GoBricks</td>\n",
" <td>2024</td>\n",
" <td>Blumen, Pflanzen</td>\n",
" <td>Botanical World</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PANT-86218</td>\n",
" <td>Sherlock Holmes™ 221B Baker Street</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>_</td>\n",
" <td>Pantasy</td>\n",
" <td>6973817320156</td>\n",
" <td></td>\n",
" <td>2022</td>\n",
" <td>Popkultur</td>\n",
" <td>Sherlock Holmes</td>\n",
" <td>1088</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4509 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" id name price_eur \\\n",
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
"0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n",
"0 BB-108569 Fledermaus 29.95 \n",
"0 BB-109262 1970er Sport Cabriolet schwarz 49.95 \n",
"0 BB-109021 Mittelalterliche Steinbrücke 59.95 \n",
".. ... ... ... \n",
"0 PANT-86219 My Own Swordsman™ Tavern Gate 武林外传 _ \n",
"0 PANT-86220 My Own Swordsman™ Tong Fu Inn 武林外传 _ \n",
"0 PANT-61008 Retro 1960s Television _ \n",
"0 PANT-15007 Pink Rose _ \n",
"0 PANT-86218 Sherlock Holmes™ 221B Baker Street _ \n",
"\n",
" price_cn price_us brand ean producer release \\\n",
"0 _ _ BlueBrixx 4060904014783 2026 \n",
"0 _ _ BlueBrixx 4060904014783 2026 \n",
"0 _ _ BlueBrixx 4060904023020 Xingbao 2026 \n",
"0 _ _ BlueBrixx Qunlong 2026 \n",
"0 _ _ BlueBrixx 4060904022184 Qunlong 2026 \n",
".. ... ... ... ... ... ... \n",
"0 _ _ Pantasy 2023 \n",
"0 _ _ Pantasy 2023 \n",
"0 _ _ Pantasy 6973817320354 2022 \n",
"0 _ _ Pantasy GoBricks 2024 \n",
"0 _ _ Pantasy 6973817320156 2022 \n",
"\n",
" category producer_category num_parts \n",
"0 BBPlay, The Three Investigators 1393 \n",
"0 BBPlay, The Three Investigators 1393 \n",
"0 Tiere BBPro 579 \n",
"0 Autos, Fahrzeuge BBSpecial 1291 \n",
"0 Geschichte, Mittelalter BBSpecial 1654 \n",
".. ... ... ... \n",
"0 China, Gebäude, Popkultur My Own Swordsman 422 \n",
"0 China, Gebäude, Popkultur My Own Swordsman 2000 \n",
"0 Gegenstände Retro Collection 1173 \n",
"0 Blumen, Pflanzen Botanical World \n",
"0 Popkultur Sherlock Holmes 1088 \n",
"\n",
"[4509 rows x 12 columns]"
"Empty DataFrame\n",
"Columns: [id, name, price_eur, price_cn, price_us, brand, ean, producer, release, category, producer_category, num_parts]\n",
"Index: []"
]
},
"execution_count": 111,
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"me_details"
"me_details[\"Mit Fernsteuerung / Elektrik\" == me_details[\"num_parts\"]]"
]
},
{
"cell_type": "code",
"execution_count": 112,
"execution_count": 58,
"id": "0fb65dec",
"metadata": {},
"outputs": [],
@ -512,7 +323,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.12.3)",
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
@ -526,7 +337,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.14.4"
}
},
"nbformat": 4,