{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ad994162", "metadata": {}, "outputs": [], "source": [ "import csv\n", "import json\n", "import requests as rq\n", "import bs4\n", "import pandas as pd\n", "import time\n", "import random\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "id": "b5536e8c", "metadata": {}, "outputs": [], "source": [ "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" ] }, { "cell_type": "code", "execution_count": 103, "id": "6d109e8a", "metadata": {}, "outputs": [], "source": [ "id_to_name = dict()\n", "for producer in producers:\n", " with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n", " listings = json.load(prodfile)[\"data\"]\n", "\n", " for listing in listings:\n", " name = listing[3] \n", " id = listing[1]\n", "\n", " id_to_name[id] = name" ] }, { "cell_type": "code", "execution_count": 11, "id": "ab997198", "metadata": {}, "outputs": [], "source": [ "# uvp preise bestimmen :(\n", "def get_all_ids() -> list[str]:\n", " df = pd.read_csv(\"../data/merlin/others.csv\")\n", " return df[\"id\"].to_list()" ] }, { "cell_type": "code", "execution_count": 12, "id": "32b1fa46", "metadata": {}, "outputs": [], "source": [ "with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", " for idx, id in enumerate(get_all_ids()[3663:]):\n", " try:\n", " small_id = id.lower()\n", "\n", " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", " soup = bs4.BeautifulSoup(response.text)\n", "\n", " # Prices\n", " price_eur = soup.find(id=\"listprice_eur\")\n", " price_usd = soup.find(id=\"listprice_usd\")\n", " price_cn = soup.find(id=\"listprice_cn\")\n", " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", "\n", " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", " \n", " #categories\n", " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", " writer = csv.writer(pricefile)\n", " \n", " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", " writer.writerow([id, *all_prices, *other_dump])\n", " time.sleep(random.randint(2, 3))\n", " except Exception as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 13, "id": "4a10a1e3", "metadata": {}, "outputs": [], "source": [ "def split_by_keywords(text, keywords):\n", " pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n", " parts = re.split(pattern, text)\n", " \n", " result = {}\n", " for i in range(1, len(parts), 2):\n", " key = parts[i]\n", " value = parts[i + 1].strip()\n", " result[key] = value\n", " \n", " return result" ] }, { "cell_type": "code", "execution_count": 14, "id": "9c00f188", "metadata": {}, "outputs": [], "source": [ "keywords = [\n", " \"Listenpreis:\",\n", " \"DetailsVon:\",\n", " \"EAN:\",\n", " \"Altersempfehlung:\",\n", " \"Steine von:\",\n", " \"Bestpreis:EU:\",\n", " \"Bewertungen\",\n", " \"Inhalt\",\n", " \"PreiseListenpreis:\",\n", " \"Hersteller-Kategorie:\",\n", " \"Designer:\",\n", " \"Maße:\",\n", " \"Release:\",\n", " \"Kategorien:\",\n", " \"Hersteller-Videos\",\n", " \"EU:\",\n", " \"Anleitung\",\n", " \"BewertungenCommunity:\",\n", " \"Maßstab:\",\n", " \"Erweiterung zu:\",\n", " \"Reviews\",\n", " \"Lizenz:\",\n", " \"Farbverteilung\",\n", " \"TeilelistenBrickLink\",\n", " \"Bild:\",\n", " \"ReviewsCommunity\",\n", " \"Gewicht\",\n", " \"Keine Aufkleber\",\n", " \"Verpackungsmaße:\",\n", " \"EU:Brickmo\",\n", " \"Datenbanken:\",\n", " \"Kategorie:\",\n", " \"Keine Drucke\",\n", " \"TechnikMOC:\",\n", " \"Steingröße:\",\n", " \"SonstigesMOC:\",\n", " \"Variationen:\",\n", " \"RebrickableVariation:\"\n", "]" ] }, { "cell_type": "code", "execution_count": 85, "id": "9b44a0e5", "metadata": {}, "outputs": [], "source": [ "def rm_epsilon(l : list[str]) ->list[str]:\n", " return list(filter(lambda s : len(s) > 0, l))" ] }, { "cell_type": "code", "execution_count": 110, "id": "ae53869e", "metadata": {}, "outputs": [], "source": [ "me_details = pd.DataFrame({\n", " \"id\" : [],\n", " \"name\" : [],\n", " \"price_eur\" : [],\n", " \"price_cn\" : [],\n", " \"price_us\" : [],\n", " \"brand\" : [],\n", " \"ean\" : [],\n", " \"producer\" : [],\n", " \"release\" : [],\n", " \"category\" : [],\n", " \"producer_category\" : [],\n", " \"num_parts\" : [],\n", " })\n", "\n", "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n", " reader = csv.reader(price_file)\n", "\n", " for row in reader:\n", " id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n", " other = filter(lambda s: not \"Wikipedia\" in s, other)\n", "\n", " retrieved = split_by_keywords(\"\".join(other), keywords)\n", "\n", " brand = retrieved.get(\"DetailsVon:\", \"\")\n", " ean = retrieved.get(\"EAN:\", \"\")\n", " producer = retrieved.get(\"Steine von:\", \"\")\n", " age = retrieved.get(\"Altersempfehlung:\", \"\")\n", " release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n", " num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].strip()\n", " category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n", " categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n", " producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n", " producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n", "\n", " me_extra = pd.DataFrame({\n", " \"id\" : [id],\n", " \"name\" : [id_to_name.get(id, \"\")],\n", " \"price_eur\" : [lp_eur],\n", " \"price_us\" : [lp_usd],\n", " \"price_cn\" : [lp_cn],\n", " \"brand\" : [brand],\n", " \"ean\" : [ean],\n", " \"producer\" : [producer],\n", " \"release\" : [release],\n", " \"category\" : [categories],\n", " \"producer_category\" : [producer_categories],\n", " \"num_parts\" : [num_parts],\n", " })\n", "\n", " me_details = pd.concat([me_details, me_extra])" ] }, { "cell_type": "code", "execution_count": 111, "id": "1b5bcea6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnameprice_eurprice_cnprice_usbrandeanproducerreleasecategoryproducer_categorynum_parts
0BB-108899Die drei ??? - Kids - Einbruch im Leuchtturm99.95__BlueBrixx40609040147832026BBPlay, The Three Investigators1393
0BB-108899Die drei ??? - Kids - Einbruch im Leuchtturm99.95__BlueBrixx40609040147832026BBPlay, The Three Investigators1393
0BB-108569Fledermaus29.95__BlueBrixx4060904023020Xingbao2026TiereBBPro579
0BB-1092621970er Sport Cabriolet schwarz49.95__BlueBrixxQunlong2026Autos, FahrzeugeBBSpecial1291
0BB-109021Mittelalterliche Steinbrücke59.95__BlueBrixx4060904022184Qunlong2026Geschichte, MittelalterBBSpecial1654
.......................................
0PANT-86219My Own Swordsman™ Tavern Gate 武林外传___Pantasy2023China, Gebäude, PopkulturMy Own Swordsman422
0PANT-86220My Own Swordsman™ Tong Fu Inn 武林外传___Pantasy2023China, Gebäude, PopkulturMy Own Swordsman2000
0PANT-61008Retro 1960s Television___Pantasy69738173203542022GegenständeRetro Collection1173
0PANT-15007Pink Rose___PantasyGoBricks2024Blumen, PflanzenBotanical World
0PANT-86218Sherlock Holmes™ 221B Baker Street___Pantasy69738173201562022PopkulturSherlock Holmes1088
\n", "

4509 rows × 12 columns

\n", "
" ], "text/plain": [ " id name price_eur \\\n", "0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n", "0 BB-108899 Die drei ??? - Kids - Einbruch im Leuchtturm 99.95 \n", "0 BB-108569 Fledermaus 29.95 \n", "0 BB-109262 1970er Sport Cabriolet schwarz 49.95 \n", "0 BB-109021 Mittelalterliche Steinbrücke 59.95 \n", ".. ... ... ... \n", "0 PANT-86219 My Own Swordsman™ Tavern Gate 武林外传 _ \n", "0 PANT-86220 My Own Swordsman™ Tong Fu Inn 武林外传 _ \n", "0 PANT-61008 Retro 1960s Television _ \n", "0 PANT-15007 Pink Rose _ \n", "0 PANT-86218 Sherlock Holmes™ 221B Baker Street _ \n", "\n", " price_cn price_us brand ean producer release \\\n", "0 _ _ BlueBrixx 4060904014783 2026 \n", "0 _ _ BlueBrixx 4060904014783 2026 \n", "0 _ _ BlueBrixx 4060904023020 Xingbao 2026 \n", "0 _ _ BlueBrixx Qunlong 2026 \n", "0 _ _ BlueBrixx 4060904022184 Qunlong 2026 \n", ".. ... ... ... ... ... ... \n", "0 _ _ Pantasy 2023 \n", "0 _ _ Pantasy 2023 \n", "0 _ _ Pantasy 6973817320354 2022 \n", "0 _ _ Pantasy GoBricks 2024 \n", "0 _ _ Pantasy 6973817320156 2022 \n", "\n", " category producer_category num_parts \n", "0 BBPlay, The Three Investigators 1393 \n", "0 BBPlay, The Three Investigators 1393 \n", "0 Tiere BBPro 579 \n", "0 Autos, Fahrzeuge BBSpecial 1291 \n", "0 Geschichte, Mittelalter BBSpecial 1654 \n", ".. ... ... ... \n", "0 China, Gebäude, Popkultur My Own Swordsman 422 \n", "0 China, Gebäude, Popkultur My Own Swordsman 2000 \n", "0 Gegenstände Retro Collection 1173 \n", "0 Blumen, Pflanzen Botanical World \n", "0 Popkultur Sherlock Holmes 1088 \n", "\n", "[4509 rows x 12 columns]" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "me_details" ] }, { "cell_type": "code", "execution_count": 112, "id": "0fb65dec", "metadata": {}, "outputs": [], "source": [ "me_details.to_csv(\"../data/merlin/others.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "venv (3.12.3)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }