{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ad994162", "metadata": {}, "outputs": [], "source": [ "import csv\n", "import json\n", "import requests as rq\n", "import bs4\n", "import pandas as pd\n", "import time\n", "import random\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "id": "b5536e8c", "metadata": {}, "outputs": [], "source": [ "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" ] }, { "cell_type": "code", "execution_count": 103, "id": "6d109e8a", "metadata": {}, "outputs": [], "source": [ "id_to_name = dict()\n", "for producer in producers:\n", " with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n", " listings = json.load(prodfile)[\"data\"]\n", "\n", " for listing in listings:\n", " name = listing[3] \n", " id = listing[1]\n", "\n", " id_to_name[id] = name" ] }, { "cell_type": "code", "execution_count": 11, "id": "ab997198", "metadata": {}, "outputs": [], "source": [ "# uvp preise bestimmen :(\n", "def get_all_ids() -> list[str]:\n", " df = pd.read_csv(\"../data/merlin/others.csv\")\n", " return df[\"id\"].to_list()" ] }, { "cell_type": "code", "execution_count": 12, "id": "32b1fa46", "metadata": {}, "outputs": [], "source": [ "with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", " for idx, id in enumerate(get_all_ids()[3663:]):\n", " try:\n", " small_id = id.lower()\n", "\n", " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", " soup = bs4.BeautifulSoup(response.text)\n", "\n", " # Prices\n", " price_eur = soup.find(id=\"listprice_eur\")\n", " price_usd = soup.find(id=\"listprice_usd\")\n", " price_cn = soup.find(id=\"listprice_cn\")\n", " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", "\n", " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", " \n", " #categories\n", " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", " writer = csv.writer(pricefile)\n", " \n", " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", " writer.writerow([id, *all_prices, *other_dump])\n", " time.sleep(random.randint(2, 3))\n", " except Exception as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 13, "id": "4a10a1e3", "metadata": {}, "outputs": [], "source": [ "def split_by_keywords(text, keywords):\n", " pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n", " parts = re.split(pattern, text)\n", " \n", " result = {}\n", " for i in range(1, len(parts), 2):\n", " key = parts[i]\n", " value = parts[i + 1].strip()\n", " result[key] = value\n", " \n", " return result" ] }, { "cell_type": "code", "execution_count": 14, "id": "9c00f188", "metadata": {}, "outputs": [], "source": [ "keywords = [\n", " \"Listenpreis:\",\n", " \"DetailsVon:\",\n", " \"EAN:\",\n", " \"Altersempfehlung:\",\n", " \"Steine von:\",\n", " \"Bestpreis:EU:\",\n", " \"Bewertungen\",\n", " \"Inhalt\",\n", " \"PreiseListenpreis:\",\n", " \"Hersteller-Kategorie:\",\n", " \"Designer:\",\n", " \"Maße:\",\n", " \"Release:\",\n", " \"Kategorien:\",\n", " \"Hersteller-Videos\",\n", " \"EU:\",\n", " \"Anleitung\",\n", " \"BewertungenCommunity:\",\n", " \"Maßstab:\",\n", " \"Erweiterung zu:\",\n", " \"Reviews\",\n", " \"Lizenz:\",\n", " \"Farbverteilung\",\n", " \"TeilelistenBrickLink\",\n", " \"Bild:\",\n", " \"ReviewsCommunity\",\n", " \"Gewicht\",\n", " \"Keine Aufkleber\",\n", " \"Verpackungsmaße:\",\n", " \"EU:Brickmo\",\n", " \"Datenbanken:\",\n", " \"Kategorie:\",\n", " \"Keine Drucke\",\n", " \"TechnikMOC:\",\n", " \"Steingröße:\",\n", " \"SonstigesMOC:\",\n", " \"Variationen:\",\n", " \"RebrickableVariation:\"\n", "]" ] }, { "cell_type": "code", "execution_count": 85, "id": "9b44a0e5", "metadata": {}, "outputs": [], "source": [ "def rm_epsilon(l : list[str]) ->list[str]:\n", " return list(filter(lambda s : len(s) > 0, l))" ] }, { "cell_type": "code", "execution_count": 110, "id": "ae53869e", "metadata": {}, "outputs": [], "source": [ "me_details = pd.DataFrame({\n", " \"id\" : [],\n", " \"name\" : [],\n", " \"price_eur\" : [],\n", " \"price_cn\" : [],\n", " \"price_us\" : [],\n", " \"brand\" : [],\n", " \"ean\" : [],\n", " \"producer\" : [],\n", " \"release\" : [],\n", " \"category\" : [],\n", " \"producer_category\" : [],\n", " \"num_parts\" : [],\n", " })\n", "\n", "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n", " reader = csv.reader(price_file)\n", "\n", " for row in reader:\n", " id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n", " other = filter(lambda s: not \"Wikipedia\" in s, other)\n", "\n", " retrieved = split_by_keywords(\"\".join(other), keywords)\n", "\n", " brand = retrieved.get(\"DetailsVon:\", \"\")\n", " ean = retrieved.get(\"EAN:\", \"\")\n", " producer = retrieved.get(\"Steine von:\", \"\")\n", " age = retrieved.get(\"Altersempfehlung:\", \"\")\n", " release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n", " num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].strip()\n", " category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n", " categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n", " producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n", " producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n", "\n", " me_extra = pd.DataFrame({\n", " \"id\" : [id],\n", " \"name\" : [id_to_name.get(id, \"\")],\n", " \"price_eur\" : [lp_eur],\n", " \"price_us\" : [lp_usd],\n", " \"price_cn\" : [lp_cn],\n", " \"brand\" : [brand],\n", " \"ean\" : [ean],\n", " \"producer\" : [producer],\n", " \"release\" : [release],\n", " \"category\" : [categories],\n", " \"producer_category\" : [producer_categories],\n", " \"num_parts\" : [num_parts],\n", " })\n", "\n", " me_details = pd.concat([me_details, me_extra])" ] }, { "cell_type": "code", "execution_count": 111, "id": "1b5bcea6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "name | \n", "price_eur | \n", "price_cn | \n", "price_us | \n", "brand | \n", "ean | \n", "producer | \n", "release | \n", "category | \n", "producer_category | \n", "num_parts | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "BB-108899 | \n", "Die drei ??? - Kids - Einbruch im Leuchtturm | \n", "99.95 | \n", "_ | \n", "_ | \n", "BlueBrixx | \n", "4060904014783 | \n", "\n", " | 2026 | \n", "BBPlay, The Three Investigators | \n", "\n", " | 1393 | \n", "
| 0 | \n", "BB-108899 | \n", "Die drei ??? - Kids - Einbruch im Leuchtturm | \n", "99.95 | \n", "_ | \n", "_ | \n", "BlueBrixx | \n", "4060904014783 | \n", "\n", " | 2026 | \n", "BBPlay, The Three Investigators | \n", "\n", " | 1393 | \n", "
| 0 | \n", "BB-108569 | \n", "Fledermaus | \n", "29.95 | \n", "_ | \n", "_ | \n", "BlueBrixx | \n", "4060904023020 | \n", "Xingbao | \n", "2026 | \n", "Tiere | \n", "BBPro | \n", "579 | \n", "
| 0 | \n", "BB-109262 | \n", "1970er Sport Cabriolet schwarz | \n", "49.95 | \n", "_ | \n", "_ | \n", "BlueBrixx | \n", "\n", " | Qunlong | \n", "2026 | \n", "Autos, Fahrzeuge | \n", "BBSpecial | \n", "1291 | \n", "
| 0 | \n", "BB-109021 | \n", "Mittelalterliche Steinbrücke | \n", "59.95 | \n", "_ | \n", "_ | \n", "BlueBrixx | \n", "4060904022184 | \n", "Qunlong | \n", "2026 | \n", "Geschichte, Mittelalter | \n", "BBSpecial | \n", "1654 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 0 | \n", "PANT-86219 | \n", "My Own Swordsman™ Tavern Gate 武林外传 | \n", "_ | \n", "_ | \n", "_ | \n", "Pantasy | \n", "\n", " | \n", " | 2023 | \n", "China, Gebäude, Popkultur | \n", "My Own Swordsman | \n", "422 | \n", "
| 0 | \n", "PANT-86220 | \n", "My Own Swordsman™ Tong Fu Inn 武林外传 | \n", "_ | \n", "_ | \n", "_ | \n", "Pantasy | \n", "\n", " | \n", " | 2023 | \n", "China, Gebäude, Popkultur | \n", "My Own Swordsman | \n", "2000 | \n", "
| 0 | \n", "PANT-61008 | \n", "Retro 1960s Television | \n", "_ | \n", "_ | \n", "_ | \n", "Pantasy | \n", "6973817320354 | \n", "\n", " | 2022 | \n", "Gegenstände | \n", "Retro Collection | \n", "1173 | \n", "
| 0 | \n", "PANT-15007 | \n", "Pink Rose | \n", "_ | \n", "_ | \n", "_ | \n", "Pantasy | \n", "\n", " | GoBricks | \n", "2024 | \n", "Blumen, Pflanzen | \n", "Botanical World | \n", "\n", " |
| 0 | \n", "PANT-86218 | \n", "Sherlock Holmes™ 221B Baker Street | \n", "_ | \n", "_ | \n", "_ | \n", "Pantasy | \n", "6973817320156 | \n", "\n", " | 2022 | \n", "Popkultur | \n", "Sherlock Holmes | \n", "1088 | \n", "
4509 rows × 12 columns
\n", "