{ "cells": [ { "cell_type": "code", "execution_count": 14, "id": "ad994162", "metadata": {}, "outputs": [], "source": [ "import csv\n", "import json\n", "import requests as rq\n", "import bs4\n", "import pandas as pd\n", "import time\n", "import random\n", "import re" ] }, { "cell_type": "code", "execution_count": 15, "id": "b5536e8c", "metadata": {}, "outputs": [], "source": [ "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" ] }, { "cell_type": "code", "execution_count": 16, "id": "6d109e8a", "metadata": {}, "outputs": [], "source": [ "id_to_name = dict()\n", "for producer in producers:\n", " with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n", " listings = json.load(prodfile)[\"data\"]\n", "\n", " for listing in listings:\n", " name = listing[3] \n", " id = listing[1]\n", "\n", " id_to_name[id] = name" ] }, { "cell_type": "code", "execution_count": 17, "id": "ab997198", "metadata": {}, "outputs": [], "source": [ "# uvp preise bestimmen :(\n", "def get_all_ids() -> list[str]:\n", " df = pd.read_csv(\"../data/merlin/others.csv\")\n", " return df[\"id\"].to_list()" ] }, { "cell_type": "code", "execution_count": 18, "id": "32b1fa46", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\\n for idx, id in enumerate(get_all_ids()[3663:]):\\n try:\\n small_id = id.lower()\\n\\n response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\\n soup = bs4.BeautifulSoup(response.text)\\n\\n # Prices\\n price_eur = soup.find(id=\"listprice_eur\")\\n price_usd = soup.find(id=\"listprice_usd\")\\n price_cn = soup.find(id=\"listprice_cn\")\\n bestprice_eur = soup.find(id=\"bestprice_eur\")\\n bestprice_usd = soup.find(id=\"bestprice_usd\")\\n bestprice_cn = soup.find(id=\"bestprice_cn\")\\n\\n all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\\n\\n #categories\\n other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\\n writer = csv.writer(pricefile)\\n\\n all_prices = [p.text if p != None else \"_\" for p in all_prices]\\n writer.writerow([id, *all_prices, *other_dump])\\n time.sleep(random.randint(2, 3))\\n except Exception as e:\\n print(e) '" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\" with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", " for idx, id in enumerate(get_all_ids()[3663:]):\n", " try:\n", " small_id = id.lower()\n", "\n", " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", " soup = bs4.BeautifulSoup(response.text)\n", "\n", " # Prices\n", " price_eur = soup.find(id=\"listprice_eur\")\n", " price_usd = soup.find(id=\"listprice_usd\")\n", " price_cn = soup.find(id=\"listprice_cn\")\n", " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", "\n", " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", " \n", " #categories\n", " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", " writer = csv.writer(pricefile)\n", " \n", " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", " writer.writerow([id, *all_prices, *other_dump])\n", " time.sleep(random.randint(2, 3))\n", " except Exception as e:\n", " print(e) \"\"\"" ] }, { "cell_type": "code", "execution_count": 19, "id": "4a10a1e3", "metadata": {}, "outputs": [], "source": [ "def split_by_keywords(text, keywords):\n", " pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n", " parts = re.split(pattern, text)\n", " \n", " result = {}\n", " for i in range(1, len(parts), 2):\n", " key = parts[i]\n", " value = parts[i + 1].strip()\n", " result[key] = value\n", " \n", " return result" ] }, { "cell_type": "code", "execution_count": 20, "id": "9c00f188", "metadata": {}, "outputs": [], "source": [ "keywords = [\n", " \"Listenpreis:\",\n", " \"DetailsVon:\",\n", " \"EAN:\",\n", " \"Altersempfehlung:\",\n", " \"Steine von:\",\n", " \"Bestpreis:EU:\",\n", " \"Bewertungen\",\n", " \"Inhalt\",\n", " \"PreiseListenpreis:\",\n", " \"Hersteller-Kategorie:\",\n", " \"Designer:\",\n", " \"Maße:\",\n", " \"Release:\",\n", " \"Kategorien:\",\n", " \"Hersteller-Videos\",\n", " \"EU:\",\n", " \"Anleitung\",\n", " \"BewertungenCommunity:\",\n", " \"Maßstab:\",\n", " \"Erweiterung zu:\",\n", " \"Reviews\",\n", " \"Lizenz:\",\n", " \"Farbverteilung\",\n", " \"TeilelistenBrickLink\",\n", " \"Bild:\",\n", " \"ReviewsCommunity\",\n", " \"Gewicht\",\n", " \"Keine Aufkleber\",\n", " \"Verpackungsmaße:\",\n", " \"EU:Brickmo\",\n", " \"Datenbanken:\",\n", " \"Kategorie:\",\n", " \"Keine Drucke\",\n", " \"TechnikMOC:\",\n", " \"Steingröße:\",\n", " \"SonstigesMOC:\",\n", " \"Variationen:\",\n", " \"RebrickableVariation:\"\n", "]" ] }, { "cell_type": "code", "execution_count": 21, "id": "9b44a0e5", "metadata": {}, "outputs": [], "source": [ "def rm_epsilon(l : list[str]) ->list[str]:\n", " return list(filter(lambda s : len(s) > 0, l))" ] }, { "cell_type": "code", "execution_count": 56, "id": "ae53869e", "metadata": {}, "outputs": [], "source": [ "me_details = pd.DataFrame({\n", " \"id\" : [],\n", " \"name\" : [],\n", " \"price_eur\" : [],\n", " \"price_cn\" : [],\n", " \"price_us\" : [],\n", " \"brand\" : [],\n", " \"ean\" : [],\n", " \"producer\" : [],\n", " \"release\" : [],\n", " \"category\" : [],\n", " \"producer_category\" : [],\n", " \"num_parts\" : [],\n", " })\n", "\n", "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n", " reader = csv.reader(price_file)\n", "\n", " for row in reader:\n", " id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n", " other = filter(lambda s: not \"Wikipedia\" in s, other)\n", "\n", " retrieved = split_by_keywords(\"\".join(other), keywords)\n", "\n", " brand = retrieved.get(\"DetailsVon:\", \"\").replace(\" \", \"\")\n", " ean = retrieved.get(\"EAN:\", \"\")\n", " producer = retrieved.get(\"Steine von:\", \"\")\n", " age = retrieved.get(\"Altersempfehlung:\", \"\")\n", " release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n", " num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].replace(\"Ein Teil\", \"1\").replace(\"Preise\", \"\").replace(\"Mit Fernsteuerung / Elektrik\", \"1\").replace(\"Eine Minifigur\", \"1\").replace(\"Minifiguren\", \"\").strip()\n", "\n", " category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n", " categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n", " producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n", " producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n", "\n", " if brand == \"\":\n", " continue\n", " me_extra = pd.DataFrame({\n", " \"id\" : [id],\n", " \"name\" : [id_to_name.get(id, \"\")],\n", " \"price_eur\" : [lp_eur.replace(\"_\", \"\")],\n", " \"price_us\" : [lp_usd.replace(\"_\", \"\")],\n", " \"price_cn\" : [lp_cn.replace(\"_\", \"\")],\n", " \"brand\" : [brand],\n", " \"ean\" : [ean],\n", " \"producer\" : [producer],\n", " \"release\" : [release],\n", " \"category\" : [categories],\n", " \"producer_category\" : [producer_categories],\n", " \"num_parts\" : [num_parts],\n", " })\n", "\n", " me_details = pd.concat([me_details, me_extra])" ] }, { "cell_type": "code", "execution_count": 57, "id": "1b5bcea6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnameprice_eurprice_cnprice_usbrandeanproducerreleasecategoryproducer_categorynum_parts
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, name, price_eur, price_cn, price_us, brand, ean, producer, release, category, producer_category, num_parts]\n", "Index: []" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "me_details[\"Mit Fernsteuerung / Elektrik\" == me_details[\"num_parts\"]]" ] }, { "cell_type": "code", "execution_count": 58, "id": "0fb65dec", "metadata": {}, "outputs": [], "source": [ "me_details.to_csv(\"../data/merlin/others.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "venv (3.14.4)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.4" } }, "nbformat": 4, "nbformat_minor": 5 }