{ "cells": [ { "cell_type": "code", "execution_count": 44, "id": "ad994162", "metadata": {}, "outputs": [], "source": [ "import csv\n", "import json\n", "import requests as rq\n", "import bs4\n", "import pandas as pd\n", "import time\n", "import random\n", "import re\n", "import pprint" ] }, { "cell_type": "code", "execution_count": null, "id": "b5536e8c", "metadata": {}, "outputs": [], "source": [ "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "a5daea73", "metadata": {}, "outputs": [], "source": [ "with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n", " writer = csv.writer(producerfile)\n", " writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n", " for producer in producers:\n", " with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n", " data = json.loads(sourcefile.read())\n", " for row in data[\"data\"]:\n", " _, id, _, name, rating, _, _, size, parts, year, _ = row\n", "\n", " writer.writerow([id, producer, name, size, parts, year])" ] }, { "cell_type": "code", "execution_count": 4, "id": "ab997198", "metadata": {}, "outputs": [], "source": [ "# uvp preise bestimmen :(\n", "def get_all_ids() -> list[str]:\n", " df = pd.read_csv(\"./data/merlin/others.csv\")\n", " return df[\"id\"].to_list()" ] }, { "cell_type": "code", "execution_count": 7, "id": "32b1fa46", "metadata": {}, "outputs": [], "source": [ "with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", " for idx, id in enumerate(get_all_ids()[3663:]):\n", " try:\n", " small_id = id.lower()\n", "\n", " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", " soup = bs4.BeautifulSoup(response.text)\n", "\n", " # Prices\n", " price_eur = soup.find(id=\"listprice_eur\")\n", " price_usd = soup.find(id=\"listprice_usd\")\n", " price_cn = soup.find(id=\"listprice_cn\")\n", " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", "\n", " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", " \n", " #categories\n", " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", " writer = csv.writer(pricefile)\n", " \n", " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", " writer.writerow([id, *all_prices, *other_dump])\n", " time.sleep(random.randint(2, 3))\n", " except Exception as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 27, "id": "4a10a1e3", "metadata": {}, "outputs": [], "source": [ "def split_by_keywords(text, keywords):\n", " pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n", " parts = re.split(pattern, text)\n", " \n", " result = {}\n", " for i in range(1, len(parts), 2):\n", " key = parts[i]\n", " value = parts[i + 1].strip()\n", " result[key] = value\n", " \n", " return result" ] }, { "cell_type": "code", "execution_count": null, "id": "9c00f188", "metadata": {}, "outputs": [], "source": [ "keywords = [\n", " \"Listenpreis:\",\n", " \"DetailsVon:\",\n", " \"EAN:\",\n", " \"Altersempfehlung:\",\n", " \"Steine von:\",\n", " \"Bestpreis:EU:\",\n", " \"Bewertungen\",\n", " \"Inhalt\",\n", " \"PreiseListenpreis:\",\n", " \"Hersteller-Kategorie:\",\n", " \"Designer:\",\n", " \"Maße:\",\n", " \"Release:\",\n", " \"Kategorien:\",\n", " \"Hersteller-Videos\",\n", " \"EU:\",\n", " \"Anleitung\",\n", " \"BewertungenCommunity:\",\n", " \"Maßstab:\",\n", " \"Erweiterung zu:\",\n", " \"Reviews\",\n", " \"Lizenz:\",\n", " \"Farbverteilung\",\n", " \"TeilelistenBrickLink\",\n", " \"Bild:\",\n", " \"ReviewsCommunity\",\n", " \"Gewicht\",\n", " \"Keine Aufkleber\",\n", " \"Verpackungsmaße:\",\n", " \"EU:Brickmo\",\n", " \"Datenbanken:\",\n", " \"Kategorie:\",\n", " \"Keine Drucke\",\n", " \"TechnikMOC:\",\n", " \"Steingröße:\",\n", " \"SonstigesMOC:\",\n", " \"Variationen:\",\n", " \"RebrickableVariation:\"\n", "]" ] }, { "cell_type": "code", "execution_count": 176, "id": "ae53869e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'Listenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)',\n", " 'DetailsVon:': 'BlueBrixx',\n", " 'EAN:': '4060904003671',\n", " 'Steine von:': 'Qunlong',\n", " 'Kategorie:': 'EisenbahnHersteller-',\n", " 'Kategorien:': 'BBSpecial, BRIX',\n", " 'Anleitung': 'Ohne Bauabschnitte',\n", " 'Bewertungen': 'Bewerten',\n", " 'Hersteller-Videos': 'video-1',\n", " 'Inhalt': '205 Teile',\n", " 'Gewicht': ': 190 g',\n", " 'Keine Aufkleber': '',\n", " 'Keine Drucke': '',\n", " 'Farbverteilung': '',\n", " 'TeilelistenBrickLink': 'XMLRebrickable CSVLEGO PaB CSVSetDB CSV',\n", " 'PreiseListenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)'}\n" ] } ], "source": [ "details = {\n", " \"id\" : [],\n", " \"listprice_eur\" : [],\n", " \"listprice_cn\" : [],\n", " \"listprice_usd\" : [],\n", " \"bestprice_eur\" : [],\n", " \"bestprice_cn\" : [],\n", " \"bestprice_usd\" : [],\n", " \"brand\" : [],\n", " \"ean\" : [],\n", " \"producer\" : [],\n", " \"release\" : [],\n", " \"scale\" : [],\n", " \"category\" : [],\n", " \"producer_category\" : [],\n", " \"num_parts\" : [],\n", " \"width\" : [],\n", " \"height\" : [],\n", " \"depth\" : [],\n", " \"designer\" : [],\n", " \"weight\" : [],\n", " \"age\" : []\n", "}\n", "import random\n", "\n", "me_details = pd.DataFrame(details)\n", "\n", "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n", " reader = csv.reader(price_file)\n", "\n", " # for row in reader:\n", " # id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n", " \n", " # me_details.loc[-1] = [id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd] + list(range(0, 12))\n", " # me_details.index = me_details.index + 1\n", " id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = [row for row in reader][random.randint(0, 4500)]\n", " other = filter(lambda s: not \"Wikipedia\" in s, other)\n", "\n", " pprint.pp(split_by_keywords(\"\".join(other), keywords))\n" ] }, { "cell_type": "code", "execution_count": 40, "id": "b83aa413", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "listprice_eur | \n", "listprice_cn | \n", "listprice_usd | \n", "bestprice_eur | \n", "bestprice_cn | \n", "bestprice_usd | \n", "brand | \n", "ean | \n", "producer | \n", "... | \n", "scale | \n", "category | \n", "producer_category | \n", "num_parts | \n", "width | \n", "height | \n", "depth | \n", "designer | \n", "weight | \n", "age | \n", "
|---|
0 rows × 21 columns
\n", "