346 lines
12 KiB
Plaintext
346 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "ad994162",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import csv\n",
|
|
"import json\n",
|
|
"import requests as rq\n",
|
|
"import bs4\n",
|
|
"import pandas as pd\n",
|
|
"import time\n",
|
|
"import random\n",
|
|
"import re"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "b5536e8c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "6d109e8a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"id_to_name = dict()\n",
|
|
"for producer in producers:\n",
|
|
" with open(f\"../data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as prodfile:\n",
|
|
" listings = json.load(prodfile)[\"data\"]\n",
|
|
"\n",
|
|
" for listing in listings:\n",
|
|
" name = listing[3] \n",
|
|
" id = listing[1]\n",
|
|
"\n",
|
|
" id_to_name[id] = name"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "ab997198",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# uvp preise bestimmen :(\n",
|
|
"def get_all_ids() -> list[str]:\n",
|
|
" df = pd.read_csv(\"../data/merlin/others.csv\")\n",
|
|
" return df[\"id\"].to_list()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "32b1fa46",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"' with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\\n for idx, id in enumerate(get_all_ids()[3663:]):\\n try:\\n small_id = id.lower()\\n\\n response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\\n soup = bs4.BeautifulSoup(response.text)\\n\\n # Prices\\n price_eur = soup.find(id=\"listprice_eur\")\\n price_usd = soup.find(id=\"listprice_usd\")\\n price_cn = soup.find(id=\"listprice_cn\")\\n bestprice_eur = soup.find(id=\"bestprice_eur\")\\n bestprice_usd = soup.find(id=\"bestprice_usd\")\\n bestprice_cn = soup.find(id=\"bestprice_cn\")\\n\\n all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\\n\\n #categories\\n other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\\n writer = csv.writer(pricefile)\\n\\n all_prices = [p.text if p != None else \"_\" for p in all_prices]\\n writer.writerow([id, *all_prices, *other_dump])\\n time.sleep(random.randint(2, 3))\\n except Exception as e:\\n print(e) '"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\"\"\" with open(\"../data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
|
|
" for idx, id in enumerate(get_all_ids()[3663:]):\n",
|
|
" try:\n",
|
|
" small_id = id.lower()\n",
|
|
"\n",
|
|
" response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n",
|
|
" soup = bs4.BeautifulSoup(response.text)\n",
|
|
"\n",
|
|
" # Prices\n",
|
|
" price_eur = soup.find(id=\"listprice_eur\")\n",
|
|
" price_usd = soup.find(id=\"listprice_usd\")\n",
|
|
" price_cn = soup.find(id=\"listprice_cn\")\n",
|
|
" bestprice_eur = soup.find(id=\"bestprice_eur\")\n",
|
|
" bestprice_usd = soup.find(id=\"bestprice_usd\")\n",
|
|
" bestprice_cn = soup.find(id=\"bestprice_cn\")\n",
|
|
"\n",
|
|
" all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n",
|
|
" \n",
|
|
" #categories\n",
|
|
" other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n",
|
|
" writer = csv.writer(pricefile)\n",
|
|
" \n",
|
|
" all_prices = [p.text if p != None else \"_\" for p in all_prices]\n",
|
|
" writer.writerow([id, *all_prices, *other_dump])\n",
|
|
" time.sleep(random.randint(2, 3))\n",
|
|
" except Exception as e:\n",
|
|
" print(e) \"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "4a10a1e3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def split_by_keywords(text, keywords):\n",
|
|
" pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n",
|
|
" parts = re.split(pattern, text)\n",
|
|
" \n",
|
|
" result = {}\n",
|
|
" for i in range(1, len(parts), 2):\n",
|
|
" key = parts[i]\n",
|
|
" value = parts[i + 1].strip()\n",
|
|
" result[key] = value\n",
|
|
" \n",
|
|
" return result"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "9c00f188",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"keywords = [\n",
|
|
" \"Listenpreis:\",\n",
|
|
" \"DetailsVon:\",\n",
|
|
" \"EAN:\",\n",
|
|
" \"Altersempfehlung:\",\n",
|
|
" \"Steine von:\",\n",
|
|
" \"Bestpreis:EU:\",\n",
|
|
" \"Bewertungen\",\n",
|
|
" \"Inhalt\",\n",
|
|
" \"PreiseListenpreis:\",\n",
|
|
" \"Hersteller-Kategorie:\",\n",
|
|
" \"Designer:\",\n",
|
|
" \"Maße:\",\n",
|
|
" \"Release:\",\n",
|
|
" \"Kategorien:\",\n",
|
|
" \"Hersteller-Videos\",\n",
|
|
" \"EU:\",\n",
|
|
" \"Anleitung\",\n",
|
|
" \"BewertungenCommunity:\",\n",
|
|
" \"Maßstab:\",\n",
|
|
" \"Erweiterung zu:\",\n",
|
|
" \"Reviews\",\n",
|
|
" \"Lizenz:\",\n",
|
|
" \"Farbverteilung\",\n",
|
|
" \"TeilelistenBrickLink\",\n",
|
|
" \"Bild:\",\n",
|
|
" \"ReviewsCommunity\",\n",
|
|
" \"Gewicht\",\n",
|
|
" \"Keine Aufkleber\",\n",
|
|
" \"Verpackungsmaße:\",\n",
|
|
" \"EU:Brickmo\",\n",
|
|
" \"Datenbanken:\",\n",
|
|
" \"Kategorie:\",\n",
|
|
" \"Keine Drucke\",\n",
|
|
" \"TechnikMOC:\",\n",
|
|
" \"Steingröße:\",\n",
|
|
" \"SonstigesMOC:\",\n",
|
|
" \"Variationen:\",\n",
|
|
" \"RebrickableVariation:\"\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "9b44a0e5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def rm_epsilon(l : list[str]) ->list[str]:\n",
|
|
" return list(filter(lambda s : len(s) > 0, l))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"id": "ae53869e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"me_details = pd.DataFrame({\n",
|
|
" \"id\" : [],\n",
|
|
" \"name\" : [],\n",
|
|
" \"price_eur\" : [],\n",
|
|
" \"price_cn\" : [],\n",
|
|
" \"price_us\" : [],\n",
|
|
" \"brand\" : [],\n",
|
|
" \"ean\" : [],\n",
|
|
" \"producer\" : [],\n",
|
|
" \"release\" : [],\n",
|
|
" \"category\" : [],\n",
|
|
" \"producer_category\" : [],\n",
|
|
" \"num_parts\" : [],\n",
|
|
" })\n",
|
|
"\n",
|
|
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n",
|
|
" reader = csv.reader(price_file)\n",
|
|
"\n",
|
|
" for row in reader:\n",
|
|
" id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n",
|
|
" other = filter(lambda s: not \"Wikipedia\" in s, other)\n",
|
|
"\n",
|
|
" retrieved = split_by_keywords(\"\".join(other), keywords)\n",
|
|
"\n",
|
|
" brand = retrieved.get(\"DetailsVon:\", \"\").replace(\" \", \"\")\n",
|
|
" ean = retrieved.get(\"EAN:\", \"\")\n",
|
|
" producer = retrieved.get(\"Steine von:\", \"\")\n",
|
|
" age = retrieved.get(\"Altersempfehlung:\", \"\")\n",
|
|
" release = retrieved.get(\"Release:\", \"\").split(\" \")[-1]\n",
|
|
" num_parts = retrieved.get(\"Inhalt\", \"\").split(\"Teile\")[0].replace(\"Ein Teil\", \"1\").replace(\"Preise\", \"\").replace(\"Mit Fernsteuerung / Elektrik\", \"1\").replace(\"Eine Minifigur\", \"1\").replace(\"Minifiguren\", \"\").strip()\n",
|
|
"\n",
|
|
" category = retrieved.get(\"Kategorie:\", \"\").strip().split(\",\")\n",
|
|
" categories = \",\".join(rm_epsilon(retrieved.get(\"Kategorien:\", \"\") .split(\",\") + category)).replace(\"Hersteller\", \"\")\n",
|
|
" producer_category = retrieved.get(\"Hersteller-Kategorie:\", \"\").split(\",\")\n",
|
|
" producer_categories = \",\".join(rm_epsilon(retrieved.get(\"Hersteller-Kategorien:\", \"\").split(\",\") + producer_category))\n",
|
|
"\n",
|
|
" if brand == \"\":\n",
|
|
" continue\n",
|
|
" me_extra = pd.DataFrame({\n",
|
|
" \"id\" : [id],\n",
|
|
" \"name\" : [id_to_name.get(id, \"\")],\n",
|
|
" \"price_eur\" : [lp_eur.replace(\"_\", \"\")],\n",
|
|
" \"price_us\" : [lp_usd.replace(\"_\", \"\")],\n",
|
|
" \"price_cn\" : [lp_cn.replace(\"_\", \"\")],\n",
|
|
" \"brand\" : [brand],\n",
|
|
" \"ean\" : [ean],\n",
|
|
" \"producer\" : [producer],\n",
|
|
" \"release\" : [release],\n",
|
|
" \"category\" : [categories],\n",
|
|
" \"producer_category\" : [producer_categories],\n",
|
|
" \"num_parts\" : [num_parts],\n",
|
|
" })\n",
|
|
"\n",
|
|
" me_details = pd.concat([me_details, me_extra])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"id": "1b5bcea6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>name</th>\n",
|
|
" <th>price_eur</th>\n",
|
|
" <th>price_cn</th>\n",
|
|
" <th>price_us</th>\n",
|
|
" <th>brand</th>\n",
|
|
" <th>ean</th>\n",
|
|
" <th>producer</th>\n",
|
|
" <th>release</th>\n",
|
|
" <th>category</th>\n",
|
|
" <th>producer_category</th>\n",
|
|
" <th>num_parts</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"Empty DataFrame\n",
|
|
"Columns: [id, name, price_eur, price_cn, price_us, brand, ean, producer, release, category, producer_category, num_parts]\n",
|
|
"Index: []"
|
|
]
|
|
},
|
|
"execution_count": 57,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"me_details[\"Mit Fernsteuerung / Elektrik\" == me_details[\"num_parts\"]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"id": "0fb65dec",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"me_details.to_csv(\"../data/merlin/others.csv\", index=False)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "venv (3.14.4)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|