merlinbricks util parsed but not done

pull/1/head
Roman Schöne 2026-04-26 20:05:41 +02:00
parent 3b4bfae39b
commit 77a486868b
6 changed files with 342 additions and 119 deletions

View File

@ -1,119 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "ad994162",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import json\n",
"import requests as rq\n",
"import bs4\n",
"import pandas as pd\n",
"import time\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5536e8c",
"metadata": {},
"outputs": [],
"source": [
"producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5daea73",
"metadata": {},
"outputs": [],
"source": [
"with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
" writer = csv.writer(producerfile)\n",
" writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
" for producer in producers:\n",
" with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
" data = json.loads(sourcefile.read())\n",
" for row in data[\"data\"]:\n",
" _, id, _, name, rating, _, _, size, parts, year, _ = row\n",
"\n",
" writer.writerow([id, producer, name, size, parts, year])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab997198",
"metadata": {},
"outputs": [],
"source": [
"# uvp preise bestimmen :(\n",
"def get_all_ids() -> list[str]:\n",
" df = pd.read_csv(\"./data/merlin/others.csv\")\n",
" return df[\"id\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32b1fa46",
"metadata": {},
"outputs": [],
"source": [
"with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
" for idx, id in enumerate(get_all_ids()[3663:]):\n",
" try:\n",
" small_id = id.lower()\n",
"\n",
" response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n",
" soup = bs4.BeautifulSoup(response.text)\n",
"\n",
" # Prices\n",
" price_eur = soup.find(id=\"listprice_eur\")\n",
" price_usd = soup.find(id=\"listprice_usd\")\n",
" price_cn = soup.find(id=\"listprice_cn\")\n",
" bestprice_eur = soup.find(id=\"bestprice_eur\")\n",
" bestprice_usd = soup.find(id=\"bestprice_usd\")\n",
" bestprice_cn = soup.find(id=\"bestprice_cn\")\n",
"\n",
" all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n",
" \n",
" #categories\n",
" other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n",
" writer = csv.writer(pricefile)\n",
" \n",
" all_prices = [p.text if p != None else \"_\" for p in all_prices]\n",
" writer.writerow([id, *all_prices, *other_dump])\n",
" time.sleep(random.randint(2, 3))\n",
" except Exception as e:\n",
" print(e)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -166,6 +166,14 @@
\subsection{Integrationsprozess} \subsection{Integrationsprozess}
Jedes von Lego veröffentlichte Teil besitzt eine eindeutige, auf seiner Form basierende Teilenummer.
%Verwandte Objekte erkennen (Schema Alignment)
%Gleiche Entitäten erkennen (Entity Resolution)
%Integrationsstrategien (linking strategy)
\subsection{Pipeline} \subsection{Pipeline}
\section{Evaluation} \section{Evaluation}

View File

@ -0,0 +1,334 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 44,
"id": "ad994162",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import json\n",
"import requests as rq\n",
"import bs4\n",
"import pandas as pd\n",
"import time\n",
"import random\n",
"import re\n",
"import pprint"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5536e8c",
"metadata": {},
"outputs": [],
"source": [
"producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5daea73",
"metadata": {},
"outputs": [],
"source": [
"with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
" writer = csv.writer(producerfile)\n",
" writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
" for producer in producers:\n",
" with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
" data = json.loads(sourcefile.read())\n",
" for row in data[\"data\"]:\n",
" _, id, _, name, rating, _, _, size, parts, year, _ = row\n",
"\n",
" writer.writerow([id, producer, name, size, parts, year])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab997198",
"metadata": {},
"outputs": [],
"source": [
"# uvp preise bestimmen :(\n",
"def get_all_ids() -> list[str]:\n",
" df = pd.read_csv(\"./data/merlin/others.csv\")\n",
" return df[\"id\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32b1fa46",
"metadata": {},
"outputs": [],
"source": [
"with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
" for idx, id in enumerate(get_all_ids()[3663:]):\n",
" try:\n",
" small_id = id.lower()\n",
"\n",
" response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n",
" soup = bs4.BeautifulSoup(response.text)\n",
"\n",
" # Prices\n",
" price_eur = soup.find(id=\"listprice_eur\")\n",
" price_usd = soup.find(id=\"listprice_usd\")\n",
" price_cn = soup.find(id=\"listprice_cn\")\n",
" bestprice_eur = soup.find(id=\"bestprice_eur\")\n",
" bestprice_usd = soup.find(id=\"bestprice_usd\")\n",
" bestprice_cn = soup.find(id=\"bestprice_cn\")\n",
"\n",
" all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n",
" \n",
" #categories\n",
" other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n",
" writer = csv.writer(pricefile)\n",
" \n",
" all_prices = [p.text if p != None else \"_\" for p in all_prices]\n",
" writer.writerow([id, *all_prices, *other_dump])\n",
" time.sleep(random.randint(2, 3))\n",
" except Exception as e:\n",
" print(e)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "4a10a1e3",
"metadata": {},
"outputs": [],
"source": [
"def split_by_keywords(text, keywords):\n",
" pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n",
" parts = re.split(pattern, text)\n",
" \n",
" result = {}\n",
" for i in range(1, len(parts), 2):\n",
" key = parts[i]\n",
" value = parts[i + 1].strip()\n",
" result[key] = value\n",
" \n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c00f188",
"metadata": {},
"outputs": [],
"source": [
"keywords = [\n",
" \"Listenpreis:\",\n",
" \"DetailsVon:\",\n",
" \"EAN:\",\n",
" \"Altersempfehlung:\",\n",
" \"Steine von:\",\n",
" \"Bestpreis:EU:\",\n",
" \"Bewertungen\",\n",
" \"Inhalt\",\n",
" \"PreiseListenpreis:\",\n",
" \"Hersteller-Kategorie:\",\n",
" \"Designer:\",\n",
" \"Maße:\",\n",
" \"Release:\",\n",
" \"Kategorien:\",\n",
" \"Hersteller-Videos\",\n",
" \"EU:\",\n",
" \"Anleitung\",\n",
" \"BewertungenCommunity:\",\n",
" \"Maßstab:\",\n",
" \"Erweiterung zu:\",\n",
" \"Reviews\",\n",
" \"Lizenz:\",\n",
" \"Farbverteilung\",\n",
" \"TeilelistenBrickLink\",\n",
" \"Bild:\",\n",
" \"ReviewsCommunity\",\n",
" \"Gewicht\",\n",
" \"Keine Aufkleber\",\n",
" \"Verpackungsmaße:\",\n",
" \"EU:Brickmo\",\n",
" \"Datenbanken:\",\n",
" \"Kategorie:\",\n",
" \"Keine Drucke\",\n",
" \"TechnikMOC:\",\n",
" \"Steingröße:\",\n",
" \"SonstigesMOC:\",\n",
" \"Variationen:\",\n",
" \"RebrickableVariation:\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 176,
"id": "ae53869e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Listenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)',\n",
" 'DetailsVon:': 'BlueBrixx',\n",
" 'EAN:': '4060904003671',\n",
" 'Steine von:': 'Qunlong',\n",
" 'Kategorie:': 'EisenbahnHersteller-',\n",
" 'Kategorien:': 'BBSpecial, BRIX',\n",
" 'Anleitung': 'Ohne Bauabschnitte',\n",
" 'Bewertungen': 'Bewerten',\n",
" 'Hersteller-Videos': 'video-1',\n",
" 'Inhalt': '205 Teile',\n",
" 'Gewicht': ': 190 g',\n",
" 'Keine Aufkleber': '',\n",
" 'Keine Drucke': '',\n",
" 'Farbverteilung': '',\n",
" 'TeilelistenBrickLink': 'XMLRebrickable CSVLEGO PaB CSVSetDB CSV',\n",
" 'PreiseListenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)'}\n"
]
}
],
"source": [
"details = {\n",
" \"id\" : [],\n",
" \"listprice_eur\" : [],\n",
" \"listprice_cn\" : [],\n",
" \"listprice_usd\" : [],\n",
" \"bestprice_eur\" : [],\n",
" \"bestprice_cn\" : [],\n",
" \"bestprice_usd\" : [],\n",
" \"brand\" : [],\n",
" \"ean\" : [],\n",
" \"producer\" : [],\n",
" \"release\" : [],\n",
" \"scale\" : [],\n",
" \"category\" : [],\n",
" \"producer_category\" : [],\n",
" \"num_parts\" : [],\n",
" \"width\" : [],\n",
" \"height\" : [],\n",
" \"depth\" : [],\n",
" \"designer\" : [],\n",
" \"weight\" : [],\n",
" \"age\" : []\n",
"}\n",
"import random\n",
"\n",
"me_details = pd.DataFrame(details)\n",
"\n",
"with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n",
" reader = csv.reader(price_file)\n",
"\n",
" # for row in reader:\n",
" # id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n",
" \n",
" # me_details.loc[-1] = [id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd] + list(range(0, 12))\n",
" # me_details.index = me_details.index + 1\n",
" id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = [row for row in reader][random.randint(0, 4500)]\n",
" other = filter(lambda s: not \"Wikipedia\" in s, other)\n",
"\n",
" pprint.pp(split_by_keywords(\"\".join(other), keywords))\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "b83aa413",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>listprice_eur</th>\n",
" <th>listprice_cn</th>\n",
" <th>listprice_usd</th>\n",
" <th>bestprice_eur</th>\n",
" <th>bestprice_cn</th>\n",
" <th>bestprice_usd</th>\n",
" <th>brand</th>\n",
" <th>ean</th>\n",
" <th>producer</th>\n",
" <th>...</th>\n",
" <th>scale</th>\n",
" <th>category</th>\n",
" <th>producer_category</th>\n",
" <th>num_parts</th>\n",
" <th>width</th>\n",
" <th>height</th>\n",
" <th>depth</th>\n",
" <th>designer</th>\n",
" <th>weight</th>\n",
" <th>age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [id, listprice_eur, listprice_cn, listprice_usd, bestprice_eur, bestprice_cn, bestprice_usd, brand, ean, producer, release, scale, category, producer_category, num_parts, width, height, depth, designer, weight, age]\n",
"Index: []\n",
"\n",
"[0 rows x 21 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"me_details"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv (3.14.4)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}