merlinbricks util parsed but not done

2026-04-26 20:05:41 +02:00 · 2026-04-26 20:05:41 +02:00 · 77a486868b
parent 3b4bfae39b
commit 77a486868b
6 changed files with 342 additions and 119 deletions
--- a/lego/lego_util_merlin.ipynb
+++ b/lego/lego_util_merlin.ipynb
@ -1,119 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ad994162",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import json\n",
    "import requests as rq\n",
    "import bs4\n",
    "import pandas as pd\n",
    "import time\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5536e8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Producers whose data/merlin/<producer>.json export is merged into others.csv.\n",
    "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5daea73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-producer JSON exports into one CSV (others.csv).\n",
    "with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
    "    out = csv.writer(producerfile)\n",
    "    out.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
    "    for producer in producers:\n",
    "        with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
    "            entries = json.load(sourcefile)[\"data\"]\n",
    "            # Strict 11-field unpacking: an entry of any other length raises ValueError.\n",
    "            for _, set_id, _, name, _rating, _, _, size, parts, year, _ in entries:\n",
    "                out.writerow([set_id, producer, name, size, parts, year])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ab997198",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Determine the list prices (UVP) :(\n",
    "def get_all_ids(path: str = \"./data/merlin/others.csv\") -> list[str]:\n",
    "    \"\"\"Return the set IDs listed in the merged producer CSV.\n",
    "\n",
    "    Args:\n",
    "        path: CSV file with an ``id`` column.\n",
    "\n",
    "    Returns:\n",
    "        The ``id`` column in file order. IDs are read as strings so that\n",
    "        purely numeric IDs are not turned into ints (which would also drop\n",
    "        leading zeros).\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(path, usecols=[\"id\"], dtype={\"id\": str})\n",
    "    return df[\"id\"].to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "32b1fa46",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape list and best prices for every set from merlinssteine.de and append\n",
    "# one row per set to prices.csv.\n",
    "PRICES_CSV = \"./data/merlin/prices.csv\"\n",
    "# NOTE(review): presumably the resume point of an interrupted earlier run\n",
    "# (the file is opened in append mode) - confirm before re-running from scratch.\n",
    "RESUME_FROM = 3663\n",
    "REQUEST_TIMEOUT_S = 30\n",
    "# Element ids of the price fields, in the column order written to the CSV.\n",
    "PRICE_ELEMENT_IDS = [\n",
    "    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
    "    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
    "]\n",
    "\n",
    "with open(PRICES_CSV, mode=\"a\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
    "    writer = csv.writer(pricefile)\n",
    "    for set_id in get_all_ids()[RESUME_FROM:]:\n",
    "        try:\n",
    "            response = rq.get(\n",
    "                f\"https://www.merlinssteine.de/sets/{str(set_id).lower()}\",\n",
    "                timeout=REQUEST_TIMEOUT_S,\n",
    "            )\n",
    "            # Do not record error pages (404, 429, ...) as rows of missing prices.\n",
    "            response.raise_for_status()\n",
    "            soup = bs4.BeautifulSoup(response.text, \"html.parser\")\n",
    "\n",
    "            # Prices; \"_\" marks a price element that is absent from the page.\n",
    "            prices = []\n",
    "            for element_id in PRICE_ELEMENT_IDS:\n",
    "                element = soup.find(id=element_id)\n",
    "                prices.append(element.text if element is not None else \"_\")\n",
    "\n",
    "            # Free-text detail blocks (categories etc.), newlines removed.\n",
    "            other_dump = [\n",
    "                description.text.replace(\"\\n\", \"\")\n",
    "                for description in soup.find_all(class_=\"setpage_ct\")\n",
    "            ]\n",
    "            writer.writerow([set_id, *prices, *other_dump])\n",
    "        except Exception as e:\n",
    "            # Best effort: report which set failed and continue with the next one.\n",
    "            print(f\"{set_id}: {e!r}\")\n",
    "        finally:\n",
    "            # Throttle after failures too, so errors do not hammer the server.\n",
    "            time.sleep(random.randint(2, 3))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv (3.14.4)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/lego/paper/KGR_paper1_lego.tex
+++ b/lego/paper/KGR_paper1_lego.tex
@ -166,6 +166,14 @@
 		\subsection{Integrationsprozess}
 		Jedes von Lego veröffentlichte Teil besitzt eine eindeutige, auf seiner Form basierende Teile-Nummer.
 		%Verwandte Objekte erkennen (Schema Alignment)
 		%Gleiche Entitäten erkennen (Entity Resolution)
 		%Integrationsstrategien (linking strategy)
 		\subsection{Pipeline}
 		\section{Evaluation}
--- a/lego/util/lego_util_bricklink.ipynb
+++ b/lego/util/lego_util_bricklink.ipynb
--- a/lego/util/lego_util_brickset.ipynb
+++ b/lego/util/lego_util_brickset.ipynb
--- a/lego/util/lego_util_merlin.ipynb
+++ b/lego/util/lego_util_merlin.ipynb
@ -0,0 +1,334 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "ad994162",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import json\n",
    "import requests as rq\n",
    "import bs4\n",
    "import pandas as pd\n",
    "import time\n",
    "import random\n",
    "import re\n",
    "import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5536e8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Producers whose data/merlin/<producer>.json export is merged into others.csv.\n",
    "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5daea73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the per-producer Merlin exports into one flat CSV (others.csv).\n",
    "# This notebook lives in lego/util/, so the data directory is one level up\n",
    "# (the same location the prices.csv reader further down uses).\n",
    "MERLIN_DIR = \"../data/merlin\"\n",
    "\n",
    "with open(f\"{MERLIN_DIR}/others.csv\", mode=\"w\", encoding=\"utf8\", newline=\"\") as producerfile:\n",
    "    writer = csv.writer(producerfile)\n",
    "    writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n",
    "    for producer in producers:\n",
    "        with open(f\"{MERLIN_DIR}/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n",
    "            data = json.load(sourcefile)\n",
    "        # Every entry must have exactly 11 fields; the unpacking raises\n",
    "        # ValueError if the export layout changes instead of writing bad rows.\n",
    "        for row in data[\"data\"]:\n",
    "            _, set_id, _, name, _, _, _, size, parts, year, _ = row\n",
    "            writer.writerow([set_id, producer, name, size, parts, year])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ab997198",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Determine the list prices (UVP) :(\n",
    "def get_all_ids(path: str = \"../data/merlin/others.csv\") -> list[str]:\n",
    "    \"\"\"Return the set IDs listed in the merged producer CSV.\n",
    "\n",
    "    Args:\n",
    "        path: CSV file with an ``id`` column. The default is relative to\n",
    "            lego/util/, where this notebook lives.\n",
    "\n",
    "    Returns:\n",
    "        The ``id`` column in file order. IDs are read as strings so that\n",
    "        purely numeric IDs are not turned into ints (which would also drop\n",
    "        leading zeros).\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(path, usecols=[\"id\"], dtype={\"id\": str})\n",
    "    return df[\"id\"].to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "32b1fa46",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape list and best prices for every set from merlinssteine.de and append\n",
    "# one row per set to prices.csv.\n",
    "PRICES_CSV = \"../data/merlin/prices.csv\"  # relative to lego/util/\n",
    "# NOTE(review): presumably the resume point of an interrupted earlier run\n",
    "# (the file is opened in append mode) - confirm before re-running from scratch.\n",
    "RESUME_FROM = 3663\n",
    "REQUEST_TIMEOUT_S = 30\n",
    "# Element ids of the price fields, in the column order written to the CSV.\n",
    "PRICE_ELEMENT_IDS = [\n",
    "    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
    "    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
    "]\n",
    "\n",
    "with open(PRICES_CSV, mode=\"a\", encoding=\"utf8\", newline=\"\") as pricefile:\n",
    "    writer = csv.writer(pricefile)\n",
    "    for set_id in get_all_ids()[RESUME_FROM:]:\n",
    "        try:\n",
    "            response = rq.get(\n",
    "                f\"https://www.merlinssteine.de/sets/{str(set_id).lower()}\",\n",
    "                timeout=REQUEST_TIMEOUT_S,\n",
    "            )\n",
    "            # Do not record error pages (404, 429, ...) as rows of missing prices.\n",
    "            response.raise_for_status()\n",
    "            soup = bs4.BeautifulSoup(response.text, \"html.parser\")\n",
    "\n",
    "            # Prices; \"_\" marks a price element that is absent from the page.\n",
    "            prices = []\n",
    "            for element_id in PRICE_ELEMENT_IDS:\n",
    "                element = soup.find(id=element_id)\n",
    "                prices.append(element.text if element is not None else \"_\")\n",
    "\n",
    "            # Free-text detail blocks (categories etc.), newlines removed.\n",
    "            other_dump = [\n",
    "                description.text.replace(\"\\n\", \"\")\n",
    "                for description in soup.find_all(class_=\"setpage_ct\")\n",
    "            ]\n",
    "            writer.writerow([set_id, *prices, *other_dump])\n",
    "        except Exception as e:\n",
    "            # Best effort: report which set failed and continue with the next one.\n",
    "            print(f\"{set_id}: {e!r}\")\n",
    "        finally:\n",
    "            # Throttle after failures too, so errors do not hammer the server.\n",
    "            time.sleep(random.randint(2, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "4a10a1e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_by_keywords(text, keywords):\n",
    "    \"\"\"Split ``text`` into a ``{keyword: following text}`` mapping.\n",
    "\n",
    "    Every occurrence of a keyword starts a new field whose value is the\n",
    "    stripped text up to the next keyword or the end of ``text``.\n",
    "\n",
    "    Args:\n",
    "        text: String in which labels and values are concatenated.\n",
    "        keywords: Literal labels to split on (escaped, not treated as regex).\n",
    "\n",
    "    Returns:\n",
    "        Dict keyed by the keywords found, in order of first appearance.\n",
    "        Text before the first keyword is dropped; if a keyword occurs more\n",
    "        than once, its last value wins. Empty dict if there are no keywords.\n",
    "    \"\"\"\n",
    "    # Regex alternation takes the first alternative that matches, not the\n",
    "    # longest, so longer keywords go first; otherwise 'Reviews' would always\n",
    "    # shadow 'ReviewsCommunity'. Empty keywords are dropped.\n",
    "    by_length = sorted((kw for kw in keywords if kw), key=len, reverse=True)\n",
    "    if not by_length:\n",
    "        # An empty alternation would match the empty string at every position.\n",
    "        return {}\n",
    "\n",
    "    pattern = '(' + '|'.join(map(re.escape, by_length)) + ')'\n",
    "    # With one capturing group, re.split yields [prefix, key, value, key, value, ...].\n",
    "    parts = re.split(pattern, text)\n",
    "\n",
    "    return {key: value.strip() for key, value in zip(parts[1::2], parts[2::2])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c00f188",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels that introduce a field in the scraped set-page text; passed to\n",
    "# split_by_keywords. Where one label is a prefix of another, the longer one\n",
    "# is listed first, because a regex alternation built in list order takes the\n",
    "# first alternative that matches.\n",
    "keywords = [\n",
    "    \"Listenpreis:\",\n",
    "    \"DetailsVon:\",\n",
    "    \"EAN:\",\n",
    "    \"Altersempfehlung:\",\n",
    "    \"Steine von:\",\n",
    "    \"Bestpreis:EU:\",\n",
    "    \"BewertungenCommunity:\",\n",
    "    \"Bewertungen\",\n",
    "    \"Inhalt\",\n",
    "    \"PreiseListenpreis:\",\n",
    "    # The pages use the plural form; without it 'Kategorie:' values end in 'Hersteller-'.\n",
    "    \"Hersteller-Kategorien:\",\n",
    "    \"Hersteller-Kategorie:\",\n",
    "    \"Designer:\",\n",
    "    \"Maße:\",\n",
    "    \"Release:\",\n",
    "    \"Kategorien:\",\n",
    "    \"Hersteller-Videos\",\n",
    "    \"EU:Brickmo\",\n",
    "    \"EU:\",\n",
    "    \"Anleitung\",\n",
    "    \"Maßstab:\",\n",
    "    \"Erweiterung zu:\",\n",
    "    \"ReviewsCommunity\",\n",
    "    \"Reviews\",\n",
    "    \"Lizenz:\",\n",
    "    \"Farbverteilung\",\n",
    "    \"TeilelistenBrickLink\",\n",
    "    \"Bild:\",\n",
    "    # With the colon the value is '190 g' instead of ': 190 g'.\n",
    "    \"Gewicht:\",\n",
    "    \"Gewicht\",\n",
    "    \"Keine Aufkleber\",\n",
    "    \"Verpackungsmaße:\",\n",
    "    \"Datenbanken:\",\n",
    "    \"Kategorie:\",\n",
    "    \"Keine Drucke\",\n",
    "    \"TechnikMOC:\",\n",
    "    \"Steingröße:\",\n",
    "    \"SonstigesMOC:\",\n",
    "    \"Variationen:\",\n",
    "    \"RebrickableVariation:\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "ae53869e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Listenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)',\n",
      " 'DetailsVon:': 'BlueBrixx',\n",
      " 'EAN:': '4060904003671',\n",
      " 'Steine von:': 'Qunlong',\n",
      " 'Kategorie:': 'EisenbahnHersteller-',\n",
      " 'Kategorien:': 'BBSpecial, BRIX',\n",
      " 'Anleitung': 'Ohne Bauabschnitte',\n",
      " 'Bewertungen': 'Bewerten',\n",
      " 'Hersteller-Videos': 'video-1',\n",
      " 'Inhalt': '205 Teile',\n",
      " 'Gewicht': ': 190 g',\n",
      " 'Keine Aufkleber': '',\n",
      " 'Keine Drucke': '',\n",
      " 'Farbverteilung': '',\n",
      " 'TeilelistenBrickLink': 'XMLRebrickable CSVLEGO PaB CSVSetDB CSV',\n",
      " 'PreiseListenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)'}\n"
     ]
    }
   ],
   "source": [
    "# Column layout of the details table that the scraped rows are parsed into.\n",
    "DETAIL_COLUMNS = [\n",
    "    \"id\",\n",
    "    \"listprice_eur\", \"listprice_cn\", \"listprice_usd\",\n",
    "    \"bestprice_eur\", \"bestprice_cn\", \"bestprice_usd\",\n",
    "    \"brand\", \"ean\", \"producer\", \"release\", \"scale\",\n",
    "    \"category\", \"producer_category\", \"num_parts\",\n",
    "    \"width\", \"height\", \"depth\",\n",
    "    \"designer\", \"weight\", \"age\",\n",
    "]\n",
    "details = {column: [] for column in DETAIL_COLUMNS}\n",
    "\n",
    "# TODO: fill me_details from every row of prices.csv. It stays empty for now;\n",
    "# the code below only spot-checks the keyword parsing on one random row.\n",
    "me_details = pd.DataFrame(details)\n",
    "\n",
    "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\", newline=\"\") as price_file:\n",
    "    # Skip blank or truncated lines that lack the id and the six price fields.\n",
    "    price_rows = [row for row in csv.reader(price_file) if len(row) >= 7]\n",
    "\n",
    "# random.choice covers all rows and cannot index past the end of the file,\n",
    "# unlike a hard-coded index range.\n",
    "set_id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = random.choice(price_rows)\n",
    "# Drop the fields that mention Wikipedia before joining the rest for parsing.\n",
    "other = [field for field in other if \"Wikipedia\" not in field]\n",
    "\n",
    "pprint.pp(split_by_keywords(\"\".join(other), keywords))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "b83aa413",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>listprice_eur</th>\n",
       "      <th>listprice_cn</th>\n",
       "      <th>listprice_usd</th>\n",
       "      <th>bestprice_eur</th>\n",
       "      <th>bestprice_cn</th>\n",
       "      <th>bestprice_usd</th>\n",
       "      <th>brand</th>\n",
       "      <th>ean</th>\n",
       "      <th>producer</th>\n",
       "      <th>...</th>\n",
       "      <th>scale</th>\n",
       "      <th>category</th>\n",
       "      <th>producer_category</th>\n",
       "      <th>num_parts</th>\n",
       "      <th>width</th>\n",
       "      <th>height</th>\n",
       "      <th>depth</th>\n",
       "      <th>designer</th>\n",
       "      <th>weight</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>0 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [id, listprice_eur, listprice_cn, listprice_usd, bestprice_eur, bestprice_cn, bestprice_usd, brand, ean, producer, release, scale, category, producer_category, num_parts, width, height, depth, designer, weight, age]\n",
       "Index: []\n",
       "\n",
       "[0 rows x 21 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "me_details"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv (3.14.4)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/lego/util/lego_util_rebrickable.ipynb
+++ b/lego/util/lego_util_rebrickable.ipynb