diff --git a/lego/lego_util_merlin.ipynb b/lego/lego_util_merlin.ipynb deleted file mode 100644 index fd4fd9a..0000000 --- a/lego/lego_util_merlin.ipynb +++ /dev/null @@ -1,119 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "ad994162", - "metadata": {}, - "outputs": [], - "source": [ - "import csv\n", - "import json\n", - "import requests as rq\n", - "import bs4\n", - "import pandas as pd\n", - "import time\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5536e8c", - "metadata": {}, - "outputs": [], - "source": [ - "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5daea73", - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n", - " writer = csv.writer(producerfile)\n", - " writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n", - " for producer in producers:\n", - " with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n", - " data = json.loads(sourcefile.read())\n", - " for row in data[\"data\"]:\n", - " _, id, _, name, rating, _, _, size, parts, year, _ = row\n", - "\n", - " writer.writerow([id, producer, name, size, parts, year])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "ab997198", - "metadata": {}, - "outputs": [], - "source": [ - "# uvp preise bestimmen :(\n", - "def get_all_ids() -> list[str]:\n", - " df = pd.read_csv(\"./data/merlin/others.csv\")\n", - " return df[\"id\"].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "32b1fa46", - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", - " for idx, id in enumerate(get_all_ids()[3663:]):\n", - " try:\n", - " small_id = id.lower()\n", - "\n", - " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", - " soup = bs4.BeautifulSoup(response.text)\n", - "\n", - " # Prices\n", - " price_eur = soup.find(id=\"listprice_eur\")\n", - " price_usd = soup.find(id=\"listprice_usd\")\n", - " price_cn = soup.find(id=\"listprice_cn\")\n", - " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", - " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", - " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", - "\n", - " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", - " \n", - " #categories\n", - " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", - " writer = csv.writer(pricefile)\n", - " \n", - " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", - " writer.writerow([id, *all_prices, *other_dump])\n", - " time.sleep(random.randint(2, 3))\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv (3.14.4)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.14.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lego/paper/KGR_paper1_lego.tex b/lego/paper/KGR_paper1_lego.tex index 3e8a62b..aec0d33 100644 --- a/lego/paper/KGR_paper1_lego.tex +++ b/lego/paper/KGR_paper1_lego.tex @@ -166,6 +166,14 @@ \subsection{Integrationsprozess} + Jedes von Lego veröffentlichte Teil besitzt der Form zugrunde eine eindeutige Teile-Nummer. + + %Verwandte Objekte erkennen (Schema Alignment) + + %Gleiche Entitäten erkennen (Entity Resolution) + + %Integrationssstrategien (linking strategy) + \subsection{Pipeline} \section{Evaluation} diff --git a/lego/lego_util_bricklink.ipynb b/lego/util/lego_util_bricklink.ipynb similarity index 100% rename from lego/lego_util_bricklink.ipynb rename to lego/util/lego_util_bricklink.ipynb diff --git a/lego/lego_util_brickset.ipynb b/lego/util/lego_util_brickset.ipynb similarity index 100% rename from lego/lego_util_brickset.ipynb rename to lego/util/lego_util_brickset.ipynb diff --git a/lego/util/lego_util_merlin.ipynb b/lego/util/lego_util_merlin.ipynb new file mode 100644 index 0000000..2beb13f --- /dev/null +++ b/lego/util/lego_util_merlin.ipynb @@ -0,0 +1,334 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 44, + "id": "ad994162", + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import json\n", + "import requests as rq\n", + "import bs4\n", + "import pandas as pd\n", + "import time\n", + "import random\n", + "import re\n", + "import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5536e8c", + "metadata": {}, + "outputs": [], + "source": [ + "producers = [\"bluebrixx\", \"cada\", \"cobi\", \"mouldking\", \"pantasy\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5daea73", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"./data/merlin/others.csv\", mode=\"w+\", encoding=\"utf8\", newline=\"\") as producerfile:\n", + " writer = csv.writer(producerfile)\n", + " writer.writerow([\"id\", \"producer\", \"name\", \"size\", \"parts\", \"year\"])\n", + " for producer in producers:\n", + " with open(f\"data/merlin/{producer}.json\", mode=\"r\", encoding=\"utf8\") as sourcefile:\n", + " data = json.loads(sourcefile.read())\n", + " for row in data[\"data\"]:\n", + " _, id, _, name, rating, _, _, size, parts, year, _ = row\n", + "\n", + " writer.writerow([id, producer, name, size, parts, year])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ab997198", + "metadata": {}, + "outputs": [], + "source": [ + "# uvp preise bestimmen :(\n", + "def get_all_ids() -> list[str]:\n", + " df = pd.read_csv(\"./data/merlin/others.csv\")\n", + " return df[\"id\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "32b1fa46", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"./data/merlin/prices.csv\", mode=\"a+\", encoding=\"utf8\", newline=\"\") as pricefile:\n", + " for idx, id in enumerate(get_all_ids()[3663:]):\n", + " try:\n", + " small_id = id.lower()\n", + "\n", + " response = rq.get(f\"https://www.merlinssteine.de/sets/{small_id}\")\n", + " soup = bs4.BeautifulSoup(response.text)\n", + "\n", + " # Prices\n", + " price_eur = soup.find(id=\"listprice_eur\")\n", + " price_usd = soup.find(id=\"listprice_usd\")\n", + " price_cn = soup.find(id=\"listprice_cn\")\n", + " bestprice_eur = soup.find(id=\"bestprice_eur\")\n", + " bestprice_usd = soup.find(id=\"bestprice_usd\")\n", + " bestprice_cn = soup.find(id=\"bestprice_cn\")\n", + "\n", + " all_prices = [price_eur, price_cn, price_usd, bestprice_eur, bestprice_cn, bestprice_usd]\n", + " \n", + " #categories\n", + " other_dump = [description.text.replace(\"\\n\", \"\") for description in soup.find_all(class_=\"setpage_ct\")]\n", + " writer = csv.writer(pricefile)\n", + " \n", + " all_prices = [p.text if p != None else \"_\" for p in all_prices]\n", + " writer.writerow([id, *all_prices, *other_dump])\n", + " time.sleep(random.randint(2, 3))\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "4a10a1e3", + "metadata": {}, + "outputs": [], + "source": [ + "def split_by_keywords(text, keywords):\n", + " pattern = r'(' + '|'.join(map(re.escape, keywords)) + r')'\n", + " parts = re.split(pattern, text)\n", + " \n", + " result = {}\n", + " for i in range(1, len(parts), 2):\n", + " key = parts[i]\n", + " value = parts[i + 1].strip()\n", + " result[key] = value\n", + " \n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c00f188", + "metadata": {}, + "outputs": [], + "source": [ + "keywords = [\n", + " \"Listenpreis:\",\n", + " \"DetailsVon:\",\n", + " \"EAN:\",\n", + " \"Altersempfehlung:\",\n", + " \"Steine von:\",\n", + " \"Bestpreis:EU:\",\n", + " \"Bewertungen\",\n", + " \"Inhalt\",\n", + " \"PreiseListenpreis:\",\n", + " \"Hersteller-Kategorie:\",\n", + " \"Designer:\",\n", + " \"Maße:\",\n", + " \"Release:\",\n", + " \"Kategorien:\",\n", + " \"Hersteller-Videos\",\n", + " \"EU:\",\n", + " \"Anleitung\",\n", + " \"BewertungenCommunity:\",\n", + " \"Maßstab:\",\n", + " \"Erweiterung zu:\",\n", + " \"Reviews\",\n", + " \"Lizenz:\",\n", + " \"Farbverteilung\",\n", + " \"TeilelistenBrickLink\",\n", + " \"Bild:\",\n", + " \"ReviewsCommunity\",\n", + " \"Gewicht\",\n", + " \"Keine Aufkleber\",\n", + " \"Verpackungsmaße:\",\n", + " \"EU:Brickmo\",\n", + " \"Datenbanken:\",\n", + " \"Kategorie:\",\n", + " \"Keine Drucke\",\n", + " \"TechnikMOC:\",\n", + " \"Steingröße:\",\n", + " \"SonstigesMOC:\",\n", + " \"Variationen:\",\n", + " \"RebrickableVariation:\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "ae53869e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Listenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)',\n", + " 'DetailsVon:': 'BlueBrixx',\n", + " 'EAN:': '4060904003671',\n", + " 'Steine von:': 'Qunlong',\n", + " 'Kategorie:': 'EisenbahnHersteller-',\n", + " 'Kategorien:': 'BBSpecial, BRIX',\n", + " 'Anleitung': 'Ohne Bauabschnitte',\n", + " 'Bewertungen': 'Bewerten',\n", + " 'Hersteller-Videos': 'video-1',\n", + " 'Inhalt': '205 Teile',\n", + " 'Gewicht': ': 190 g',\n", + " 'Keine Aufkleber': '',\n", + " 'Keine Drucke': '',\n", + " 'Farbverteilung': '',\n", + " 'TeilelistenBrickLink': 'XMLRebrickable CSVLEGO PaB CSVSetDB CSV',\n", + " 'PreiseListenpreis:': '14.95 EUR (7.3 ct/Teil) (7.9 ct/g)'}\n" + ] + } + ], + "source": [ + "details = {\n", + " \"id\" : [],\n", + " \"listprice_eur\" : [],\n", + " \"listprice_cn\" : [],\n", + " \"listprice_usd\" : [],\n", + " \"bestprice_eur\" : [],\n", + " \"bestprice_cn\" : [],\n", + " \"bestprice_usd\" : [],\n", + " \"brand\" : [],\n", + " \"ean\" : [],\n", + " \"producer\" : [],\n", + " \"release\" : [],\n", + " \"scale\" : [],\n", + " \"category\" : [],\n", + " \"producer_category\" : [],\n", + " \"num_parts\" : [],\n", + " \"width\" : [],\n", + " \"height\" : [],\n", + " \"depth\" : [],\n", + " \"designer\" : [],\n", + " \"weight\" : [],\n", + " \"age\" : []\n", + "}\n", + "import random\n", + "\n", + "me_details = pd.DataFrame(details)\n", + "\n", + "with open(\"../data/merlin/prices.csv\", mode=\"r\", encoding=\"utf8\") as price_file:\n", + " reader = csv.reader(price_file)\n", + "\n", + " # for row in reader:\n", + " # id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = row\n", + " \n", + " # me_details.loc[-1] = [id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd] + list(range(0, 12))\n", + " # me_details.index = me_details.index + 1\n", + " id, lp_eur, lp_cn, lp_usd, bp_eur, bp_cn, bp_usd, *other = [row for row in reader][random.randint(0, 4500)]\n", + " other = filter(lambda s: not \"Wikipedia\" in s, other)\n", + "\n", + " pprint.pp(split_by_keywords(\"\".join(other), keywords))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b83aa413", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | id | \n", + "listprice_eur | \n", + "listprice_cn | \n", + "listprice_usd | \n", + "bestprice_eur | \n", + "bestprice_cn | \n", + "bestprice_usd | \n", + "brand | \n", + "ean | \n", + "producer | \n", + "... | \n", + "scale | \n", + "category | \n", + "producer_category | \n", + "num_parts | \n", + "width | \n", + "height | \n", + "depth | \n", + "designer | \n", + "weight | \n", + "age | \n", + "
|---|
0 rows × 21 columns
\n", + "