Added content to repo

2024-04-18 08:36:22 +02:00 · 2024-04-18 08:36:22 +02:00 · 228e6e1289
parent 15ad4e09ba
commit 228e6e1289
10 changed files with 23983 additions and 1 deletions
--- a/Preprocess/01_addIds.py
+++ b/Preprocess/01_addIds.py
@ -0,0 +1,13 @@
+import csv
+
+with open("data/passengers_noid.csv", 'r') as input, open('data/passengers.csv', 'w') as output:
+    reader = csv.reader(input, delimiter = ',')
+    writer = csv.writer(output, delimiter = ',')
+
+    all = []
+    row = next(reader)
+    row.insert(0, 'id')
+    all.append(row)
+    for k, row in enumerate(reader):
+        all.append([str(k+1)] + row)
+    writer.writerows(all)
--- a/Preprocess/02_splitname.py
+++ b/Preprocess/02_splitname.py
@ -0,0 +1,29 @@
+import csv
+
+input_file = "../data/passengers.csv"
+output_file = "../data/passengers2.csv"
+
+with open(input_file, 'r') as csv_input:
+    with open(output_file, 'w', newline='') as csv_output:
+        reader = csv.reader(csv_input)
+        writer = csv.writer(csv_output)
+
+        # Iterate through each row in the CSV
+        for row in reader:
+            # If the row is not empty
+            if row:
+                # Split name into last name and rest of name
+                split_name = row[3].split(',', 1)
+
+                if len(split_name) == 2:
+                    lastname, rest_of_name = split_name
+                    # further split name into salutation and first name
+                    split_rest_of_name = rest_of_name.split('.', 1)
+
+                    if len(split_rest_of_name) == 2:
+                        salutation, first_name = [name.strip() for name in split_rest_of_name]
+                    else:
+                        salutation, first_name = split_rest_of_name[0], ""
+
+                    row[3:4] = [first_name, salutation, lastname]
+                writer.writerow(row)
--- a/Preprocess/03_separate_home_dest.py
+++ b/Preprocess/03_separate_home_dest.py
@ -0,0 +1,37 @@
+import csv
+
+input_file = "../data/passengers.csv"
+output_file = "../data/passengers2.csv"
+
+with open(input_file, 'r') as csv_input:
+    with open(output_file, 'w', newline='') as csv_output:
+        reader = csv.reader(csv_input)
+        writer = csv.writer(csv_output)
+
+        # Iterate through each row in the CSV
+        for row in reader:
+            # Get the last field in the row
+            location = row[-1]
+            
+            # Split the location into two parts if it contains '/' and ','
+            if '/' in location and ',' in location:
+                location1, location2 = location.split('/')
+                
+                # Further split location1 into city1 and country1
+                city1, country_state1 = location1.split(',', 1)
+                
+                # Further split location2 into city2 and country2
+                city2, country_state2 = location2.split(',', 1)
+            elif ',' in location:
+                city1, country_state1 = location.split(',', 1)
+                # For the second location, set to empty strings
+                city2, country_state2 = "", ""
+            else:
+                # If no city and country details are present
+                city1, country_state1, city2, country_state2 = "", "", "", ""
+            
+            # Now replace the last field with the 4 new fields
+            row[-1:] = [city1.strip(), country_state1.strip(), city2.strip(), country_state2.strip()]
+                   
+            # Write the row into the new csv file
+            writer.writerow(row)
--- a/README.md
+++ b/README.md
@ -1,3 +1,32 @@
 # BDEA_Aufgabe1

-Repository für die erste Aufgabe von BDEA im SoSe 2024
+Repository für die erste Aufgabe von BDEA im SoSe 2024
+
+## Aufgabenstellung
+
+Aufgabentyp: Individual (keine Teamaufgabe)
+
+Erstellen Sie ein Notebook-basiertes Tutorial, in dem Sie eine beliebige Datenbank (SQL oder NoSQL (außer Postgres)) in einem Docker-Container ausführen und einen beliebigen Daten-Import mit einer beispielhaften Datenanalyse zeigen (!= dem Bsp. aus der Vorlesung). Seien Sie gerne kreativ.
+
+Die Datenanalyse soll dabei interaktiv in Form von Notebooks demonstriert werden (entweder Apache Zeppelin oder JupyterLab). Verwenden Sie geeignete Visualisierungsmöglichkeiten (Analysefunktionen zur Aggregation, Tabellen und Plots), um Ihre gewonnenen Daten übersichtlich zu veranschaulichen. Fügen Sie für Ihre Analyseschritte jeweils geeignete Beschreibungen hinzu, damit diese nachvollziehbar sind.
+
+Das Tutorial soll zwischen 10 und 15 min dauern (d.h. Lesedauer) und hat als Zielgruppe Ihre Mitstudierenden aus der Veranstaltung, die auf dieser Basis in der Lage sein sollen, die Analyse nachzuvollziehen.
+
+Bitte geben Sie (1) alle verwendeten Quellen zu Ihrem Tutorial nach guter wissenschaftlicher Praxis im Notebook an, und (2) geben Sie die verwendeten Notebooks, die alle demonstrierten Anfragen enthalten, ab. Dritte sollten in der Lage sein, Ihr Tutorial lokal auszuführen.
+
+Optional (!): Wenn Sie möchten, können Sie gerne ein entsprechendes Video-Tutorial anfertigen.
+
+Bewertet werden vor allem Kreativität der Aufgabe, Sorgfalt und didaktischer Aufbau bzw. Nachvollziehbarkeit.
+
+## Quellen
+
+https://www.kaggle.com/datasets/sakshisatre/titanic-dataset/data
+
+## Befehle zum Initialisieren der DB in Docker (mit beigefügtem Container nicht notwendig)
+copy datase_init/init.sql to /tmp/init.sql
+copy data/passengers.csv to /var/lib/mysql-files/passengers.csv
+
+mysql -u root -p
+SET sql_mode = '';
+
+source /tmp/init.sql;
--- a/Titanic.ipynb
+++ b/Titanic.ipynb
--- a/container/database.tar
+++ b/container/database.tar
--- a/data/1_passengers_original.csv
+++ b/data/1_passengers_original.csv
--- a/data/2_passengers_names.csv
+++ b/data/2_passengers_names.csv
--- a/data/3_passengers.csv
+++ b/data/3_passengers.csv
--- a/datase_init/init.sql
+++ b/datase_init/init.sql
@ -0,0 +1,30 @@
+-- Create the titanic schema (only if it is missing) and make it current.
+CREATE DATABASE IF NOT EXISTS titanic;
+USE titanic;
+
+-- Passenger table; the column order matches the column list of the
+-- LOAD DATA statement below.
+-- NOTE(review): unlike the database, the table is created without
+-- IF NOT EXISTS, so running this script a second time fails here.
+CREATE TABLE passengers (
+    id          INT PRIMARY KEY,
+    pclass      INT           NULL,
+    survived    BOOLEAN       NULL,
+    first_name  VARCHAR(255)  NULL,
+    salutation  VARCHAR(255)  NULL,
+    last_name   VARCHAR(255)  NULL,
+    sex         VARCHAR(10)   NULL,
+    age         FLOAT         NULL,
+    sibsp       INT           NULL,
+    parch       INT           NULL,
+    ticket      VARCHAR(30)   NULL,
+    fare        DECIMAL(10,4) NULL,
+    cabin       VARCHAR(30)   NULL,
+    embarked    CHAR(1)       NULL,
+    boat        VARCHAR(10)   NULL,
+    body        VARCHAR(5)    NULL,
+    city1       VARCHAR(255)  NULL,
+    state1      VARCHAR(255)  NULL,
+    city2       VARCHAR(255)  NULL,
+    state2      VARCHAR(255)  NULL
+);
+
+-- Bulk-load the preprocessed CSV from the server-side import directory.
+-- NOTE(review): there is no IGNORE 1 LINES clause, so a header line in the
+-- CSV would be loaded as a data row - confirm the file has no header.
+-- NOTE(review): rows are split on '\n' only; if the CSV uses "\r\n" line
+-- endings (the default of Python's csv.writer), state2 would keep a
+-- trailing '\r' - verify against the actual file.
+LOAD DATA INFILE '/var/lib/mysql-files/passengers.csv'
+INTO TABLE passengers
+FIELDS TERMINATED BY ',' ENCLOSED BY '"'
+LINES TERMINATED BY '\n'
+(
+    id, pclass, survived,
+    first_name, salutation, last_name,
+    sex, age, sibsp, parch,
+    ticket, fare, cabin, embarked, boat, body,
+    city1, state1, city2, state2
+);