From 0edd81861afa806c0cc53a5fd140434122b2c2e6 Mon Sep 17 00:00:00 2001 From: mahehsma Date: Wed, 12 Jun 2024 09:23:15 +0200 Subject: [PATCH] removed --- codeprojekt_nicole.ipynb | 989 --------------------------------------- 1 file changed, 989 deletions(-) delete mode 100644 codeprojekt_nicole.ipynb diff --git a/codeprojekt_nicole.ipynb b/codeprojekt_nicole.ipynb deleted file mode 100644 index 8892a9c..0000000 --- a/codeprojekt_nicole.ipynb +++ /dev/null @@ -1,989 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "ffe86c63-5718-4c71-a3a3-94672d34e676", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# import libraries\n", - "from ucimlrepo import fetch_ucirepo\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "markdown", - "id": "d1b8dc0f-8e8b-485a-9147-cf694f832357", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### Explorative Datenanalyse\n", - "Deskriptive Statistik: Berechnung von Mittelwerten, Mediane, Standardabweichungen und anderen Kenngrößen für die verschiedenen Merkmale. \n", - "Visualisierung: Nutzung von Histogrammen, Boxplots und Scatterplots, um die Verteilung und Beziehungen der Merkmale zu analysieren. \n", - "Korrelation: Untersuchung der Korrelation zwischen den Merkmalen und dem Outcome." 
- ] - }, - { - "cell_type": "markdown", - "id": "282a1c63-e28c-4d81-b1dc-1d971e9a4c12", - "metadata": {}, - "source": [ - "### Dataset preparation" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "130b7fb5-7f85-4393-a741-05246920ce63", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 303 entries, 0 to 302\n", - "Data columns (total 14 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 age 303 non-null int64 \n", - " 1 sex 303 non-null int64 \n", - " 2 cp 303 non-null int64 \n", - " 3 trestbps 303 non-null int64 \n", - " 4 chol 303 non-null int64 \n", - " 5 fbs 303 non-null int64 \n", - " 6 restecg 303 non-null int64 \n", - " 7 thalach 303 non-null int64 \n", - " 8 exang 303 non-null int64 \n", - " 9 oldpeak 303 non-null float64\n", - " 10 slope 303 non-null int64 \n", - " 11 ca 299 non-null float64\n", - " 12 thal 301 non-null float64\n", - " 13 healthy 303 non-null int64 \n", - "dtypes: float64(3), int64(11)\n", - "memory usage: 33.3 KB\n" - ] - } - ], - "source": [ - "# fetch dataset \n", - "heart_disease = fetch_ucirepo(id=45)\n", - "\n", - "# features\n", - "x = heart_disease.data.features \n", - "\n", - "# target variable\n", - "y = heart_disease.data.targets \n", - "\n", - "# complete dataframe\n", - "heart_df = pd.concat([x, y], axis=1).rename(columns={'num':'healthy'})\n", - "\n", - "# replace values for target variable: 0=sick, 1=healthy\n", - "heart_df['healthy'] = heart_df['healthy'].replace({0: 1, 1: 0, 2: 0, 3: 0, 4: 0})\n", - "\n", - "# view summary of dataset\n", - "heart_df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "0780f01d-771b-4443-86b2-0e2aa8ef464f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "Number of rows with NaN values to be removed: 6\n", - "Total number of rows: 303\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathalhealthy
063111452331215002.330.06.01
167141602860210811.523.03.00
267141202290212912.622.07.00
337131302500018703.530.03.01
441021302040217201.410.03.01
.............................................
29757041402410012310.220.07.00
29845111102640013201.220.07.00
29968141441931014103.422.07.00
30057141301310011511.221.07.00
30157021302360217400.021.03.00
\n", - "

297 rows × 14 columns

\n", - "
" - ], - "text/plain": [ - " age sex cp trestbps chol fbs restecg thalach exang oldpeak \\\n", - "0 63 1 1 145 233 1 2 150 0 2.3 \n", - "1 67 1 4 160 286 0 2 108 1 1.5 \n", - "2 67 1 4 120 229 0 2 129 1 2.6 \n", - "3 37 1 3 130 250 0 0 187 0 3.5 \n", - "4 41 0 2 130 204 0 2 172 0 1.4 \n", - ".. ... ... .. ... ... ... ... ... ... ... \n", - "297 57 0 4 140 241 0 0 123 1 0.2 \n", - "298 45 1 1 110 264 0 0 132 0 1.2 \n", - "299 68 1 4 144 193 1 0 141 0 3.4 \n", - "300 57 1 4 130 131 0 0 115 1 1.2 \n", - "301 57 0 2 130 236 0 2 174 0 0.0 \n", - "\n", - " slope ca thal healthy \n", - "0 3 0.0 6.0 1 \n", - "1 2 3.0 3.0 0 \n", - "2 2 2.0 7.0 0 \n", - "3 3 0.0 3.0 1 \n", - "4 1 0.0 3.0 1 \n", - ".. ... ... ... ... \n", - "297 2 0.0 7.0 0 \n", - "298 2 0.0 7.0 0 \n", - "299 2 2.0 7.0 0 \n", - "300 2 1.0 7.0 0 \n", - "301 2 1.0 3.0 0 \n", - "\n", - "[297 rows x 14 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# data cleaning\n", - "rows_with_nan = heart_df.isna().any(axis=1).sum()\n", - "print(f\"Number of rows with NaN values to be removed: {rows_with_nan}\")\n", - "total_rows = heart_df.shape[0]\n", - "print(f\"Total number of rows: {total_rows}\")\n", - "\n", - "# remove rows with NaN values\n", - "heart_df = heart_df.dropna()\n", - "\n", - "heart_df" - ] - }, - { - "cell_type": "markdown", - "id": "15d04b9c-2803-43e8-98a0-118dda07536f", - "metadata": {}, - "source": [ - "### Clustering and PCA\n", - "Um zu analysieren, ob ähnliche Merkmale auch zur gleichen Diagnose führen, wird zuerst ein k-Means Clustering angewandt." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "76dc9ce9-8e81-4a50-ad4d-557576a58a92", - "metadata": {}, - "outputs": [], - "source": [ - "# prepare data for clustering\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "# split data into features (X) and target (y)\n", - "X = heart_df.iloc[:, :-1]\n", - "y = heart_df.iloc[:, -1]\n", - "\n", - "# scale data (MinMax probieren)\n", - "scaler = StandardScaler() \n", - "X_scaled = scaler.fit_transform (X)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b56af950-7a58-43b5-8a83-1fd95b0c7220", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
KMeans(n_clusters=2, n_init='auto', random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "KMeans(n_clusters=2, n_init='auto', random_state=42)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# k-means clustering\n", - "from sklearn.cluster import KMeans\n", - "from sklearn.metrics import confusion_matrix\n", - "\n", - "# prepare model\n", - "kmeans = KMeans(n_clusters=2, random_state=42, n_init='auto')\n", - "kmeans.fit(X_scaled)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4dfc00f7-a2b6-4408-b48d-c24a9c32f74d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79.12% der Datensätze wurden mithilfe von KMeans richtig einem Cluster zugeordnet\n" - ] - } - ], - "source": [ - "# calculate percentage of data points correctly assigned to each cluster\n", - "cluster1 = kmeans.labels_ == 0\n", - "cluster2 = kmeans.labels_ == 1\n", - "\n", - "perc_cluster1 = np.round(np.mean(cluster1 == y) * 100, decimals=2)\n", - "perc_cluster2 = np.round(np.mean(cluster2 == y) * 100, decimals=2)\n", - "\n", - "# choose cluster with higher correspondence\n", - "if perc_cluster1 > perc_cluster2:\n", - " km_healthy = cluster1\n", - " max_perc = perc_cluster1\n", - "else:\n", - " km_healthy = cluster2\n", - " max_perc = perc_cluster2\n", - "\n", - "print(f\"{max_perc}% der Datensätze wurden mithilfe von KMeans richtig einem Cluster zugeordnet\")\n", - "\n", - "# hier vlt noch irgendwie diskutieren ob das ein smart way ist um das auszuwerten, anscheinend gibt's dafür andere Metriken" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "d5edbc93-22e2-4086-aa77-ec7a76765048", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Proportion of variance explained by each principal component:\n", - "[0.23695056 0.12349486 0.0960383 0.0851614 0.07638123 0.06720242\n", - " 0.06493755 0.05994447 0.05268635 0.04368496 0.03486402 0.03140469\n", - " 0.02724919]\n" - 
] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# apply PCA\n", - "pca = PCA()\n", - "pca.fit(X_scaled)\n", - "\n", - "print(f\"Proportion of variance explained by each principal component:\\n{pca.explained_variance_ratio_}\")\n", - "\n", - "# Plot the proportion of variance explained\n", - "plt.figure(figsize=(8, 6))\n", - "plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, color='skyblue')\n", - "plt.xlabel('Principal Component')\n", - "plt.ylabel('Proportion of Variance Explained')\n", - "plt.title('Proportion of Variance Explained by Principal Components')\n", - "plt.xticks(range(1, len(pca.explained_variance_ratio_) + 1))\n", - "plt.grid(axis='y', linestyle='--', alpha=0.7)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "c3d40f19-8299-44ab-bb8a-326fdd2008c1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Contributions of features to the first principal component:\n", - " Feature Contribution\n", - "9 oldpeak 0.397006\n", - "10 slope 0.352024\n", - "12 thal 0.346208\n", - "8 exang 0.333140\n", - "11 ca 0.306420\n", - "2 cp 0.286212\n", - "0 age 0.285868\n", - "3 trestbps 0.167841\n", - "6 restecg 0.145938\n", - "1 sex 0.116802\n", - "4 chol 0.083504\n", - "5 fbs 0.076094\n", - "7 thalach -0.392706\n" - ] - } - ], - "source": [ - "# get the loadings or weights of features in the first principal component\n", - "first_pc_loadings = pca.components_[0]\n", - "\n", - "# create a DataFrame to display the contributions of features to the first principal component\n", - "pc_loadings_df = pd.DataFrame({\"Feature\": X.columns, \"Contribution\": first_pc_loadings})\n", - "pc_loadings_df = pc_loadings_df.sort_values(by=\"Contribution\", ascending=False)\n", - "\n", - "print(\"Contributions of features to the first principal component:\")\n", - "print(pc_loadings_df)" - ] - }, - { - "cell_type": 
"markdown", - "id": "b68452ef-aa60-4b2f-aec4-4762f9136360", - "metadata": {}, - "source": [ - "#### Interpretation:\n", - "Die \"Proportion of Variance Explained by Principal Components\" gibt an, wie viel der gesamten Varianz in den Daten durch jede einzelne Hauptkomponente erklärt wird.\n", - "Hauptkomponenten mit einer höheren Proportion der erklärten Varianz tragen mehr zur Erklärung der Variation in den Daten bei und sind daher wichtiger.\n", - "Eine höhere Proportion der erklärten Varianz in einer bestimmten Hauptkomponente kann darauf hinweisen, dass diese Hauptkomponente wichtige Informationen über die Variation in den Merkmalen oder Beobachtungen enthält, die mit den zugrunde liegenden Mustern oder Gruppierungen in den Herzdaten zusammenhängen könnten. Dabei ist wichtig zu erwähnen, dass die Ergebnisse nicht in direktem Zusammenhang mit der Zielvariable `healthy` stehen, da die PCA nur auf den Merkmalen (ohne Zielvariable) berechnet wurde; sie weisen lediglich auf Muster in den Daten hin.\n", - "\n", - "Merkmale, die betragsmäßig hohe Contributions zu der ersten Hauptkomponente haben, tragen wesentlich zur Variation in den Daten bei. Das Vorzeichen gibt dabei nur die Richtung an: `thalach` gehört mit -0.39 trotz des negativen Vorzeichens zu den stärksten Merkmalen, obwohl es in der oben nach Wert sortierten Tabelle ganz unten steht. Diese Merkmale haben möglicherweise eine starke Korrelation mit der ersten Hauptkomponente und könnten daher wichtigere Informationen über die zugrunde liegenden Muster oder Strukturen in den Daten liefern, als jene mit einer betragsmäßig geringen Contribution." - ] - }, - { - "cell_type": "markdown", - "id": "e8dcc395-b548-4dde-bd46-67a962af6f03", - "metadata": {}, - "source": [ - "### Cholesterinwerte im Vergleich Frauen/Männer" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "id": "ed81d87f-a0b0-4dbb-88f6-471423ce3151", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version.
Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "heart_df['sex'] = heart_df['sex'].replace({0: 'female', 1: 'male'})\n", - "plt.figure(figsize=(8, 6))\n", - "sns.boxplot(x='sex', y='chol', data=heart_df)\n", - "plt.title('Cholesterinwerte nach Geschlecht')\n", - "plt.xlabel('Geschlecht')\n", - "plt.ylabel('Cholesterin in mg/dl')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "3617a148-3ae4-4eb9-99d1-c7a6aa78bd17", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Geschlecht Untere_Grenze Obere_Grenze\n", - "0 Männer 234.288517 246.199046\n", - "1 Frauen 249.044612 275.413721\n" - ] - } - ], - "source": [ - "# Konfidenzintervall (95%) für Cholesterin Level jeweils für Männer und Frauen\n", - "from scipy import stats\n", - "\n", - "# Filtern nach Geschlecht und Berechnen des Konfidenzintervalls\n", - "conf_level = 0.95\n", - "chol_men = heart_df.loc[heart_df['sex'] == 'male', 'chol']\n", - "chol_women = heart_df.loc[heart_df['sex'] == 'female', 'chol']\n", - "conf_int_men = stats.t.interval(conf_level, len(chol_men) - 1, loc=chol_men.mean(), scale=stats.sem(chol_men))\n", - "conf_int_women = stats.t.interval(conf_level, len(chol_women) - 1, loc=chol_women.mean(), scale=stats.sem(chol_women))\n", - "\n", - "result_table_men_vs_women = pd.DataFrame({\n", - " 'Geschlecht': ['Männer', 'Frauen'],\n", - " 'Untere_Grenze': [conf_int_men[0], conf_int_women[0]],\n", - " 'Obere_Grenze': [conf_int_men[1], conf_int_women[1]]\n", - "})\n", - "\n", - "print(result_table_men_vs_women)" - ] - }, - { - "cell_type": "markdown", - "id": "1b4ad40f-dd3b-4309-a5f3-a085d042c1eb", - "metadata": {}, - "source": [ - "### Cholesterin im Vergleich zur Erkrankung" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "60f42093-3616-4e2b-8557-56794af3f25b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Boxplot gruppiert nach Diagnose\n", - "plt.figure(figsize=(10, 6))\n", - "sns.boxplot(x='healthy', y='chol', data=heart_df)\n", - "plt.title('Cholesterin im Vergleich zur Erkrankung')\n", - "plt.xlabel('Diagnose')\n", - "plt.ylabel('Cholesterin in mg/dl')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "id": "7bef4e62-968e-4a2f-9948-063fac1e7a88", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Diagnose Untere_Grenze Obere_Grenze\n", - "0 Gesund 234.397652 250.882836\n", - "1 Krank 243.175250 259.774391\n" - ] - } - ], - "source": [ - "# Filtern nach healthy und Berechnen des Konfidenzintervalls\n", - "conf_level = 0.95\n", - "chol_healthy = heart_df.loc[heart_df['healthy'] == 1, 'chol']\n", - "chol_sick = heart_df.loc[heart_df['healthy'] == 0, 'chol']\n", - "conf_int_healthy = stats.t.interval(conf_level, len(chol_healthy) - 1, loc=chol_healthy.mean(), scale=stats.sem(chol_healthy))\n", - "conf_int_sick = stats.t.interval(conf_level, len(chol_sick) - 1, loc=chol_sick.mean(), scale=stats.sem(chol_sick))\n", - "\n", - "result_table_healthy_vs_sick = pd.DataFrame({\n", - " 'Diagnose': ['Gesund', 'Krank'],\n", - " 'Untere_Grenze': [conf_int_healthy[0], conf_int_sick[0]],\n", - " 'Obere_Grenze': [conf_int_healthy[1], conf_int_sick[1]]\n", - "})\n", - "\n", - "print(result_table_healthy_vs_sick)" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "id": "2aa2b03a-1da6-4846-be5d-1070fa1ae43e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "t-Statistik: 1.4829209163941466\n", - "p-Wert: 0.06957083510218263\n", - "Der p-Wert ist größer als oder gleich 0.05, daher wird die Nullhypothese nicht abgelehnt.\n", - "Es gibt keine signifikanten Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\n" - 
] - } - ], - "source": [ - "# t-Test (einseitiger Zweistichproben-t-Test)\n", - "# Nullhypothese (H0): Cholesterinwert bei Kranken ist kleiner als oder gleich dem bei Gesunden\n", - "# Alternativhypothese (H1): Cholesterinwert bei Kranken ist höher als bei Gesunden\n", - "\n", - "from scipy.stats import ttest_ind\n", - "\n", - "# Daten für gesunde und kranke Personen\n", - "chol_healthy = heart_df.loc[heart_df['healthy'] == 1, 'chol']\n", - "chol_sick = heart_df.loc[heart_df['healthy'] == 0, 'chol']\n", - "\n", - "# Durchführung des t-Tests\n", - "t_statistic, p_value = ttest_ind(chol_sick, chol_healthy, alternative='greater')\n", - "\n", - "# Ausgabe der Ergebnisse\n", - "print(\"t-Statistik:\", t_statistic)\n", - "print(\"p-Wert:\", p_value)\n", - "\n", - "# Überprüfung der Nullhypothese\n", - "if p_value < 0.05:\n", - " print(\"Der p-Wert ist kleiner als 0.05, daher wird die Nullhypothese abgelehnt.\")\n", - " print(\"Es gibt signifikante Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\")\n", - "else:\n", - " print(\"Der p-Wert ist größer als oder gleich 0.05, daher wird die Nullhypothese nicht abgelehnt.\")\n", - " print(\"Es gibt keine signifikanten Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\")" - ] - }, - { - "cell_type": "markdown", - "id": "c3852db8-3787-4ddf-aed7-80396f7a89ac", - "metadata": {}, - "source": [ - "### Systolischer Ruheblutdruck" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "id": "f063878c-5866-4a19-b065-a41d3e26e290", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version.
Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n", - "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", - " if pd.api.types.is_categorical_dtype(vector):\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(8, 6))\n", - "sns.boxplot(x='sex', y='trestbps', data=heart_df)\n", - "plt.title('Überblick über Blutdruck')\n", - "plt.xlabel('Geschlecht')\n", - "plt.ylabel('Systolischer Ruheblutdruck (in mmHg bei Aufnahme ins Krankenhaus)')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "id": "f20c6b80-bac5-489e-98e1-b5f2d9d90615", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Diagnose Untere_Grenze Obere_Grenze\n", - "0 Gesund 126.751354 131.748646\n", - "1 Krank 131.420542 137.716148\n" - ] - } - ], - "source": [ - "# Filtern nach healthy und Berechnen des Konfidenzintervalls\n", - "conf_level = 0.95\n", - "blutdruck_gesund = heart_df.loc[heart_df['healthy'] == 1, 'trestbps']\n", - "blutdruck_krank = heart_df.loc[heart_df['healthy'] == 0, 'trestbps']\n", - "conf_int_gesund = stats.t.interval(conf_level, len(blutdruck_gesund) - 1, loc=blutdruck_gesund.mean(), scale=stats.sem(blutdruck_gesund))\n", - "conf_int_krank = stats.t.interval(conf_level, len(blutdruck_krank) - 1, loc=blutdruck_krank.mean(), scale=stats.sem(blutdruck_krank))\n", - "\n", - "# Erstellen der Tabelle\n", - "result_table_blutdruck = pd.DataFrame({\n", - " 'Diagnose': ['Gesund', 'Krank'],\n", - " 'Untere_Grenze': [conf_int_gesund[0], conf_int_krank[0]],\n", - " 'Obere_Grenze': [conf_int_gesund[1], conf_int_krank[1]]\n", - "})\n", - "\n", - "# Anzeige der Tabelle\n", - "print(result_table_blutdruck)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "bc919e7f-2b7b-41c1-a5b3-a97d61baa6ca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "t-Statistik: 2.647004396805282\n", - "p-Wert: 0.004274134464297464\n", - "Der p-Wert ist kleiner als 0.05, daher wird die Nullhypothese abgelehnt.\n", - "Es gibt signifikante Hinweise darauf, dass der Blutdruck bei
kranken Personen höher ist als bei gesunden Personen.\n" - ] - } - ], - "source": [ - "# t-Test (einseitiger Zweistichproben-t-Test)\n", - "# H0: Kranke haben einen niedrigeren oder gleichen Blutdruck als Gesunde\n", - "# H1: Kranke haben einen höheren Blutdruck als Gesunde\n", - "\n", - "# Daten für gesunde und kranke Personen\n", - "blutdruck_healthy = heart_df.loc[heart_df['healthy'] == 1, 'trestbps']\n", - "blutdruck_sick = heart_df.loc[heart_df['healthy'] == 0, 'trestbps']\n", - "\n", - "# Durchführung des t-Tests\n", - "t_statistic, p_value = ttest_ind(blutdruck_sick, blutdruck_healthy, alternative='greater')\n", - "\n", - "print(\"t-Statistik:\", t_statistic)\n", - "print(\"p-Wert:\", p_value)\n", - "\n", - "# Überprüfung der Nullhypothese\n", - "if p_value < 0.05:\n", - " print(\"Der p-Wert ist kleiner als 0.05, daher wird die Nullhypothese abgelehnt.\")\n", - " print(\"Es gibt signifikante Hinweise darauf, dass der Blutdruck bei kranken Personen höher ist als bei gesunden Personen.\")\n", - "else:\n", - " print(\"Der p-Wert ist größer als oder gleich 0.05, daher wird die Nullhypothese nicht abgelehnt.\")\n", - " print(\"Es gibt keine signifikanten Hinweise darauf, dass der Blutdruck bei kranken Personen höher ist als bei gesunden Personen.\")" - ] - }, - { - "cell_type": "markdown", - "id": "4a055d25-1c55-4a3f-a201-4ea252cac4dd", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### Weitere Dinge, die wir machen könnten\n", - "- Korrelationsanalyse\n", - "- logistische Regression, KNN, Entscheidungsbäume\n", - "- ROC/AUC-Kurve plotten für Modelle\n", - "\n", - "\n", - "\n", - "Classification:\n", - "- SVMs\n", - "- Decision Trees\n", - "- K-Nearest Neighbor\n", - "- Random Forest\n", - "\n", - "Dimensionality Reduction:\n", - "- PCA\n", - "\n", - "Regression:\n", - "- lineare \n", - "- logistische\n", - "\n", - "Clustering:\n", - "- hierarchisch -> Bäume\n", - "- K-means clustering\n", - "\n", - "Statistik:\n", - "- Hypothesen" - ] - } -
], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}