From 1dd196931dc788e02edda0be4bf6fffa85130cab Mon Sep 17 00:00:00 2001 From: nicole Date: Fri, 7 Jun 2024 13:44:29 +0200 Subject: [PATCH 1/2] add dataset preparation, clustering and PCA, hypothesis tests --- Data Cleaning_05_04_2024.ipynb | 22424 ------------------------------- 1 file changed, 22424 deletions(-) delete mode 100644 Data Cleaning_05_04_2024.ipynb diff --git a/Data Cleaning_05_04_2024.ipynb b/Data Cleaning_05_04_2024.ipynb deleted file mode 100644 index 2eb416f..0000000 --- a/Data Cleaning_05_04_2024.ipynb +++ /dev/null @@ -1,22424 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e18a679f", - "metadata": {}, - "source": [ - "## Import Python libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4922164f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import ydata_profiling\n", - "#from pandas_profiling import ProfileReport\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "72caaa8b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "markdown", - "id": "dfe3a57a", - "metadata": {}, - "source": [ - "## Load Data (Breast Cancer Dataset)\n", - "### Sample code number: Unique identifier for each tissue sample.\n", - "### Clump Thickness: Assessment of the thickness of tumor cell clusters (1 - 10).\n", - "Uniformity of Cell Size: Uniformity in the size of tumor cells (1 - 10).\n", - "Uniformity of Cell Shape: Uniformity in the shape of tumor cells (1 - 10).\n", - "Marginal Adhesion: Degree of adhesion of tumor cells to surrounding tissue (1 - 10).\n", - "Single Epithelial Cell Size: Size of individual tumor cells (1 - 10).\n", - "Bare Nuclei: Presence of nuclei without surrounding cytoplasm (1 - 10).\n", - "Bland Chromatin: 
Assessment of chromatin structure in tumor cells (1 - 10).\n", - "Normal Nucleoli: Presence of normal-looking nucleoli in tumor cells (1 - 10).\n", - "Mitoses: Frequency of mitotic cell divisions (1 - 10).\n", - "Class: Classification of tumor type (2 for benign, 4 for malignant)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "bd9a55aa", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './breast-cancer-wisconsin.data'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./breast-cancer-wisconsin.data\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m columns \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSample code number\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClump Thickness\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUniformity of Cell Size\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUniformity of Cell Shape\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMarginal Adhesion\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSingle Epithelial Cell Size\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBare Nuclei\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBland Chromatin\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNormal Nucleoli\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMitoses\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClass\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 5\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(url, names\u001b[38;5;241m=\u001b[39mcolumns)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create 
the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[1;32m 1662\u001b[0m f,\n\u001b[1;32m 1663\u001b[0m mode,\n\u001b[1;32m 1664\u001b[0m encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1665\u001b[0m compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1666\u001b[0m memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 1667\u001b[0m is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[1;32m 1668\u001b[0m errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 1669\u001b[0m storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1670\u001b[0m )\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 860\u001b[0m handle,\n\u001b[1;32m 861\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 862\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 863\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 864\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 865\u001b[0m )\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - 
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './breast-cancer-wisconsin.data'" - ] - } - ], - "source": [ - "url = \"./breast-cancer-wisconsin.data\"\n", - "columns = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',\n", - " 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', \n", - " 'Normal Nucleoli', 'Mitoses', 'Class']\n", - "df = pd.read_csv(url, names=columns)" - ] - }, - { - "cell_type": "markdown", - "id": "3be5a2e3", - "metadata": {}, - "source": [ - "## Show data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "28791c54", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Sample code numberClump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitosesClass
010000255111213112
1100294554457103212
210154253111223112
310162776881343712
410170234113213112
\n", - "
" - ], - "text/plain": [ - " Sample code number Clump Thickness Uniformity of Cell Size \\\n", - "0 1000025 5 1 \n", - "1 1002945 5 4 \n", - "2 1015425 3 1 \n", - "3 1016277 6 8 \n", - "4 1017023 4 1 \n", - "\n", - " Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size \\\n", - "0 1 1 2 \n", - "1 4 5 7 \n", - "2 1 1 2 \n", - "3 8 1 3 \n", - "4 1 3 2 \n", - "\n", - " Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses Class \n", - "0 1 3 1 1 2 \n", - "1 10 3 2 1 2 \n", - "2 2 3 1 1 2 \n", - "3 4 3 7 1 2 \n", - "4 1 3 1 1 2 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "90314305", - "metadata": {}, - "source": [ - "## Generate Report with ydata_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2053a002", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ipywidgets in /home/mahehsma/anaconda3/lib/python3.11/site-packages (8.0.4)\n", - "Requirement already satisfied: ipykernel>=4.5.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (6.25.0)\n", - "Requirement already satisfied: ipython>=6.1.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (8.15.0)\n", - "Requirement already satisfied: traitlets>=4.3.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (5.7.1)\n", - "Requirement already satisfied: widgetsnbextension~=4.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (4.0.5)\n", - "Requirement already satisfied: jupyterlab-widgets~=3.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (3.0.5)\n", - "Requirement already satisfied: comm>=0.1.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.2)\n", - "Requirement already 
satisfied: debugpy>=1.6.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.6.7)\n", - "Requirement already satisfied: jupyter-client>=6.1.12 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.4.9)\n", - "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.3.0)\n", - "Requirement already satisfied: matplotlib-inline>=0.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.6)\n", - "Requirement already satisfied: nest-asyncio in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.6)\n", - "Requirement already satisfied: packaging in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (23.1)\n", - "Requirement already satisfied: psutil in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.9.0)\n", - "Requirement already satisfied: pyzmq>=20 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (23.2.0)\n", - "Requirement already satisfied: tornado>=6.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.3.2)\n", - "Requirement already satisfied: backcall in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", - "Requirement already satisfied: decorator in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n", - "Requirement already satisfied: jedi>=0.16 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.1)\n", - "Requirement already satisfied: pickleshare in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)\n", - "Requirement already satisfied: 
prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.36)\n", - "Requirement already satisfied: pygments>=2.4.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (2.15.1)\n", - "Requirement already satisfied: stack-data in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", - "Requirement already satisfied: pexpect>4.3 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n", - "Requirement already satisfied: entrypoints in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (0.4)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (2.8.2)\n", - "Requirement already satisfied: platformdirs>=2.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.5.1->ipywidgets) (3.10.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets) (0.2.5)\n", - "Requirement already satisfied: executing in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.8.3)\n", - "Requirement already satisfied: asttokens in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.5)\n", - "Requirement already 
satisfied: pure-eval in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n", - "Requirement already satisfied: six>=1.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (1.16.0)\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39msystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip install ipywidgets\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mydata_profiling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ProfileReport\n\u001b[0;32m----> 4\u001b[0m profile \u001b[38;5;241m=\u001b[39m ProfileReport(df, title\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mProfiling Report\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m profile\u001b[38;5;241m.\u001b[39mto_notebook_iframe()\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], - "source": [ - "!pip install ipywidgets\n", - "from ydata_profiling import ProfileReport\n", - "\n", - "profile = ProfileReport(df, title=\"Profiling Report\")\n", - "\n", - "profile.to_notebook_iframe()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b642b1f3-4720-4028-805f-2a8e88958f41", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting ucimlrepo\n", - " Obtaining dependency information for ucimlrepo from 
https://files.pythonhosted.org/packages/22/47/9350b2eeeaef8c0fd3ec3505c8a0481b576845b3df0d71c76f989c23d3c6/ucimlrepo-0.0.6-py3-none-any.whl.metadata\n", - " Downloading ucimlrepo-0.0.6-py3-none-any.whl.metadata (5.3 kB)\n", - "Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)\n", - "Installing collected packages: ucimlrepo\n", - "Successfully installed ucimlrepo-0.0.6\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "pip install ucimlrepo" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "cf67e63c-778b-4bb0-b48b-d50bc5fd2cba", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sandhu, K. Guppy, S. Lee, V. 
Froelicher', 'published_in': 'American Journal of Cardiology', 'year': 1989, 'url': 'https://www.semanticscholar.org/paper/a7d714f8f87bfc41351eb5ae1e5472f0ebbe0574', 'doi': None}, 'additional_info': {'summary': 'This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to date. The \"goal\" field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0). \\n \\nThe names and social security numbers of the patients were recently removed from the database, replaced with dummy values.\\n\\nOne file has been \"processed\", that one containing the Cleveland database. All four unprocessed files also exist in this directory.\\n\\nTo see Test Costs (donated by Peter Turney), please see the folder \"Costs\" ', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Only 14 attributes used:\\r\\n 1. #3 (age) \\r\\n 2. #4 (sex) \\r\\n 3. #9 (cp) \\r\\n 4. #10 (trestbps) \\r\\n 5. #12 (chol) \\r\\n 6. #16 (fbs) \\r\\n 7. #19 (restecg) \\r\\n 8. #32 (thalach) \\r\\n 9. #38 (exang) \\r\\n 10. #40 (oldpeak) \\r\\n 11. #41 (slope) \\r\\n 12. #44 (ca) \\r\\n 13. #51 (thal) \\r\\n 14. 
#58 (num) (the predicted attribute)\\r\\n\\r\\nComplete attribute documentation:\\r\\n 1 id: patient identification number\\r\\n 2 ccf: social security number (I replaced this with a dummy value of 0)\\r\\n 3 age: age in years\\r\\n 4 sex: sex (1 = male; 0 = female)\\r\\n 5 painloc: chest pain location (1 = substernal; 0 = otherwise)\\r\\n 6 painexer (1 = provoked by exertion; 0 = otherwise)\\r\\n 7 relrest (1 = relieved after rest; 0 = otherwise)\\r\\n 8 pncaden (sum of 5, 6, and 7)\\r\\n 9 cp: chest pain type\\r\\n -- Value 1: typical angina\\r\\n -- Value 2: atypical angina\\r\\n -- Value 3: non-anginal pain\\r\\n -- Value 4: asymptomatic\\r\\n 10 trestbps: resting blood pressure (in mm Hg on admission to the hospital)\\r\\n 11 htn\\r\\n 12 chol: serum cholestoral in mg/dl\\r\\n 13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker)\\r\\n 14 cigs (cigarettes per day)\\r\\n 15 years (number of years as a smoker)\\r\\n 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)\\r\\n 17 dm (1 = history of diabetes; 0 = no such history)\\r\\n 18 famhist: family history of coronary artery disease (1 = yes; 0 = no)\\r\\n 19 restecg: resting electrocardiographic results\\r\\n -- Value 0: normal\\r\\n -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)\\r\\n -- Value 2: showing probable or definite left ventricular hypertrophy by Estes\\' criteria\\r\\n 20 ekgmo (month of exercise ECG reading)\\r\\n 21 ekgday(day of exercise ECG reading)\\r\\n 22 ekgyr (year of exercise ECG reading)\\r\\n 23 dig (digitalis used furing exercise ECG: 1 = yes; 0 = no)\\r\\n 24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no)\\r\\n 25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no)\\r\\n 26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no)\\r\\n 27 diuretic (diuretic used used during exercise ECG: 1 = yes; 0 = no)\\r\\n 28 proto: exercise protocol\\r\\n 1 = Bruce \\r\\n 2 = 
Kottus\\r\\n 3 = McHenry\\r\\n 4 = fast Balke\\r\\n 5 = Balke\\r\\n 6 = Noughton \\r\\n 7 = bike 150 kpa min/min (Not sure if \"kpa min/min\" is what was written!)\\r\\n 8 = bike 125 kpa min/min \\r\\n 9 = bike 100 kpa min/min\\r\\n 10 = bike 75 kpa min/min\\r\\n 11 = bike 50 kpa min/min\\r\\n 12 = arm ergometer\\r\\n 29 thaldur: duration of exercise test in minutes\\r\\n 30 thaltime: time when ST measure depression was noted\\r\\n 31 met: mets achieved\\r\\n 32 thalach: maximum heart rate achieved\\r\\n 33 thalrest: resting heart rate\\r\\n 34 tpeakbps: peak exercise blood pressure (first of 2 parts)\\r\\n 35 tpeakbpd: peak exercise blood pressure (second of 2 parts)\\r\\n 36 dummy\\r\\n 37 trestbpd: resting blood pressure\\r\\n 38 exang: exercise induced angina (1 = yes; 0 = no)\\r\\n 39 xhypo: (1 = yes; 0 = no)\\r\\n 40 oldpeak = ST depression induced by exercise relative to rest\\r\\n 41 slope: the slope of the peak exercise ST segment\\r\\n -- Value 1: upsloping\\r\\n -- Value 2: flat\\r\\n -- Value 3: downsloping\\r\\n 42 rldv5: height at rest\\r\\n 43 rldv5e: height at peak exercise\\r\\n 44 ca: number of major vessels (0-3) colored by flourosopy\\r\\n 45 restckm: irrelevant\\r\\n 46 exerckm: irrelevant\\r\\n 47 restef: rest raidonuclid (sp?) ejection fraction\\r\\n 48 restwm: rest wall (sp?) motion abnormality\\r\\n 0 = none\\r\\n 1 = mild or moderate\\r\\n 2 = moderate or severe\\r\\n 3 = akinesis or dyskmem (sp?)\\r\\n 49 exeref: exercise radinalid (sp?) ejection fraction\\r\\n 50 exerwm: exercise wall (sp?) motion \\r\\n 51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect\\r\\n 52 thalsev: not used\\r\\n 53 thalpul: not used\\r\\n 54 earlobe: not used\\r\\n 55 cmo: month of cardiac cath (sp?) 
(perhaps \"call\")\\r\\n 56 cday: day of cardiac cath (sp?)\\r\\n 57 cyr: year of cardiac cath (sp?)\\r\\n 58 num: diagnosis of heart disease (angiographic disease status)\\r\\n -- Value 0: < 50% diameter narrowing\\r\\n -- Value 1: > 50% diameter narrowing\\r\\n (in any major vessel: attributes 59 through 68 are vessels)\\r\\n 59 lmt\\r\\n 60 ladprox\\r\\n 61 laddist\\r\\n 62 diag\\r\\n 63 cxmain\\r\\n 64 ramus\\r\\n 65 om1\\r\\n 66 om2\\r\\n 67 rcaprox\\r\\n 68 rcadist\\r\\n 69 lvx1: not used\\r\\n 70 lvx2: not used\\r\\n 71 lvx3: not used\\r\\n 72 lvx4: not used\\r\\n 73 lvf: not used\\r\\n 74 cathef: not used\\r\\n 75 junk: not used\\r\\n 76 name: last name of patient (I replaced this with the dummy string \"name\")', 'citation': None}}\n", - " name role type demographic \\\n", - "0 age Feature Integer Age \n", - "1 sex Feature Categorical Sex \n", - "2 cp Feature Categorical None \n", - "3 trestbps Feature Integer None \n", - "4 chol Feature Integer None \n", - "5 fbs Feature Categorical None \n", - "6 restecg Feature Categorical None \n", - "7 thalach Feature Integer None \n", - "8 exang Feature Categorical None \n", - "9 oldpeak Feature Integer None \n", - "10 slope Feature Categorical None \n", - "11 ca Feature Integer None \n", - "12 thal Feature Categorical None \n", - "13 num Target Integer None \n", - "\n", - " description units missing_values \n", - "0 None years no \n", - "1 None None no \n", - "2 None None no \n", - "3 resting blood pressure (on admission to the ho... mm Hg no \n", - "4 serum cholestoral mg/dl no \n", - "5 fasting blood sugar > 120 mg/dl None no \n", - "6 None None no \n", - "7 maximum heart rate achieved None no \n", - "8 exercise induced angina None no \n", - "9 ST depression induced by exercise relative to ... None no \n", - "10 None None no \n", - "11 number of major vessels (0-3) colored by flour... 
None yes \n", - "12 None None yes \n", - "13 diagnosis of heart disease None no \n" - ] - } - ], - "source": [ - "from ucimlrepo import fetch_ucirepo \n", - " \n", - "# fetch dataset \n", - "heart_disease = fetch_ucirepo(id=45) \n", - " \n", - "# data (as pandas dataframes) \n", - "X = heart_disease.data.features \n", - "y = heart_disease.data.targets \n", - " \n", - "# metadata \n", - "print(heart_disease.metadata) \n", - " \n", - "# variable information \n", - "print(heart_disease.variables) " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "b885f94c-e6b6-41d9-a484-57dc5ba98ac6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e3de4992aeec499aa299d79cde4a73ec", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Summarize dataset: 0%| | 0/5 [00:00" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "profile = ProfileReport(X, title=\"heart disease data report\")\n", - "\n", - "profile.to_notebook_iframe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7b29ede-9e6e-4a0b-8803-827a579c2715", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 27bc641b2b9efaa3c317521691ec1a9047ce646b Mon Sep 17 00:00:00 2001 From: nicole Date: Fri, 7 Jun 2024 13:54:37 +0200 Subject: [PATCH 2/2] add codeprojekt nicole --- codeprojekt_nicole.ipynb | 989 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 989 insertions(+) create mode 100644 
codeprojekt_nicole.ipynb diff --git a/codeprojekt_nicole.ipynb b/codeprojekt_nicole.ipynb new file mode 100644 index 0000000..8892a9c --- /dev/null +++ b/codeprojekt_nicole.ipynb @@ -0,0 +1,989 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ffe86c63-5718-4c71-a3a3-94672d34e676", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# import libraries\n", + "from ucimlrepo import fetch_ucirepo\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "d1b8dc0f-8e8b-485a-9147-cf694f832357", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Explorative Datenanalyse\n", + "Deskriptive Statistik: Berechnung von Mittelwerten, Medianen, Standardabweichungen und anderen Kenngrößen für die verschiedenen Merkmale. \n", + "Visualisierung: Nutzung von Histogrammen, Boxplots und Scatterplots, um die Verteilung und Beziehungen der Merkmale zu analysieren. \n", + "Korrelation: Untersuchung der Korrelation zwischen den Merkmalen und dem Outcome."
+ ] + }, + { + "cell_type": "markdown", + "id": "282a1c63-e28c-4d81-b1dc-1d971e9a4c12", + "metadata": {}, + "source": [ + "### Dataset preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "130b7fb5-7f85-4393-a741-05246920ce63", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 303 entries, 0 to 302\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 age 303 non-null int64 \n", + " 1 sex 303 non-null int64 \n", + " 2 cp 303 non-null int64 \n", + " 3 trestbps 303 non-null int64 \n", + " 4 chol 303 non-null int64 \n", + " 5 fbs 303 non-null int64 \n", + " 6 restecg 303 non-null int64 \n", + " 7 thalach 303 non-null int64 \n", + " 8 exang 303 non-null int64 \n", + " 9 oldpeak 303 non-null float64\n", + " 10 slope 303 non-null int64 \n", + " 11 ca 299 non-null float64\n", + " 12 thal 301 non-null float64\n", + " 13 healthy 303 non-null int64 \n", + "dtypes: float64(3), int64(11)\n", + "memory usage: 33.3 KB\n" + ] + } + ], + "source": [ + "# fetch dataset \n", + "heart_disease = fetch_ucirepo(id=45)\n", + "\n", + "# features\n", + "x = heart_disease.data.features \n", + "\n", + "# target variable\n", + "y = heart_disease.data.targets \n", + "\n", + "# complete dataframe\n", + "heart_df = pd.concat([x, y], axis=1).rename(columns={'num':'healthy'})\n", + "\n", + "# replace values for target variable: 0=sick, 1=healthy\n", + "heart_df['healthy'] = heart_df['healthy'].replace({0: 1, 1: 0, 2: 0, 3: 0, 4: 0})\n", + "\n", + "# view summary of dataset\n", + "heart_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0780f01d-771b-4443-86b2-0e2aa8ef464f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Number of rows with NaN values to be removed: 6\n", + "Total number of rows: 303\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathalhealthy
063111452331215002.330.06.01
167141602860210811.523.03.00
267141202290212912.622.07.00
337131302500018703.530.03.01
441021302040217201.410.03.01
.............................................
29757041402410012310.220.07.00
29845111102640013201.220.07.00
29968141441931014103.422.07.00
30057141301310011511.221.07.00
30157021302360217400.021.03.00
\n", + "

297 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " age sex cp trestbps chol fbs restecg thalach exang oldpeak \\\n", + "0 63 1 1 145 233 1 2 150 0 2.3 \n", + "1 67 1 4 160 286 0 2 108 1 1.5 \n", + "2 67 1 4 120 229 0 2 129 1 2.6 \n", + "3 37 1 3 130 250 0 0 187 0 3.5 \n", + "4 41 0 2 130 204 0 2 172 0 1.4 \n", + ".. ... ... .. ... ... ... ... ... ... ... \n", + "297 57 0 4 140 241 0 0 123 1 0.2 \n", + "298 45 1 1 110 264 0 0 132 0 1.2 \n", + "299 68 1 4 144 193 1 0 141 0 3.4 \n", + "300 57 1 4 130 131 0 0 115 1 1.2 \n", + "301 57 0 2 130 236 0 2 174 0 0.0 \n", + "\n", + " slope ca thal healthy \n", + "0 3 0.0 6.0 1 \n", + "1 2 3.0 3.0 0 \n", + "2 2 2.0 7.0 0 \n", + "3 3 0.0 3.0 1 \n", + "4 1 0.0 3.0 1 \n", + ".. ... ... ... ... \n", + "297 2 0.0 7.0 0 \n", + "298 2 0.0 7.0 0 \n", + "299 2 2.0 7.0 0 \n", + "300 2 1.0 7.0 0 \n", + "301 2 1.0 3.0 0 \n", + "\n", + "[297 rows x 14 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data cleaning\n", + "rows_with_nan = heart_df.isna().any(axis=1).sum()\n", + "print(f\"Number of rows with NaN values to be removed: {rows_with_nan}\")\n", + "total_rows = heart_df.shape[0]\n", + "print(f\"Total number of rows: {total_rows}\")\n", + "\n", + "# remove rows with NaN values\n", + "heart_df = heart_df.dropna()\n", + "\n", + "heart_df" + ] + }, + { + "cell_type": "markdown", + "id": "15d04b9c-2803-43e8-98a0-118dda07536f", + "metadata": {}, + "source": [ + "### Clustering and PCA\n", + "Um zu analysieren, ob ähnliche Merkmale auch zur gleichen Diagnose führen, wird zuerst ein k-Means Clustering angewandt." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "76dc9ce9-8e81-4a50-ad4d-557576a58a92", + "metadata": {}, + "outputs": [], + "source": [ + "# prepare data for clustering\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# split data into featrues (x) and targets (y)\n", + "X = heart_df.iloc[:, :-1]\n", + "y = heart_df.iloc[:, -1]\n", + "\n", + "# scale data (MinMax probieren)\n", + "scaler = StandardScaler() \n", + "X_scaled = scaler.fit_transform (X)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b56af950-7a58-43b5-8a83-1fd95b0c7220", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KMeans(n_clusters=2, n_init='auto', random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KMeans(n_clusters=2, n_init='auto', random_state=42)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# k-means clustering\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "# prepare model\n", + "kmeans = KMeans(n_clusters=2, random_state=42, n_init='auto')\n", + "kmeans.fit(X_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4dfc00f7-a2b6-4408-b48d-c24a9c32f74d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "79.12% der Datensätze wurden mithilfe von KMeans richtig einem Cluster zugeordnet\n" + ] + } + ], + "source": [ + "# calculate percentage of data points correctly assigned to each cluster\n", + "cluster1 = kmeans.labels_ == 0\n", + "cluster2 = kmeans.labels_ == 1\n", + "\n", + "perc_cluster1 = np.round(np.mean(cluster1 == y) * 100, decimals=2)\n", + "perc_cluster2 = np.round(np.mean(cluster2 == y) * 100, decimals=2)\n", + "\n", + "# choose cluster with higher correspondence\n", + "if perc_cluster1 > perc_cluster2:\n", + " km_healthy = cluster1\n", + " max_perc = perc_cluster1\n", + "else:\n", + " km_healthy = cluster2\n", + " max_perc = perc_cluster2\n", + "\n", + "print(f\"{max_perc}% der Datensätze wurden mithilfe von KMeans richtig einem Cluster zugeordnet\")\n", + "\n", + "# hier vlt noch irgendwie diskutieren ob das ein smart way ist um das auszuwerten, anscheinend gibt's dafür andere Metriken" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "d5edbc93-22e2-4086-aa77-ec7a76765048", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Proportion of variance explained by each principal component:\n", + "[0.23695056 0.12349486 0.0960383 0.0851614 0.07638123 0.06720242\n", + " 0.06493755 0.05994447 0.05268635 0.04368496 0.03486402 0.03140469\n", + " 0.02724919]\n" + 
] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# apply PCA\n", + "pca = PCA()\n", + "pca.fit(X_scaled)\n", + "\n", + "print(f\"Proportion of variance explained by each principal component:\\n{pca.explained_variance_ratio_}\")\n", + "\n", + "# Plot the proportion of variance explained\n", + "plt.figure(figsize=(8, 6))\n", + "plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, color='skyblue')\n", + "plt.xlabel('Principal Component')\n", + "plt.ylabel('Proportion of Variance Explained')\n", + "plt.title('Proportion of Variance Explained by Principal Components')\n", + "plt.xticks(range(1, len(pca.explained_variance_ratio_) + 1))\n", + "plt.grid(axis='y', linestyle='--', alpha=0.7)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "c3d40f19-8299-44ab-bb8a-326fdd2008c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contributions of features to the first principal component:\n", + " Feature Contribution\n", + "9 oldpeak 0.397006\n", + "10 slope 0.352024\n", + "12 thal 0.346208\n", + "8 exang 0.333140\n", + "11 ca 0.306420\n", + "2 cp 0.286212\n", + "0 age 0.285868\n", + "3 trestbps 0.167841\n", + "6 restecg 0.145938\n", + "1 sex 0.116802\n", + "4 chol 0.083504\n", + "5 fbs 0.076094\n", + "7 thalach -0.392706\n" + ] + } + ], + "source": [ + "# get the loadings or weights of features in the first principal component\n", + "first_pc_loadings = pca.components_[0]\n", + "\n", + "# create a DataFrame to display the contributions of features to the first principal component\n", + "pc_loadings_df = pd.DataFrame({\"Feature\": X.columns, \"Contribution\": first_pc_loadings})\n", + "pc_loadings_df = pc_loadings_df.sort_values(by=\"Contribution\", ascending=False)\n", + "\n", + "print(\"Contributions of features to the first principal component:\")\n", + "print(pc_loadings_df)" + ] + }, + { + "cell_type": 
"markdown", + "id": "b68452ef-aa60-4b2f-aec4-4762f9136360", + "metadata": {}, + "source": [ + "#### Interpretation:\n", + "Die \"Proportion of Variance Explained by Principal Components\" gibt an, wie viel der gesamten Varianz in den Daten durch jede einzelne Hauptkomponente erklärt wird.\n", + "Hauptkomponenten mit einer höheren Proportion der erklärten Varianz tragen mehr zur Erklärung der Variation in den Daten bei und sind daher wichtiger.\n", + "Eine höhere Proportion der erklärten Varianz in einer bestimmten Hauptkomponente kann darauf hinweisen, dass diese Hauptkomponente wichtige Informationen über die Variation in den Merkmalen oder Beobachtungen enthält, die mit den zugrunde liegenden Mustern oder Gruppierungen in den Herzdaten zusammenhängen könnten. Dabei ist wichtig zu erwähnen, dass die Ergebnisse nicht in direktem Zusammenhang mit der goal-Variable stehen, sondern lediglich auf Muster in den Daten hinweisen.\n", + "\n", + "Merkmale, die hohe Contributions zu der ersten Hauptkomponente haben, tragen wesentlich zur Variation in den Daten bei. Diese Merkmale haben möglicherweise eine starke Korrelation mit der ersten Hauptkomponente und könnten daher wichtigere Informationen über die zugrunde liegenden Muster oder Strukturen in den Daten liefern, als jene mit einer geringen Contribution." + ] + }, + { + "cell_type": "markdown", + "id": "e8dcc395-b548-4dde-bd46-67a962af6f03", + "metadata": {}, + "source": [ + "### Cholesterinwerte im Vergleich Frauen/Männer" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "ed81d87f-a0b0-4dbb-88f6-471423ce3151", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "heart_df['sex'] = heart_df['sex'].replace({0: 'female', 1: 'male'})\n", + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(x='sex', y='chol', data=heart_df)\n", + "plt.title('Cholesterinwerte nach Geschlecht')\n", + "plt.xlabel('Geschlecht')\n", + "plt.ylabel('Cholesterin in mg/dl')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "3617a148-3ae4-4eb9-99d1-c7a6aa78bd17", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Geschlecht Untere_Grenze Obere_Grenze\n", + "0 Männer 234.288517 246.199046\n", + "1 Frauen 249.044612 275.413721\n" + ] + } + ], + "source": [ + "# Konfidenzintervall (95%) für Cholesterin Level jeweils für Männer und Frauen\n", + "from scipy import stats\n", + "\n", + "# Filtern nach Geschlecht und Berechnen des Konfidenzintervalls\n", + "conf_level = 0.95\n", + "chol_men = heart_df.loc[heart_df['sex'] == 'male', 'chol']\n", + "chol_women = heart_df.loc[heart_df['sex'] == 'female', 'chol']\n", + "conf_int_men = stats.t.interval(conf_level, len(chol_men) - 1, loc=chol_men.mean(), scale=stats.sem(chol_men))\n", + "conf_int_women = stats.t.interval(conf_level, len(chol_women) - 1, loc=chol_women.mean(), scale=stats.sem(chol_women))\n", + "\n", + "result_table_men_vs_women = pd.DataFrame({\n", + " 'Geschlecht': ['Männer', 'Frauen'],\n", + " 'Untere_Grenze': [conf_int_men[0], conf_int_women[0]],\n", + " 'Obere_Grenze': [conf_int_men[1], conf_int_women[1]]\n", + "})\n", + "\n", + "print(result_table_men_vs_women)" + ] + }, + { + "cell_type": "markdown", + "id": "1b4ad40f-dd3b-4309-a5f3-a085d042c1eb", + "metadata": {}, + "source": [ + "### Cholesterin im Vergleich zur Erkrankung" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "60f42093-3616-4e2b-8557-56794af3f25b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Boxplot gruppiert nach Diagnose\n", + "plt.figure(figsize=(10, 6))\n", + "sns.boxplot(x='healthy', y='chol', data=heart_df)\n", + "plt.title('Cholesterin im Vergleich zur Erkrankung')\n", + "plt.xlabel('Diagnose')\n", + "plt.ylabel('Cholesterin in mg/dl')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "7bef4e62-968e-4a2f-9948-063fac1e7a88", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Diagnose Untere_Grenze Obere_Grenze\n", + "0 Gesund 234.397652 250.882836\n", + "1 Krank 243.175250 259.774391\n" + ] + } + ], + "source": [ + "# Filtern nach healthy und Berechnen des Konfidenzintervalls\n", + "conf_level = 0.95\n", + "chol_healthy = heart_df.loc[heart_df['healthy'] == 1, 'chol']\n", + "chol_sick = heart_df.loc[heart_df['healthy'] == 0, 'chol']\n", + "conf_int_healthy = stats.t.interval(conf_level, len(chol_healthy) - 1, loc=chol_healthy.mean(), scale=stats.sem(chol_healthy))\n", + "conf_int_sick = stats.t.interval(conf_level, len(chol_sick) - 1, loc=chol_sick.mean(), scale=stats.sem(chol_sick))\n", + "\n", + "result_table_healthy_vs_sick = pd.DataFrame({\n", + " 'Diagnose': ['Gesund', 'Krank'],\n", + " 'Untere_Grenze': [conf_int_healthy[0], conf_int_sick[0]],\n", + " 'Obere_Grenze': [conf_int_healthy[1], conf_int_sick[1]]\n", + "})\n", + "\n", + "print(result_table_healthy_vs_sick)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "2aa2b03a-1da6-4846-be5d-1070fa1ae43e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-Statistik: 1.4829209163941466\n", + "p-Wert: 0.06957083510218263\n", + "Der p-Wert ist größer als oder gleich 0.05, daher wird die Nullhypothese nicht abgelehnt.\n", + "Es gibt keine signifikanten Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\n" + 
] + } + ], + "source": [ + "# t-Test\n", + "# Nullhypothese (H0) = Cholesterinwert bei Kranken ist kleiner als oder gleich dem bei Gesunden\n", + "# Alternativhypothese (H1) = Cholesterinwert bei Kranken ist höher als bei Gesunden\n", + "\n", + "from scipy.stats import ttest_ind\n", + "\n", + "# Daten für gesunde und kranke Personen\n", + "chol_healthy = heart_df.loc[heart_df['healthy'] == 1, 'chol']\n", + "chol_sick = heart_df.loc[heart_df['healthy'] == 0, 'chol']\n", + "\n", + "# Durchführung des t-Tests\n", + "t_statistic, p_value = ttest_ind(chol_sick, chol_healthy, alternative='greater')\n", + "\n", + "# Ausgabe der Ergebnisse\n", + "print(\"t-Statistik:\", t_statistic)\n", + "print(\"p-Wert:\", p_value)\n", + "\n", + "# Überprüfung der Nullhypothese\n", + "if p_value < 0.05:\n", + " print(\"Der p-Wert ist kleiner als 0.05, daher wird die Nullhypothese abgelehnt.\")\n", + " print(\"Es gibt signifikante Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\")\n", + "else:\n", + " print(\"Der p-Wert ist größer als oder gleich 0.05, daher wird die Nullhypothese nicht abgelehnt.\")\n", + " print(\"Es gibt keine signifikanten Hinweise darauf, dass der Cholesterinwert bei kranken Personen höher ist als bei gesunden Personen.\")" + ] + }, + { + "cell_type": "markdown", + "id": "c3852db8-3787-4ddf-aed7-80396f7a89ac", + "metadata": {}, + "source": [ + "### Systolischer Ruheblutdruck" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "f063878c-5866-4a19-b065-a41d3e26e290", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version.
Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/nicole/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(x='sex', y='trestbps', data=heart_df)\n", + "plt.title('Überblick über Blutdruck')\n", + "plt.xlabel('Geschlecht')\n", + "plt.ylabel('Systolischer Ruheblutdruck (in mmHg bei Aufnahme ins Krankenhaus)')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "f20c6b80-bac5-489e-98e1-b5f2d9d90615", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Diagnose Untere_Grenze Obere_Grenze\n", + "0 Gesund 126.751354 131.748646\n", + "1 Krank 131.420542 137.716148\n" + ] + } + ], + "source": [ + "# Filtern nach goal und Berechnen des Konfidenzintervalls\n", + "conf_level = 0.95\n", + "blutdruck_gesund = heart_df.loc[heart_df['healthy'] == 1, 'trestbps']\n", + "blutdruck_krank = heart_df.loc[heart_df['healthy'] == 0, 'trestbps']\n", + "conf_int_gesund = stats.t.interval(conf_level, len(blutdruck_gesund) - 1, loc=blutdruck_gesund.mean(), scale=stats.sem(blutdruck_gesund))\n", + "conf_int_krank = stats.t.interval(conf_level, len(blutdruck_krank) - 1, loc=blutdruck_krank.mean(), scale=stats.sem(blutdruck_krank))\n", + "\n", + "# Erstellen der Tabelle\n", + "result_table_blutdruck = pd.DataFrame({\n", + " 'Diagnose': ['Gesund', 'Krank'],\n", + " 'Untere_Grenze': [conf_int_gesund[0], conf_int_krank[0]],\n", + " 'Obere_Grenze': [conf_int_gesund[1], conf_int_krank[1]]\n", + "})\n", + "\n", + "# Anzeige der Tabelle\n", + "print(result_table_blutdruck)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "bc919e7f-2b7b-41c1-a5b3-a97d61baa6ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-Statistik: 2.647004396805282\n", + "p-Wert: 0.004274134464297464\n", + "Der p-Wert ist kleiner als 0.05, daher wird die Nullhypothese abgelehnt.\n", + "Es gibt signifikante Hinweise darauf, dass der Blutdruck bei 
# %% One-sided two-sample t-test: resting blood pressure, sick vs. healthy
# H0: sick patients have a resting blood pressure lower than or equal to that of healthy patients
# H1: sick patients have a higher resting blood pressure than healthy patients

# Significance level. Defined once so the decision and the printed messages
# can never disagree about the threshold.
alpha = 0.05

# Resting blood pressure per group, split by the 'healthy' flag
# (1 -> healthy sample, 0 -> sick sample).
# Missing values are dropped up front: ttest_ind propagates NaN by default, and a
# NaN p-value would make `p_value < alpha` False and wrongly report
# "null hypothesis not rejected".
blutdruck_healthy = heart_df.loc[heart_df['healthy'] == 1, 'trestbps'].dropna()
blutdruck_sick = heart_df.loc[heart_df['healthy'] == 0, 'trestbps'].dropna()

# alternative='greater' tests whether the mean of the first sample (sick) is
# greater than the mean of the second sample (healthy).
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance). If the two
# groups turn out to have clearly different variances, consider equal_var=False
# (Welch's t-test) — confirm before changing, as it alters the reported numbers.
t_statistic, p_value = ttest_ind(blutdruck_sick, blutdruck_healthy, alternative='greater')

print("t-Statistik:", t_statistic)
print("p-Wert:", p_value)

# Decision on the null hypothesis
if p_value < alpha:
    print(f"Der p-Wert ist kleiner als {alpha}, daher wird die Nullhypothese abgelehnt.")
    print("Es gibt signifikante Hinweise darauf, dass der Blutdruck bei kranken Personen höher ist als bei gesunden Personen.")
else:
    print(f"Der p-Wert ist größer als oder gleich {alpha}, daher wird die Nullhypothese nicht abgelehnt.")
    print("Es gibt keine signifikanten Hinweise darauf, dass der Blutdruck bei kranken Personen höher ist als bei gesunden Personen.")
], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}