From 1dd196931dc788e02edda0be4bf6fffa85130cab Mon Sep 17 00:00:00 2001 From: nicole Date: Fri, 7 Jun 2024 13:44:29 +0200 Subject: [PATCH] add dataset preparation, clustering and PCA, hypothesis tests --- Data Cleaning_05_04_2024.ipynb | 22424 ------------------------------- 1 file changed, 22424 deletions(-) delete mode 100644 Data Cleaning_05_04_2024.ipynb diff --git a/Data Cleaning_05_04_2024.ipynb b/Data Cleaning_05_04_2024.ipynb deleted file mode 100644 index 2eb416f..0000000 --- a/Data Cleaning_05_04_2024.ipynb +++ /dev/null @@ -1,22424 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e18a679f", - "metadata": {}, - "source": [ - "## Import Python libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4922164f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import ydata_profiling\n", - "#from pandas_profiling import ProfileReport\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "72caaa8b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "markdown", - "id": "dfe3a57a", - "metadata": {}, - "source": [ - "## Load Data (Breast Cancer Dataset)\n", - "### Sample code number: Unique identifier for each tissue sample.\n", - "### Clump Thickness: Assessment of the thickness of tumor cell clusters (1 - 10).\n", - "Uniformity of Cell Size: Uniformity in the size of tumor cells (1 - 10).\n", - "Uniformity of Cell Shape: Uniformity in the shape of tumor cells (1 - 10).\n", - "Marginal Adhesion: Degree of adhesion of tumor cells to surrounding tissue (1 - 10).\n", - "Single Epithelial Cell Size: Size of individual tumor cells (1 - 10).\n", - "Bare Nuclei: Presence of nuclei without surrounding cytoplasm (1 - 10).\n", - "Bland Chromatin: Assessment of chromatin structure in tumor cells (1 - 10).\n", - "Normal Nucleoli: Presence of normal-looking nucleoli in tumor cells (1 - 10).\n", - "Mitoses: Frequency of mitotic cell divisions (1 - 10).\n", - "Class: Classification of tumor type (2 for benign, 4 for malignant)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "bd9a55aa", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './breast-cancer-wisconsin.data'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./breast-cancer-wisconsin.data\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m columns \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSample code number\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClump Thickness\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUniformity of Cell Size\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUniformity of Cell Shape\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMarginal Adhesion\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSingle Epithelial Cell Size\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBare Nuclei\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBland Chromatin\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNormal Nucleoli\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMitoses\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClass\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 5\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(url, names\u001b[38;5;241m=\u001b[39mcolumns)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[1;32m 1662\u001b[0m f,\n\u001b[1;32m 1663\u001b[0m mode,\n\u001b[1;32m 1664\u001b[0m encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1665\u001b[0m compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1666\u001b[0m memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 1667\u001b[0m is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[1;32m 1668\u001b[0m errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 1669\u001b[0m storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1670\u001b[0m )\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 860\u001b[0m handle,\n\u001b[1;32m 861\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 862\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 863\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 864\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 865\u001b[0m )\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './breast-cancer-wisconsin.data'" - ] - } - ], - "source": [ - "url = \"./breast-cancer-wisconsin.data\"\n", - "columns = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',\n", - " 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', \n", - " 'Normal Nucleoli', 'Mitoses', 'Class']\n", - "df = pd.read_csv(url, names=columns)" - ] - }, - { - "cell_type": "markdown", - "id": "3be5a2e3", - "metadata": {}, - "source": [ - "## Show data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "28791c54", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Sample code numberClump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitosesClass
010000255111213112
1100294554457103212
210154253111223112
310162776881343712
410170234113213112
\n", - "
" - ], - "text/plain": [ - " Sample code number Clump Thickness Uniformity of Cell Size \\\n", - "0 1000025 5 1 \n", - "1 1002945 5 4 \n", - "2 1015425 3 1 \n", - "3 1016277 6 8 \n", - "4 1017023 4 1 \n", - "\n", - " Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size \\\n", - "0 1 1 2 \n", - "1 4 5 7 \n", - "2 1 1 2 \n", - "3 8 1 3 \n", - "4 1 3 2 \n", - "\n", - " Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses Class \n", - "0 1 3 1 1 2 \n", - "1 10 3 2 1 2 \n", - "2 2 3 1 1 2 \n", - "3 4 3 7 1 2 \n", - "4 1 3 1 1 2 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "90314305", - "metadata": {}, - "source": [ - "## Generate Report with ydata_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2053a002", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ipywidgets in /home/mahehsma/anaconda3/lib/python3.11/site-packages (8.0.4)\n", - "Requirement already satisfied: ipykernel>=4.5.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (6.25.0)\n", - "Requirement already satisfied: ipython>=6.1.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (8.15.0)\n", - "Requirement already satisfied: traitlets>=4.3.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (5.7.1)\n", - "Requirement already satisfied: widgetsnbextension~=4.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (4.0.5)\n", - "Requirement already satisfied: jupyterlab-widgets~=3.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipywidgets) (3.0.5)\n", - "Requirement already satisfied: comm>=0.1.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.2)\n", - "Requirement already satisfied: debugpy>=1.6.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.6.7)\n", - "Requirement already satisfied: jupyter-client>=6.1.12 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.4.9)\n", - "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.3.0)\n", - "Requirement already satisfied: matplotlib-inline>=0.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.6)\n", - "Requirement already satisfied: nest-asyncio in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.6)\n", - "Requirement already satisfied: packaging in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (23.1)\n", - "Requirement already satisfied: psutil in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.9.0)\n", - "Requirement already satisfied: pyzmq>=20 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (23.2.0)\n", - "Requirement already satisfied: tornado>=6.1 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.3.2)\n", - "Requirement already satisfied: backcall in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", - "Requirement already satisfied: decorator in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n", - "Requirement already satisfied: jedi>=0.16 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.1)\n", - "Requirement already satisfied: pickleshare in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)\n", - "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.36)\n", - "Requirement already satisfied: pygments>=2.4.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (2.15.1)\n", - "Requirement already satisfied: stack-data in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", - "Requirement already satisfied: pexpect>4.3 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n", - "Requirement already satisfied: entrypoints in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (0.4)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (2.8.2)\n", - "Requirement already satisfied: platformdirs>=2.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.5.1->ipywidgets) (3.10.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets) (0.2.5)\n", - "Requirement already satisfied: executing in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.8.3)\n", - "Requirement already satisfied: asttokens in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.5)\n", - "Requirement already satisfied: pure-eval in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n", - "Requirement already satisfied: six>=1.5 in /home/mahehsma/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (1.16.0)\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39msystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip install ipywidgets\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mydata_profiling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ProfileReport\n\u001b[0;32m----> 4\u001b[0m profile \u001b[38;5;241m=\u001b[39m ProfileReport(df, title\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mProfiling Report\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m profile\u001b[38;5;241m.\u001b[39mto_notebook_iframe()\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], - "source": [ - "!pip install ipywidgets\n", - "from ydata_profiling import ProfileReport\n", - "\n", - "profile = ProfileReport(df, title=\"Profiling Report\")\n", - "\n", - "profile.to_notebook_iframe()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b642b1f3-4720-4028-805f-2a8e88958f41", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting ucimlrepo\n", - " Obtaining dependency information for ucimlrepo from https://files.pythonhosted.org/packages/22/47/9350b2eeeaef8c0fd3ec3505c8a0481b576845b3df0d71c76f989c23d3c6/ucimlrepo-0.0.6-py3-none-any.whl.metadata\n", - " Downloading ucimlrepo-0.0.6-py3-none-any.whl.metadata (5.3 kB)\n", - "Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)\n", - "Installing collected packages: ucimlrepo\n", - "Successfully installed ucimlrepo-0.0.6\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "pip install ucimlrepo" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "cf67e63c-778b-4bb0-b48b-d50bc5fd2cba", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sandhu, K. Guppy, S. Lee, V. Froelicher', 'published_in': 'American Journal of Cardiology', 'year': 1989, 'url': 'https://www.semanticscholar.org/paper/a7d714f8f87bfc41351eb5ae1e5472f0ebbe0574', 'doi': None}, 'additional_info': {'summary': 'This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to date. The \"goal\" field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0). \\n \\nThe names and social security numbers of the patients were recently removed from the database, replaced with dummy values.\\n\\nOne file has been \"processed\", that one containing the Cleveland database. All four unprocessed files also exist in this directory.\\n\\nTo see Test Costs (donated by Peter Turney), please see the folder \"Costs\" ', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Only 14 attributes used:\\r\\n 1. #3 (age) \\r\\n 2. #4 (sex) \\r\\n 3. #9 (cp) \\r\\n 4. #10 (trestbps) \\r\\n 5. #12 (chol) \\r\\n 6. #16 (fbs) \\r\\n 7. #19 (restecg) \\r\\n 8. #32 (thalach) \\r\\n 9. #38 (exang) \\r\\n 10. #40 (oldpeak) \\r\\n 11. #41 (slope) \\r\\n 12. #44 (ca) \\r\\n 13. #51 (thal) \\r\\n 14. #58 (num) (the predicted attribute)\\r\\n\\r\\nComplete attribute documentation:\\r\\n 1 id: patient identification number\\r\\n 2 ccf: social security number (I replaced this with a dummy value of 0)\\r\\n 3 age: age in years\\r\\n 4 sex: sex (1 = male; 0 = female)\\r\\n 5 painloc: chest pain location (1 = substernal; 0 = otherwise)\\r\\n 6 painexer (1 = provoked by exertion; 0 = otherwise)\\r\\n 7 relrest (1 = relieved after rest; 0 = otherwise)\\r\\n 8 pncaden (sum of 5, 6, and 7)\\r\\n 9 cp: chest pain type\\r\\n -- Value 1: typical angina\\r\\n -- Value 2: atypical angina\\r\\n -- Value 3: non-anginal pain\\r\\n -- Value 4: asymptomatic\\r\\n 10 trestbps: resting blood pressure (in mm Hg on admission to the hospital)\\r\\n 11 htn\\r\\n 12 chol: serum cholestoral in mg/dl\\r\\n 13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker)\\r\\n 14 cigs (cigarettes per day)\\r\\n 15 years (number of years as a smoker)\\r\\n 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)\\r\\n 17 dm (1 = history of diabetes; 0 = no such history)\\r\\n 18 famhist: family history of coronary artery disease (1 = yes; 0 = no)\\r\\n 19 restecg: resting electrocardiographic results\\r\\n -- Value 0: normal\\r\\n -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)\\r\\n -- Value 2: showing probable or definite left ventricular hypertrophy by Estes\\' criteria\\r\\n 20 ekgmo (month of exercise ECG reading)\\r\\n 21 ekgday(day of exercise ECG reading)\\r\\n 22 ekgyr (year of exercise ECG reading)\\r\\n 23 dig (digitalis used furing exercise ECG: 1 = yes; 0 = no)\\r\\n 24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no)\\r\\n 25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no)\\r\\n 26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no)\\r\\n 27 diuretic (diuretic used used during exercise ECG: 1 = yes; 0 = no)\\r\\n 28 proto: exercise protocol\\r\\n 1 = Bruce \\r\\n 2 = Kottus\\r\\n 3 = McHenry\\r\\n 4 = fast Balke\\r\\n 5 = Balke\\r\\n 6 = Noughton \\r\\n 7 = bike 150 kpa min/min (Not sure if \"kpa min/min\" is what was written!)\\r\\n 8 = bike 125 kpa min/min \\r\\n 9 = bike 100 kpa min/min\\r\\n 10 = bike 75 kpa min/min\\r\\n 11 = bike 50 kpa min/min\\r\\n 12 = arm ergometer\\r\\n 29 thaldur: duration of exercise test in minutes\\r\\n 30 thaltime: time when ST measure depression was noted\\r\\n 31 met: mets achieved\\r\\n 32 thalach: maximum heart rate achieved\\r\\n 33 thalrest: resting heart rate\\r\\n 34 tpeakbps: peak exercise blood pressure (first of 2 parts)\\r\\n 35 tpeakbpd: peak exercise blood pressure (second of 2 parts)\\r\\n 36 dummy\\r\\n 37 trestbpd: resting blood pressure\\r\\n 38 exang: exercise induced angina (1 = yes; 0 = no)\\r\\n 39 xhypo: (1 = yes; 0 = no)\\r\\n 40 oldpeak = ST depression induced by exercise relative to rest\\r\\n 41 slope: the slope of the peak exercise ST segment\\r\\n -- Value 1: upsloping\\r\\n -- Value 2: flat\\r\\n -- Value 3: downsloping\\r\\n 42 rldv5: height at rest\\r\\n 43 rldv5e: height at peak exercise\\r\\n 44 ca: number of major vessels (0-3) colored by flourosopy\\r\\n 45 restckm: irrelevant\\r\\n 46 exerckm: irrelevant\\r\\n 47 restef: rest raidonuclid (sp?) ejection fraction\\r\\n 48 restwm: rest wall (sp?) motion abnormality\\r\\n 0 = none\\r\\n 1 = mild or moderate\\r\\n 2 = moderate or severe\\r\\n 3 = akinesis or dyskmem (sp?)\\r\\n 49 exeref: exercise radinalid (sp?) ejection fraction\\r\\n 50 exerwm: exercise wall (sp?) motion \\r\\n 51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect\\r\\n 52 thalsev: not used\\r\\n 53 thalpul: not used\\r\\n 54 earlobe: not used\\r\\n 55 cmo: month of cardiac cath (sp?) (perhaps \"call\")\\r\\n 56 cday: day of cardiac cath (sp?)\\r\\n 57 cyr: year of cardiac cath (sp?)\\r\\n 58 num: diagnosis of heart disease (angiographic disease status)\\r\\n -- Value 0: < 50% diameter narrowing\\r\\n -- Value 1: > 50% diameter narrowing\\r\\n (in any major vessel: attributes 59 through 68 are vessels)\\r\\n 59 lmt\\r\\n 60 ladprox\\r\\n 61 laddist\\r\\n 62 diag\\r\\n 63 cxmain\\r\\n 64 ramus\\r\\n 65 om1\\r\\n 66 om2\\r\\n 67 rcaprox\\r\\n 68 rcadist\\r\\n 69 lvx1: not used\\r\\n 70 lvx2: not used\\r\\n 71 lvx3: not used\\r\\n 72 lvx4: not used\\r\\n 73 lvf: not used\\r\\n 74 cathef: not used\\r\\n 75 junk: not used\\r\\n 76 name: last name of patient (I replaced this with the dummy string \"name\")', 'citation': None}}\n", - " name role type demographic \\\n", - "0 age Feature Integer Age \n", - "1 sex Feature Categorical Sex \n", - "2 cp Feature Categorical None \n", - "3 trestbps Feature Integer None \n", - "4 chol Feature Integer None \n", - "5 fbs Feature Categorical None \n", - "6 restecg Feature Categorical None \n", - "7 thalach Feature Integer None \n", - "8 exang Feature Categorical None \n", - "9 oldpeak Feature Integer None \n", - "10 slope Feature Categorical None \n", - "11 ca Feature Integer None \n", - "12 thal Feature Categorical None \n", - "13 num Target Integer None \n", - "\n", - " description units missing_values \n", - "0 None years no \n", - "1 None None no \n", - "2 None None no \n", - "3 resting blood pressure (on admission to the ho... mm Hg no \n", - "4 serum cholestoral mg/dl no \n", - "5 fasting blood sugar > 120 mg/dl None no \n", - "6 None None no \n", - "7 maximum heart rate achieved None no \n", - "8 exercise induced angina None no \n", - "9 ST depression induced by exercise relative to ... None no \n", - "10 None None no \n", - "11 number of major vessels (0-3) colored by flour... None yes \n", - "12 None None yes \n", - "13 diagnosis of heart disease None no \n" - ] - } - ], - "source": [ - "from ucimlrepo import fetch_ucirepo \n", - " \n", - "# fetch dataset \n", - "heart_disease = fetch_ucirepo(id=45) \n", - " \n", - "# data (as pandas dataframes) \n", - "X = heart_disease.data.features \n", - "y = heart_disease.data.targets \n", - " \n", - "# metadata \n", - "print(heart_disease.metadata) \n", - " \n", - "# variable information \n", - "print(heart_disease.variables) " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "b885f94c-e6b6-41d9-a484-57dc5ba98ac6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e3de4992aeec499aa299d79cde4a73ec", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Summarize dataset: 0%| | 0/5 [00:00" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "profile = ProfileReport(X, title=\"heart disease data report\")\n", - "\n", - "profile.to_notebook_iframe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7b29ede-9e6e-4a0b-8803-827a579c2715", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}