From a292fe5a0f24833f2be07d5e8844ac7fe77970b7 Mon Sep 17 00:00:00 2001 From: mahehsma Date: Fri, 7 Jun 2024 10:59:33 +0200 Subject: [PATCH] added cleaning notebook --- Cleaning.ipynb | 269 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 Cleaning.ipynb diff --git a/Cleaning.ipynb b/Cleaning.ipynb new file mode 100644 index 0000000..7f71502 --- /dev/null +++ b/Cleaning.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c95fbd16-09ed-497b-892a-473496150996", + "metadata": {}, + "source": [ + "

Cleaning

\n", + "

Import the dataset using the ucimlrepo package

" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3eb339fa-ef85-4544-9ad0-bc22d4de9f1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathalgoal
063111452331215002.330.06.00
167141602860210811.523.03.02
267141202290212912.622.07.01
337131302500018703.530.03.00
441021302040217201.410.03.00
\n", + "
" + ], + "text/plain": [ + " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", + "0 63 1 1 145 233 1 2 150 0 2.3 3 \n", + "1 67 1 4 160 286 0 2 108 1 1.5 2 \n", + "2 67 1 4 120 229 0 2 129 1 2.6 2 \n", + "3 37 1 3 130 250 0 0 187 0 3.5 3 \n", + "4 41 0 2 130 204 0 2 172 0 1.4 1 \n", + "\n", + " ca thal goal \n", + "0 0.0 6.0 0 \n", + "1 3.0 3.0 2 \n", + "2 2.0 7.0 1 \n", + "3 0.0 3.0 0 \n", + "4 0.0 3.0 0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ucimlrepo import fetch_ucirepo\n", + "import pandas as pd\n", + "\n", + "# fetch dataset \n", + "heart_disease = fetch_ucirepo(id=45) \n", + " \n", + "# data (as pandas dataframes) \n", + "X = heart_disease.data.features \n", + "y = heart_disease.data.targets \n", + "\n", + "df = pd.concat([X, y], axis=1)\n", + "df = df.rename(columns={'num':'goal'})\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8c5ab8b9-e46a-4968-b0c8-fe393f093f73", + "metadata": {}, + "source": [ + "

Get an overview of all missing values. As there are only a few (4 in ca, 2 in thal), the affected rows can be dropped.

" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6f7e6a3a-63cb-40e2-8746-937c24b184ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "age 0\n", + "sex 0\n", + "cp 0\n", + "trestbps 0\n", + "chol 0\n", + "fbs 0\n", + "restecg 0\n", + "thalach 0\n", + "exang 0\n", + "oldpeak 0\n", + "slope 0\n", + "ca 4\n", + "thal 2\n", + "goal 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d1639e92-d401-49fb-a1f1-67250ffa2c81", + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d7bf2c46-7885-4dfe-a4e7-8b8439cf0434", + "metadata": {}, + "outputs": [], + "source": [ + "# save 'cleaned' dataset as csv file to\n", + "df.to_csv('./data/dataset_cleaned.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}