{ "cells": [ { "cell_type": "markdown", "id": "c95fbd16-09ed-497b-892a-473496150996", "metadata": {}, "source": [ "

Cleaning

\n", "

Import the dataset using the ucimlrepo package

" ] }, { "cell_type": "code", "execution_count": 1, "id": "3eb339fa-ef85-4544-9ad0-bc22d4de9f1a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathalgoal
063111452331215002.330.06.00
167141602860210811.523.03.02
267141202290212912.622.07.01
337131302500018703.530.03.00
441021302040217201.410.03.00
\n", "
" ], "text/plain": [ " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", "0 63 1 1 145 233 1 2 150 0 2.3 3 \n", "1 67 1 4 160 286 0 2 108 1 1.5 2 \n", "2 67 1 4 120 229 0 2 129 1 2.6 2 \n", "3 37 1 3 130 250 0 0 187 0 3.5 3 \n", "4 41 0 2 130 204 0 2 172 0 1.4 1 \n", "\n", " ca thal goal \n", "0 0.0 6.0 0 \n", "1 3.0 3.0 2 \n", "2 2.0 7.0 1 \n", "3 0.0 3.0 0 \n", "4 0.0 3.0 0 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from ucimlrepo import fetch_ucirepo\n", "import pandas as pd\n", "\n", "# download the dataset with repository id 45\n", "heart_disease = fetch_ucirepo(id=45)\n", "\n", "# features and targets (as pandas dataframes)\n", "X = heart_disease.data.features\n", "y = heart_disease.data.targets\n", "\n", "# join features and targets column-wise, then rename the target column 'num' to 'goal'\n", "df = pd.concat([X, y], axis=1).rename(columns={'num': 'goal'})\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "id": "8c5ab8b9-e46a-4968-b0c8-fe393f093f73", "metadata": {}, "source": [ "

Get an overview of all missing values. As there are only a few (4 in ca, 2 in thal), the affected rows can be dropped.

" ] }, { "cell_type": "code", "execution_count": 2, "id": "6f7e6a3a-63cb-40e2-8746-937c24b184ef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "age 0\n", "sex 0\n", "cp 0\n", "trestbps 0\n", "chol 0\n", "fbs 0\n", "restecg 0\n", "thalach 0\n", "exang 0\n", "oldpeak 0\n", "slope 0\n", "ca 4\n", "thal 2\n", "goal 0\n", "dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of missing values per column\n", "df.isna().sum(axis=0)" ] }, { "cell_type": "code", "execution_count": 3, "id": "d1639e92-d401-49fb-a1f1-67250ffa2c81", "metadata": {}, "outputs": [], "source": [ "# remove every row that contains at least one missing value\n", "df.dropna(axis=0, how='any', inplace=True)" ] }, { "cell_type": "markdown", "id": "192da26d-0fb1-4b06-a046-a41b66576ed0", "metadata": {}, "source": [ "

Preprocessing

\n", "

Split the target into two classes (0 stays 0; 1-4 become 1)

" ] }, { "cell_type": "code", "execution_count": 4, "id": "24675f41-d48f-4e27-a3d8-e303556ee7d1", "metadata": {}, "outputs": [], "source": [ "# collapse the goal values 1-4 into the single class 1; 0 is left unchanged\n", "df['goal'] = df['goal'].replace([1, 2, 3, 4], 1)" ] }, { "cell_type": "code", "execution_count": 5, "id": "d7bf2c46-7885-4dfe-a4e7-8b8439cf0434", "metadata": {}, "outputs": [], "source": [ "# write the cleaned dataset to a csv file for further processing\n", "import os\n", "\n", "# make sure the output directory exists before writing\n", "os.makedirs('data', exist_ok=True)\n", "df.to_csv(path_or_buf='./data/dataset_cleaned.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }