diff --git a/Cleaning.ipynb b/Cleaning.ipynb new file mode 100644 index 0000000..7f71502 --- /dev/null +++ b/Cleaning.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c95fbd16-09ed-497b-892a-473496150996", + "metadata": {}, + "source": [ + "
Import the dataset using the `ucimlrepo` package
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3eb339fa-ef85-4544-9ad0-bc22d4de9f1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | age | \n", + "sex | \n", + "cp | \n", + "trestbps | \n", + "chol | \n", + "fbs | \n", + "restecg | \n", + "thalach | \n", + "exang | \n", + "oldpeak | \n", + "slope | \n", + "ca | \n", + "thal | \n", + "goal | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "63 | \n", + "1 | \n", + "1 | \n", + "145 | \n", + "233 | \n", + "1 | \n", + "2 | \n", + "150 | \n", + "0 | \n", + "2.3 | \n", + "3 | \n", + "0.0 | \n", + "6.0 | \n", + "0 | \n", + "
1 | \n", + "67 | \n", + "1 | \n", + "4 | \n", + "160 | \n", + "286 | \n", + "0 | \n", + "2 | \n", + "108 | \n", + "1 | \n", + "1.5 | \n", + "2 | \n", + "3.0 | \n", + "3.0 | \n", + "2 | \n", + "
2 | \n", + "67 | \n", + "1 | \n", + "4 | \n", + "120 | \n", + "229 | \n", + "0 | \n", + "2 | \n", + "129 | \n", + "1 | \n", + "2.6 | \n", + "2 | \n", + "2.0 | \n", + "7.0 | \n", + "1 | \n", + "
3 | \n", + "37 | \n", + "1 | \n", + "3 | \n", + "130 | \n", + "250 | \n", + "0 | \n", + "0 | \n", + "187 | \n", + "0 | \n", + "3.5 | \n", + "3 | \n", + "0.0 | \n", + "3.0 | \n", + "0 | \n", + "
4 | \n", + "41 | \n", + "0 | \n", + "2 | \n", + "130 | \n", + "204 | \n", + "0 | \n", + "2 | \n", + "172 | \n", + "0 | \n", + "1.4 | \n", + "1 | \n", + "0.0 | \n", + "3.0 | \n", + "0 | \n", + "
Get an overview of the missing values per column. Since only a few values are missing (4 in `ca`, 2 in `thal`), the affected rows can be dropped.
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6f7e6a3a-63cb-40e2-8746-937c24b184ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "age 0\n", + "sex 0\n", + "cp 0\n", + "trestbps 0\n", + "chol 0\n", + "fbs 0\n", + "restecg 0\n", + "thalach 0\n", + "exang 0\n", + "oldpeak 0\n", + "slope 0\n", + "ca 4\n", + "thal 2\n", + "goal 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d1639e92-d401-49fb-a1f1-67250ffa2c81", + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d7bf2c46-7885-4dfe-a4e7-8b8439cf0434", + "metadata": {}, + "outputs": [], + "source": [ + "# save 'cleaned' dataset as csv file to\n", + "df.to_csv('./data/dataset_cleaned.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}