{ "cells": [ { "cell_type": "markdown", "id": "c95fbd16-09ed-497b-892a-473496150996", "metadata": {}, "source": [ "
Import the dataset using the `ucimlrepo` package
" ] }, { "cell_type": "code", "execution_count": 2, "id": "3eb339fa-ef85-4544-9ad0-bc22d4de9f1a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " | age | \n", "sex | \n", "cp | \n", "trestbps | \n", "chol | \n", "fbs | \n", "restecg | \n", "thalach | \n", "exang | \n", "oldpeak | \n", "slope | \n", "ca | \n", "thal | \n", "goal | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "63 | \n", "1 | \n", "1 | \n", "145 | \n", "233 | \n", "1 | \n", "2 | \n", "150 | \n", "0 | \n", "2.3 | \n", "3 | \n", "0.0 | \n", "6.0 | \n", "0 | \n", "
1 | \n", "67 | \n", "1 | \n", "4 | \n", "160 | \n", "286 | \n", "0 | \n", "2 | \n", "108 | \n", "1 | \n", "1.5 | \n", "2 | \n", "3.0 | \n", "3.0 | \n", "2 | \n", "
2 | \n", "67 | \n", "1 | \n", "4 | \n", "120 | \n", "229 | \n", "0 | \n", "2 | \n", "129 | \n", "1 | \n", "2.6 | \n", "2 | \n", "2.0 | \n", "7.0 | \n", "1 | \n", "
3 | \n", "37 | \n", "1 | \n", "3 | \n", "130 | \n", "250 | \n", "0 | \n", "0 | \n", "187 | \n", "0 | \n", "3.5 | \n", "3 | \n", "0.0 | \n", "3.0 | \n", "0 | \n", "
4 | \n", "41 | \n", "0 | \n", "2 | \n", "130 | \n", "204 | \n", "0 | \n", "2 | \n", "172 | \n", "0 | \n", "1.4 | \n", "1 | \n", "0.0 | \n", "3.0 | \n", "0 | \n", "
Get an overview of all missing values. Only `ca` (4) and `thal` (2) contain missing values, so the affected rows can simply be dropped.
" ] }, { "cell_type": "code", "execution_count": 4, "id": "6f7e6a3a-63cb-40e2-8746-937c24b184ef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "age 0\n", "sex 0\n", "cp 0\n", "trestbps 0\n", "chol 0\n", "fbs 0\n", "restecg 0\n", "thalach 0\n", "exang 0\n", "oldpeak 0\n", "slope 0\n", "ca 4\n", "thal 2\n", "goal 0\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 5, "id": "d1639e92-d401-49fb-a1f1-67250ffa2c81", "metadata": {}, "outputs": [], "source": [ "df.dropna(inplace=True)" ] }, { "cell_type": "markdown", "id": "192da26d-0fb1-4b06-a046-a41b66576ed0", "metadata": {}, "source": [ "Split the `goal` column into two classes: 0 stays 0, and the values 1 to 4 are all mapped to 1.
" ] }, { "cell_type": "code", "execution_count": 17, "id": "24675f41-d48f-4e27-a3d8-e303556ee7d1", "metadata": {}, "outputs": [], "source": [ "df['goal'] = df['goal'].replace({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})" ] }, { "cell_type": "code", "execution_count": 23, "id": "d7bf2c46-7885-4dfe-a4e7-8b8439cf0434", "metadata": {}, "outputs": [], "source": [ "# save 'cleaned' dataset as csv file for further processing\n", "import os\n", "os.makedirs('data', exist_ok=True)\n", "df.to_csv('./data/dataset_cleaned.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "ce134731-e15e-4a2a-aa4e-a28de45fd3af", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }