Added one-hot encoding for factor columns and min-max scaling for numeric columns

master
mahehsma 2024-06-05 12:27:29 +02:00
parent 8e80949a38
commit e2b6e45cc6
1 changed files with 209 additions and 148 deletions

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"id": "initial_id",
"metadata": {
"jupyter": {
@ -11,12 +11,13 @@
},
"outputs": [],
"source": [
"import pandas as pd"
"import pandas as pd\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 14,
"id": "67503952-9074-4cdb-9d7e-d9142f7c319c",
"metadata": {},
"outputs": [
@ -42,128 +43,180 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>cp</th>\n",
" <th>trestbps</th>\n",
" <th>chol</th>\n",
" <th>fbs</th>\n",
" <th>restecg</th>\n",
" <th>thalach</th>\n",
" <th>exang</th>\n",
" <th>oldpeak</th>\n",
" <th>slope</th>\n",
" <th>ca</th>\n",
" <th>thal</th>\n",
" <th>goal</th>\n",
" <th>sex_0</th>\n",
" <th>sex_1</th>\n",
" <th>cp_1</th>\n",
" <th>cp_2</th>\n",
" <th>cp_3</th>\n",
" <th>...</th>\n",
" <th>slope_1</th>\n",
" <th>slope_2</th>\n",
" <th>slope_3</th>\n",
" <th>thal_3.0</th>\n",
" <th>thal_6.0</th>\n",
" <th>thal_7.0</th>\n",
" <th>ca_0.0</th>\n",
" <th>ca_1.0</th>\n",
" <th>ca_2.0</th>\n",
" <th>ca_3.0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>63</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>145</td>\n",
" <td>233</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>150</td>\n",
" <td>0</td>\n",
" <td>2.3</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>0</td>\n",
" <td>0.708333</td>\n",
" <td>0.481132</td>\n",
" <td>0.244292</td>\n",
" <td>0.603053</td>\n",
" <td>0.370968</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>67</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>160</td>\n",
" <td>286</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>108</td>\n",
" <td>1</td>\n",
" <td>1.5</td>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>2</td>\n",
" <td>0.791667</td>\n",
" <td>0.622642</td>\n",
" <td>0.365297</td>\n",
" <td>0.282443</td>\n",
" <td>0.241935</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>67</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>120</td>\n",
" <td>229</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>129</td>\n",
" <td>1</td>\n",
" <td>2.6</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>7.0</td>\n",
" <td>1</td>\n",
" <td>0.791667</td>\n",
" <td>0.245283</td>\n",
" <td>0.235160</td>\n",
" <td>0.442748</td>\n",
" <td>0.419355</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>130</td>\n",
" <td>250</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>187</td>\n",
" <td>0</td>\n",
" <td>3.5</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0.166667</td>\n",
" <td>0.339623</td>\n",
" <td>0.283105</td>\n",
" <td>0.885496</td>\n",
" <td>0.564516</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>41</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>130</td>\n",
" <td>204</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>172</td>\n",
" <td>0</td>\n",
" <td>1.4</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0.250000</td>\n",
" <td>0.339623</td>\n",
" <td>0.178082</td>\n",
" <td>0.770992</td>\n",
" <td>0.225806</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n",
"0 63 1 1 145 233 1 2 150 0 2.3 3 \n",
"1 67 1 4 160 286 0 2 108 1 1.5 2 \n",
"2 67 1 4 120 229 0 2 129 1 2.6 2 \n",
"3 37 1 3 130 250 0 0 187 0 3.5 3 \n",
"4 41 0 2 130 204 0 2 172 0 1.4 1 \n",
" age trestbps chol thalach oldpeak sex_0 sex_1 cp_1 \\\n",
"0 0.708333 0.481132 0.244292 0.603053 0.370968 False True True \n",
"1 0.791667 0.622642 0.365297 0.282443 0.241935 False True False \n",
"2 0.791667 0.245283 0.235160 0.442748 0.419355 False True False \n",
"3 0.166667 0.339623 0.283105 0.885496 0.564516 False True False \n",
"4 0.250000 0.339623 0.178082 0.770992 0.225806 True False False \n",
"\n",
" ca thal goal \n",
"0 0.0 6.0 0 \n",
"1 3.0 3.0 2 \n",
"2 2.0 7.0 1 \n",
"3 0.0 3.0 0 \n",
"4 0.0 3.0 0 "
" cp_2 cp_3 ... slope_1 slope_2 slope_3 thal_3.0 thal_6.0 thal_7.0 \\\n",
"0 False False ... False False True False True False \n",
"1 False False ... False True False True False False \n",
"2 False False ... False True False False False True \n",
"3 False True ... False False True True False False \n",
"4 True False ... True False False True False False \n",
"\n",
" ca_0.0 ca_1.0 ca_2.0 ca_3.0 \n",
"0 True False False False \n",
"1 False False False True \n",
"2 False False True False \n",
"3 True False False False \n",
"4 True False False False \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 25,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -171,31 +224,7 @@
"source": [
"df = pd.read_csv('./data/dataset_cleaned.csv')\n",
"df.dropna(inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "8fa945ef-34d4-4e4c-a1cd-f1e1e6da79e7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0],\n",
" [1],\n",
" [1],\n",
" [0],\n",
" [0]], dtype=int64)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# extract all columns except 'goal' --> X\n",
"X = df.loc[:, df.columns != 'goal']\n",
"# extract only the column 'goal' --> y\n",
@ -206,12 +235,24 @@
"y = y.reshape((len(y),1))\n",
"\n",
"# binarize y\n",
"y[y>0] = 1"
"y[y>0] = 1\n",
"\n",
"factor_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']\n",
"numeric_columns = [column for column in X.columns if column not in factor_columns]\n",
"\n",
"# transform factors into onehot vectors\n",
"X = pd.get_dummies(X, columns=factor_columns)\n",
"\n",
"# min max scaling of numeric columns\n",
"scaler = MinMaxScaler()\n",
"X[numeric_columns] = scaler.fit_transform(X[numeric_columns])\n",
"\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 18,
"id": "2bbee865-c000-43da-84d9-ce7e04874110",
"metadata": {},
"outputs": [],
@ -221,7 +262,6 @@
" tf.keras.layers.InputLayer(shape=(n_features,)),\n",
" tf.keras.layers.Dense(30, activation='relu'),\n",
" tf.keras.layers.Dense(30, activation='relu'),\n",
" tf.keras.layers.Dense(30, activation='relu'),\n",
" tf.keras.layers.Dense(1, activation='sigmoid')\n",
" ], name='test')\n",
" model.compile(optimizer=tf.keras.optimizers.Adam(), \n",
@ -231,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 20,
"id": "38eb4f87-ca3c-4ecf-a8ca-29422822d933",
"metadata": {},
"outputs": [
@ -239,27 +279,47 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Training fold 0 for 5 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n",
"Accuracy of fold 0: 0.6166666666666667\n",
"Training fold 1 for 5 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n",
"Accuracy of fold 1: 0.75\n",
"Training fold 2 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 2: 0.6949152542372882\n",
"Training fold 3 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 3: 0.7457627118644068\n",
"Training fold 4 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 4: 0.6610169491525424\n",
"Avg accuracy 0.6936723163841808\n"
"Training fold 0 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 0: 0.8666666666666667\n",
"Training fold 1 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 1: 0.8666666666666667\n",
"Training fold 2 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 2: 0.8666666666666667\n",
"Training fold 3 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 3: 0.9333333333333333\n",
"Training fold 4 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 4: 0.8666666666666667\n",
"Training fold 5 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 5: 0.8333333333333334\n",
"Training fold 6 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 6: 0.8666666666666667\n",
"Training fold 7 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 7: 0.896551724137931\n",
"Training fold 8 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 8: 0.7931034482758621\n",
"Training fold 9 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 9: 0.7931034482758621\n",
"Avg accuracy 0.8582758620689654\n"
]
}
],
@ -269,12 +329,13 @@
"import tensorflow as tf\n",
"\n",
"# number of components extracted from the pca\n",
"n_features = 5 \n",
"n_features = 8\n",
"\n",
"epochs = 5\n",
"epochs = 20\n",
"k_folds = 10\n",
"\n",
"# used to split the dataset into k folds\n",
"kf = KFold(n_splits=5)\n",
"kf = KFold(n_splits=k_folds)\n",
"\n",
"accuracies = []\n",
"for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",