added first experiments (small neural network using pca output)

master
mahehsma 2024-06-05 11:20:55 +02:00
parent cc2ec24052
commit 8e80949a38
2 changed files with 350 additions and 4 deletions

336
Experiments.ipynb 100644
View File

@ -0,0 +1,336 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"id": "initial_id",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "67503952-9074-4cdb-9d7e-d9142f7c319c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>cp</th>\n",
" <th>trestbps</th>\n",
" <th>chol</th>\n",
" <th>fbs</th>\n",
" <th>restecg</th>\n",
" <th>thalach</th>\n",
" <th>exang</th>\n",
" <th>oldpeak</th>\n",
" <th>slope</th>\n",
" <th>ca</th>\n",
" <th>thal</th>\n",
" <th>goal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>63</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>145</td>\n",
" <td>233</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>150</td>\n",
" <td>0</td>\n",
" <td>2.3</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>67</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>160</td>\n",
" <td>286</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>108</td>\n",
" <td>1</td>\n",
" <td>1.5</td>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>67</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>120</td>\n",
" <td>229</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>129</td>\n",
" <td>1</td>\n",
" <td>2.6</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>7.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>130</td>\n",
" <td>250</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>187</td>\n",
" <td>0</td>\n",
" <td>3.5</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>41</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>130</td>\n",
" <td>204</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>172</td>\n",
" <td>0</td>\n",
" <td>1.4</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n",
"0 63 1 1 145 233 1 2 150 0 2.3 3 \n",
"1 67 1 4 160 286 0 2 108 1 1.5 2 \n",
"2 67 1 4 120 229 0 2 129 1 2.6 2 \n",
"3 37 1 3 130 250 0 0 187 0 3.5 3 \n",
"4 41 0 2 130 204 0 2 172 0 1.4 1 \n",
"\n",
" ca thal goal \n",
"0 0.0 6.0 0 \n",
"1 3.0 3.0 2 \n",
"2 2.0 7.0 1 \n",
"3 0.0 3.0 0 \n",
"4 0.0 3.0 0 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# read the cleaned dataset and discard every row that still has a missing value\n",
"df = pd.read_csv('./data/dataset_cleaned.csv').dropna()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "8fa945ef-34d4-4e4c-a1cd-f1e1e6da79e7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0],\n",
" [1],\n",
" [1],\n",
" [0],\n",
" [0]], dtype=int64)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# feature matrix: every column except the label column 'goal'\n",
"X = df.drop(columns='goal')\n",
"\n",
"# binary label column vector of shape (n, 1): 1 where 'goal' > 0, otherwise 0.\n",
"# The comparison allocates a new array, so the 'goal' column of df keeps its\n",
"# original values and this cell can be re-run safely (an in-place assignment on\n",
"# the array returned by to_numpy() may write through to df).\n",
"y = (df['goal'].to_numpy() > 0).astype('int64').reshape(-1, 1)\n",
"\n",
"# preview the first labels\n",
"y[:5]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "2bbee865-c000-43da-84d9-ce7e04874110",
"metadata": {},
"outputs": [],
"source": [
"def get_model(n_features):\n",
"    \"\"\"Create and compile the feed-forward binary classifier.\n",
"\n",
"    n_features: number of input values per sample (width of the input layer).\n",
"\n",
"    Returns a compiled Sequential model named 'test': three relu layers with\n",
"    30 units each followed by one sigmoid output unit, trained with Adam on\n",
"    binary cross-entropy.\n",
"    \"\"\"\n",
"    input_layer = tf.keras.layers.InputLayer(shape=(n_features,))\n",
"    hidden_layers = [tf.keras.layers.Dense(30, activation='relu') for _ in range(3)]\n",
"    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')\n",
"\n",
"    net = tf.keras.models.Sequential([input_layer, *hidden_layers, output_layer],\n",
"                                     name='test')\n",
"    net.compile(loss=tf.keras.losses.BinaryCrossentropy(),\n",
"                optimizer=tf.keras.optimizers.Adam())\n",
"    return net"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "38eb4f87-ca3c-4ecf-a8ca-29422822d933",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training fold 0 for 5 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n",
"Accuracy of fold 0: 0.6166666666666667\n",
"Training fold 1 for 5 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n",
"Accuracy of fold 1: 0.75\n",
"Training fold 2 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 2: 0.6949152542372882\n",
"Training fold 3 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 3: 0.7457627118644068\n",
"Training fold 4 for 5 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n",
"Accuracy of fold 4: 0.6610169491525424\n",
"Avg accuracy 0.6936723163841808\n"
]
}
],
"source": [
"from sklearn.model_selection import KFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn import decomposition\n",
"import tensorflow as tf\n",
"\n",
"# fixed seed so that the fold assignment and the weight initialisation are repeatable\n",
"SEED = 42\n",
"tf.keras.utils.set_random_seed(SEED)\n",
"\n",
"# number of components extracted from the pca\n",
"n_features = 5\n",
"\n",
"# number of training epochs per fold\n",
"epochs = 5\n",
"\n",
"# used to split the dataset into k folds; shuffled so that a fold does not\n",
"# depend on the row order of the csv file\n",
"kf = KFold(n_splits=5, shuffle=True, random_state=SEED)\n",
"\n",
"accuracies = []\n",
"for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
"    print(f'Training fold {i} for {epochs} epochs')\n",
"\n",
"    # extract train and test data from the cleaned dataset\n",
"    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
"    y_train, y_test = y[train_idx], y[test_idx]\n",
"\n",
"    print(f'Train samples:\\t{len(X_train)}')\n",
"    print(f'Test samples:\\t{len(X_test)}')\n",
"\n",
"    # standardise the features with the statistics of the train fold only;\n",
"    # pca is variance based, so without this step the large-valued columns\n",
"    # (e.g. chol, trestbps) dominate the extracted components\n",
"    scaler = StandardScaler()\n",
"    X_train = scaler.fit_transform(X_train)\n",
"\n",
"    # do pca based on the train data of the given fold to extract 'n_features'\n",
"    pca = decomposition.PCA(n_components=n_features)\n",
"    X_train = pca.fit_transform(X_train)\n",
"\n",
"    # train the model using the components extracted from pca\n",
"    model = get_model(n_features)\n",
"    model.fit(X_train, y_train, epochs=epochs, verbose=0)\n",
"\n",
"    # transform the test data with the scaler and the pca fitted on the train data\n",
"    X_test = pca.transform(scaler.transform(X_test))\n",
"    y_pred = model.predict(X_test, verbose=0) > 0.5\n",
"\n",
"    # calculate the accuracy on the test data of the current fold\n",
"    accuracy = float((y_pred == y_test).mean())\n",
"    accuracies.append(accuracy)\n",
"    print(f'Accuracy of fold {i}: {accuracy}')\n",
"\n",
"# calculate the average accuracy over all folds\n",
"avg_accuracy = sum(accuracies) / len(accuracies)\n",
"print(f'Avg accuracy {avg_accuracy}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 120, "execution_count": 3,
"id": "37d611da-6f56-46d8-905a-62026750150c", "id": "37d611da-6f56-46d8-905a-62026750150c",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 127, "execution_count": 6,
"id": "ae26378f-c104-4664-a313-ed8d9edfed42", "id": "ae26378f-c104-4664-a313-ed8d9edfed42",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -174,7 +174,7 @@
"4 0.0 3.0 0 " "4 0.0 3.0 0 "
] ]
}, },
"execution_count": 127, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -186,6 +186,16 @@
"df.head()" "df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 8,
"id": "feef6121-af85-4bd5-a04f-f2ff38b3c556",
"metadata": {},
"outputs": [],
"source": [
"# df.to_csv('./data/dataset_cleaned.csv', index=False)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 128, "execution_count": 128,
@ -496,7 +506,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.5" "version": "3.11.7"
} }
}, },
"nbformat": 4, "nbformat": 4,