{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "initial_id",
   "metadata": {
    "jupyter": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn import decomposition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "67503952-9074-4cdb-9d7e-d9142f7c319c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>trestbps</th>\n",
       "      <th>chol</th>\n",
       "      <th>thalach</th>\n",
       "      <th>oldpeak</th>\n",
       "      <th>sex_0</th>\n",
       "      <th>sex_1</th>\n",
       "      <th>cp_1</th>\n",
       "      <th>cp_2</th>\n",
       "      <th>cp_3</th>\n",
       "      <th>...</th>\n",
       "      <th>slope_1</th>\n",
       "      <th>slope_2</th>\n",
       "      <th>slope_3</th>\n",
       "      <th>thal_3.0</th>\n",
       "      <th>thal_6.0</th>\n",
       "      <th>thal_7.0</th>\n",
       "      <th>ca_0.0</th>\n",
       "      <th>ca_1.0</th>\n",
       "      <th>ca_2.0</th>\n",
       "      <th>ca_3.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.708333</td>\n",
       "      <td>0.481132</td>\n",
       "      <td>0.244292</td>\n",
       "      <td>0.603053</td>\n",
       "      <td>0.370968</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.791667</td>\n",
       "      <td>0.622642</td>\n",
       "      <td>0.365297</td>\n",
       "      <td>0.282443</td>\n",
       "      <td>0.241935</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.791667</td>\n",
       "      <td>0.245283</td>\n",
       "      <td>0.235160</td>\n",
       "      <td>0.442748</td>\n",
       "      <td>0.419355</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.339623</td>\n",
       "      <td>0.283105</td>\n",
       "      <td>0.885496</td>\n",
       "      <td>0.564516</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.339623</td>\n",
       "      <td>0.178082</td>\n",
       "      <td>0.770992</td>\n",
       "      <td>0.225806</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        age  trestbps      chol   thalach   oldpeak  sex_0  sex_1   cp_1  \\\n",
       "0  0.708333  0.481132  0.244292  0.603053  0.370968  False   True   True   \n",
       "1  0.791667  0.622642  0.365297  0.282443  0.241935  False   True  False   \n",
       "2  0.791667  0.245283  0.235160  0.442748  0.419355  False   True  False   \n",
       "3  0.166667  0.339623  0.283105  0.885496  0.564516  False   True  False   \n",
       "4  0.250000  0.339623  0.178082  0.770992  0.225806   True  False  False   \n",
       "\n",
       "    cp_2   cp_3  ...  slope_1  slope_2  slope_3  thal_3.0  thal_6.0  thal_7.0  \\\n",
       "0  False  False  ...    False    False     True     False      True     False   \n",
       "1  False  False  ...    False     True    False      True     False     False   \n",
       "2  False  False  ...    False     True    False     False     False      True   \n",
       "3  False   True  ...    False    False     True      True     False     False   \n",
       "4   True  False  ...     True    False    False      True     False     False   \n",
       "\n",
       "   ca_0.0  ca_1.0  ca_2.0  ca_3.0  \n",
       "0    True   False   False   False  \n",
       "1   False   False   False    True  \n",
       "2   False   False    True   False  \n",
       "3    True   False   False   False  \n",
       "4    True   False   False   False  \n",
       "\n",
       "[5 rows x 28 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the cleaned dataset and drop every row that still contains a missing value\n",
    "df = pd.read_csv('./data/dataset_cleaned.csv').dropna()\n",
    "\n",
    "# features: every column except the target column 'goal' --> X\n",
    "X = df.drop(columns='goal')\n",
    "\n",
    "# target: only the column 'goal' as a column vector of shape (n, 1) --> y\n",
    "# copy=True hands us our own writable array: without it the array may share\n",
    "# memory with df, so the in-place binarization below could overwrite df['goal']\n",
    "# or fail on a read-only array when pandas copy-on-write is active\n",
    "y = df['goal'].to_numpy(copy=True).reshape(-1, 1)\n",
    "\n",
    "# binarize y: every value > 0 becomes 1\n",
    "y[y > 0] = 1\n",
    "\n",
    "# categorical columns are one-hot encoded, all remaining columns are treated as numeric\n",
    "factor_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']\n",
    "numeric_columns = [column for column in X.columns if column not in factor_columns]\n",
    "\n",
    "# transform factors into onehot vectors\n",
    "X = pd.get_dummies(X, columns=factor_columns)\n",
    "\n",
    "# min max scaling of numeric columns\n",
    "# NOTE(review): the scaler is fitted on all rows, i.e. before the k-fold splits\n",
    "# in the cells below, so the test folds contribute to the min/max values\n",
    "scaler = MinMaxScaler()\n",
    "X[numeric_columns] = scaler.fit_transform(X[numeric_columns])\n",
    "\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2bbee865-c000-43da-84d9-ce7e04874110",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_model(n_features):\n",
    "    \"\"\"Build and compile the feed-forward network used for binary classification.\n",
    "\n",
    "    The network consists of an input layer of width `n_features`, two hidden\n",
    "    dense layers with 30 ReLU units each and a single sigmoid output unit.\n",
    "    It is compiled with the Adam optimizer and binary cross-entropy loss.\n",
    "\n",
    "    Args:\n",
    "        n_features: number of input features per sample.\n",
    "\n",
    "    Returns:\n",
    "        The compiled `tf.keras` Sequential model (named 'test').\n",
    "    \"\"\"\n",
    "    layers = tf.keras.layers\n",
    "    network = tf.keras.models.Sequential(\n",
    "        [\n",
    "            layers.InputLayer(shape=(n_features,)),\n",
    "            layers.Dense(30, activation='relu'),\n",
    "            layers.Dense(30, activation='relu'),\n",
    "            layers.Dense(1, activation='sigmoid'),\n",
    "        ],\n",
    "        name='test',\n",
    "    )\n",
    "    network.compile(\n",
    "        optimizer=tf.keras.optimizers.Adam(),\n",
    "        loss=tf.keras.losses.BinaryCrossentropy(),\n",
    "    )\n",
    "    return network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "38eb4f87-ca3c-4ecf-a8ca-29422822d933",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training fold 0 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 0: 0.8666666666666667\n",
      "Training fold 1 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 1: 0.8\n",
      "Training fold 2 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 2: 0.9\n",
      "Training fold 3 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 3: 0.9\n",
      "Training fold 4 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 4: 0.8666666666666667\n",
      "Training fold 5 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 5: 0.8\n",
      "Training fold 6 for 20 epochs\n",
      "\tTrain samples:\t267\n",
      "\tTest samples:\t30\n",
      "\tAccuracy of fold 6: 0.8333333333333334\n",
      "Training fold 7 for 20 epochs\n",
      "\tTrain samples:\t268\n",
      "\tTest samples:\t29\n",
      "\tAccuracy of fold 7: 0.8620689655172413\n",
      "Training fold 8 for 20 epochs\n",
      "\tTrain samples:\t268\n",
      "\tTest samples:\t29\n",
      "\tAccuracy of fold 8: 0.7241379310344828\n",
      "Training fold 9 for 20 epochs\n",
      "\tTrain samples:\t268\n",
      "\tTest samples:\t29\n",
      "\tAccuracy of fold 9: 0.896551724137931\n",
      "Avg accuracy 0.8449425287356321\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "use_pca = True\n",
    "# number of pca components; without pca the network sees every column of X\n",
    "n_features = 8 if use_pca else len(X.columns)\n",
    "\n",
    "epochs = 20\n",
    "k_folds = 10\n",
    "\n",
    "# splitter that yields the (train, test) row indices of each of the k folds\n",
    "kf = KFold(n_splits=k_folds)\n",
    "\n",
    "accuracies = []\n",
    "for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
    "    print(f'Training fold {i} for {epochs} epochs')\n",
    "\n",
    "    # slice the rows of this fold out of the preprocessed dataset\n",
    "    X_train, y_train = X.iloc[train_idx], y[train_idx]\n",
    "    X_test, y_test = X.iloc[test_idx], y[test_idx]\n",
    "\n",
    "    print(f'\\tTrain samples:\\t{len(X_train)}')\n",
    "    print(f'\\tTest samples:\\t{len(X_test)}')\n",
    "\n",
    "    if use_pca:\n",
    "        # fit the pca on the train rows only, then project both splits\n",
    "        # with that same fitted projection to 'n_features' components\n",
    "        pca = decomposition.PCA(n_components=n_features).fit(X_train)\n",
    "        X_train = pca.transform(X_train)\n",
    "        X_test = pca.transform(X_test)\n",
    "\n",
    "    # a fresh network is built and trained for every fold\n",
    "    model = get_model(n_features)\n",
    "    model.fit(X_train, y_train, epochs=epochs, verbose=0)\n",
    "\n",
    "    # sigmoid outputs above 0.5 are counted as class 1\n",
    "    y_pred = model.predict(X_test, verbose=0) > 0.5\n",
    "\n",
    "    # share of correctly predicted test samples of the current fold\n",
    "    accuracy = (y_pred == y_test).sum() / len(y_pred)\n",
    "    accuracies.append(accuracy)\n",
    "    print(f'\\tAccuracy of fold {i}: {accuracy}')\n",
    "\n",
    "# mean accuracy over all folds\n",
    "avg_accuracy = sum(accuracies) / len(accuracies)\n",
    "print(f'Avg accuracy {avg_accuracy}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "95215693-47c9-4202-92f5-efbc65bc32c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training fold 0 for 20 epochs\n",
      "\tTrain samples:\t237\n",
      "\tTest samples:\t60\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAccuracy of fold 0: 0.5833333333333334\n",
      "Training fold 1 for 20 epochs\n",
      "\tTrain samples:\t237\n",
      "\tTest samples:\t60\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAccuracy of fold 1: 0.5\n",
      "Training fold 2 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAccuracy of fold 2: 0.559322033898305\n",
      "Training fold 3 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAccuracy of fold 3: 0.576271186440678\n",
      "Training fold 4 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAccuracy of fold 4: 0.5254237288135594\n",
      "Avg accuracy 0.5488700564971751\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "use_pca = True\n",
    "# number of components extracted from the pca\n",
    "n_features = 10\n",
    "\n",
    "k_folds = 5\n",
    "\n",
    "# used to split the dataset into k folds\n",
    "kf = KFold(n_splits=k_folds)\n",
    "\n",
    "accuracies = []\n",
    "for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
    "    # k-means has no epochs, so only the fold number is logged\n",
    "    print(f'Training fold {i}')\n",
    "\n",
    "    # extract train and test data from the cleaned dataset\n",
    "    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
    "    # k-means is unsupervised, so only the test labels are needed.\n",
    "    # y is stored as (n, 1); flatten it to (n,) so that the comparison with the\n",
    "    # (n,) cluster labels below is element-wise instead of broadcasting to (n, n)\n",
    "    y_test = y[test_idx].ravel()\n",
    "\n",
    "    print(f'\\tTrain samples:\\t{len(X_train)}')\n",
    "    print(f'\\tTest samples:\\t{len(X_test)}')\n",
    "\n",
    "    if use_pca:\n",
    "        # do pca based on the train data of the given fold to extract 'n_features'\n",
    "        # and project the test data with the same fitted pca\n",
    "        pca = decomposition.PCA(n_components=n_features).fit(X_train)\n",
    "        X_train = pca.transform(X_train)\n",
    "        X_test = pca.transform(X_test)\n",
    "\n",
    "    # fixed random_state so the clustering is reproducible between runs\n",
    "    model = KMeans(n_clusters=2, n_init=10, random_state=0)\n",
    "    model.fit(X_train)\n",
    "\n",
    "    y_pred = model.predict(X_test)\n",
    "\n",
    "    # calculate the accuracy on the test data of the current fold.\n",
    "    # cluster ids are arbitrary (cluster 0 may correspond to class 1), so the\n",
    "    # better of the two possible cluster-to-class assignments is reported\n",
    "    agreement = (y_pred == y_test).mean()\n",
    "    accuracy = max(agreement, 1 - agreement)\n",
    "    accuracies.append(accuracy)\n",
    "    print(f'\\tAccuracy of fold {i}: {accuracy}')\n",
    "\n",
    "# calculate the average accuracy over all folds\n",
    "avg_accuracy = sum(accuracies) / len(accuracies)\n",
    "print(f'Avg accuracy {avg_accuracy}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "880302e4-82c1-47b9-9fe3-cb3567511639",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training fold 0 for 20 epochs\n",
      "\tTrain samples:\t237\n",
      "\tTest samples:\t60\n",
      "\tAccuracy of fold 0: 0.85\n",
      "Training fold 1 for 20 epochs\n",
      "\tTrain samples:\t237\n",
      "\tTest samples:\t60\n",
      "\tAccuracy of fold 1: 0.9\n",
      "Training fold 2 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n",
      "\tAccuracy of fold 2: 0.847457627118644\n",
      "Training fold 3 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n",
      "\tAccuracy of fold 3: 0.7627118644067796\n",
      "Training fold 4 for 20 epochs\n",
      "\tTrain samples:\t238\n",
      "\tTest samples:\t59\n",
      "\tAccuracy of fold 4: 0.7796610169491526\n",
      "Avg accuracy 0.8279661016949152\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "use_pca = True\n",
    "# number of components extracted from the pca\n",
    "n_features = 10\n",
    "\n",
    "k_folds = 5\n",
    "\n",
    "# used to split the dataset into k folds\n",
    "kf = KFold(n_splits=k_folds)\n",
    "\n",
    "accuracies = []\n",
    "for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
    "    # a random forest has no epochs, so only the fold number is logged\n",
    "    print(f'Training fold {i}')\n",
    "\n",
    "    # extract train and test data from the cleaned dataset;\n",
    "    # y is stored as (n, 1), the classifier is given flat (n,) label vectors\n",
    "    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
    "    y_train, y_test = y[train_idx].ravel(), y[test_idx].ravel()\n",
    "\n",
    "    print(f'\\tTrain samples:\\t{len(X_train)}')\n",
    "    print(f'\\tTest samples:\\t{len(X_test)}')\n",
    "\n",
    "    if use_pca:\n",
    "        # do pca based on the train data of the given fold to extract 'n_features'\n",
    "        # and project the test data with the same fitted pca\n",
    "        pca = decomposition.PCA(n_components=n_features).fit(X_train)\n",
    "        X_train = pca.transform(X_train)\n",
    "        X_test = pca.transform(X_test)\n",
    "\n",
    "    model = RandomForestClassifier(max_depth=2, random_state=0)\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    y_pred = model.predict(X_test)\n",
    "\n",
    "    # calculate the accuracy on the test data of the current fold\n",
    "    accuracy = (y_pred == y_test).mean()\n",
    "    accuracies.append(accuracy)\n",
    "    print(f'\\tAccuracy of fold {i}: {accuracy}')\n",
    "\n",
    "# calculate the average accuracy over all folds\n",
    "avg_accuracy = sum(accuracies) / len(accuracies)\n",
    "print(f'Avg accuracy {avg_accuracy}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}