diff --git a/Experiments.ipynb b/Experiments.ipynb
new file mode 100644
index 0000000..e81ee88
--- /dev/null
+++ b/Experiments.ipynb
@@ -0,0 +1,336 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "initial_id",
+ "metadata": {
+ "jupyter": {
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "67503952-9074-4cdb-9d7e-d9142f7c319c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " sex | \n",
+ " cp | \n",
+ " trestbps | \n",
+ " chol | \n",
+ " fbs | \n",
+ " restecg | \n",
+ " thalach | \n",
+ " exang | \n",
+ " oldpeak | \n",
+ " slope | \n",
+ " ca | \n",
+ " thal | \n",
+ " goal | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 63 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 145 | \n",
+ " 233 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 150 | \n",
+ " 0 | \n",
+ " 2.3 | \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 67 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 160 | \n",
+ " 286 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 108 | \n",
+ " 1 | \n",
+ " 1.5 | \n",
+ " 2 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 67 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 120 | \n",
+ " 229 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 129 | \n",
+ " 1 | \n",
+ " 2.6 | \n",
+ " 2 | \n",
+ " 2.0 | \n",
+ " 7.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 37 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 130 | \n",
+ " 250 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 187 | \n",
+ " 0 | \n",
+ " 3.5 | \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 41 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 130 | \n",
+ " 204 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 172 | \n",
+ " 0 | \n",
+ " 1.4 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n",
+ "0 63 1 1 145 233 1 2 150 0 2.3 3 \n",
+ "1 67 1 4 160 286 0 2 108 1 1.5 2 \n",
+ "2 67 1 4 120 229 0 2 129 1 2.6 2 \n",
+ "3 37 1 3 130 250 0 0 187 0 3.5 3 \n",
+ "4 41 0 2 130 204 0 2 172 0 1.4 1 \n",
+ "\n",
+ " ca thal goal \n",
+ "0 0.0 6.0 0 \n",
+ "1 3.0 3.0 2 \n",
+ "2 2.0 7.0 1 \n",
+ "3 0.0 3.0 0 \n",
+ "4 0.0 3.0 0 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# load the cleaned dataset, discarding every row that contains a missing value\n",
+ "df = pd.read_csv('./data/dataset_cleaned.csv').dropna()\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "8fa945ef-34d4-4e4c-a1cd-f1e1e6da79e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0],\n",
+ " [1],\n",
+ " [1],\n",
+ " [0],\n",
+ " [0]], dtype=int64)"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# features: every column except the label column 'goal' --> X\n",
+ "X = df.drop(columns='goal')\n",
+ "\n",
+ "# binary target --> y: 1 where 'goal' > 0, otherwise 0, shape (n, 1), dtype int64.\n",
+ "# The comparison creates a fresh array. The previous in-place `y[y>0] = 1`\n",
+ "# wrote through the array returned by to_numpy(), which can alias df's own\n",
+ "# data (silently binarizing df['goal']) or be read-only under pandas\n",
+ "# Copy-on-Write.\n",
+ "y = (df['goal'] > 0).astype('int64').to_numpy().reshape(-1, 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "id": "2bbee865-c000-43da-84d9-ce7e04874110",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imported in this cell so that it does not depend on a later cell having run\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "\n",
+ "def get_model(n_features):\n",
+ "    \"\"\"Build and compile a small fully connected binary classifier.\n",
+ "\n",
+ "    Args:\n",
+ "        n_features: number of input features per sample.\n",
+ "\n",
+ "    Returns:\n",
+ "        A compiled Sequential model with three Dense(30, relu) hidden layers\n",
+ "        and one sigmoid output unit, using Adam and binary cross-entropy.\n",
+ "    \"\"\"\n",
+ "    model = tf.keras.models.Sequential([\n",
+ "        tf.keras.layers.InputLayer(shape=(n_features,)),\n",
+ "        tf.keras.layers.Dense(30, activation='relu'),\n",
+ "        tf.keras.layers.Dense(30, activation='relu'),\n",
+ "        tf.keras.layers.Dense(30, activation='relu'),\n",
+ "        tf.keras.layers.Dense(1, activation='sigmoid')\n",
+ "    ], name='test')\n",
+ "    model.compile(optimizer=tf.keras.optimizers.Adam(),\n",
+ "                  loss=tf.keras.losses.BinaryCrossentropy())\n",
+ "    return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "id": "38eb4f87-ca3c-4ecf-a8ca-29422822d933",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training fold 0 for 5 epochs\n",
+ "Train samples:\t237\n",
+ "Test samples:\t60\n",
+ "Accuracy of fold 0: 0.6166666666666667\n",
+ "Training fold 1 for 5 epochs\n",
+ "Train samples:\t237\n",
+ "Test samples:\t60\n",
+ "Accuracy of fold 1: 0.75\n",
+ "Training fold 2 for 5 epochs\n",
+ "Train samples:\t238\n",
+ "Test samples:\t59\n",
+ "Accuracy of fold 2: 0.6949152542372882\n",
+ "Training fold 3 for 5 epochs\n",
+ "Train samples:\t238\n",
+ "Test samples:\t59\n",
+ "Accuracy of fold 3: 0.7457627118644068\n",
+ "Training fold 4 for 5 epochs\n",
+ "Train samples:\t238\n",
+ "Test samples:\t59\n",
+ "Accuracy of fold 4: 0.6610169491525424\n",
+ "Avg accuracy 0.6936723163841808\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import KFold\n",
+ "from sklearn import decomposition\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "# number of components extracted from the pca\n",
+ "n_features = 5\n",
+ "\n",
+ "epochs = 5\n",
+ "\n",
+ "# seed the Python, NumPy and TensorFlow random generators so that repeated\n",
+ "# runs of this cell start from the same random state\n",
+ "seed = 42\n",
+ "tf.keras.utils.set_random_seed(seed)\n",
+ "\n",
+ "# used to split the dataset into k folds\n",
+ "# NOTE(review): without shuffle the folds are contiguous slices of X -\n",
+ "# confirm that the rows of the csv are not ordered by label\n",
+ "kf = KFold(n_splits=5)\n",
+ "\n",
+ "accuracies = []\n",
+ "for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
+ "    print(f'Training fold {i} for {epochs} epochs')\n",
+ "\n",
+ "    # extract train and test data from the cleaned dataset\n",
+ "    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
+ "    y_train, y_test = y[train_idx], y[test_idx]\n",
+ "\n",
+ "    print(f'Train samples:\\t{len(X_train)}')\n",
+ "    print(f'Test samples:\\t{len(X_test)}')\n",
+ "\n",
+ "    # fit the pca on the train data of this fold only, so nothing from the\n",
+ "    # test fold influences the extracted components\n",
+ "    pca = decomposition.PCA(n_components=n_features)\n",
+ "    pca.fit(X_train)\n",
+ "    X_train = pca.transform(X_train)\n",
+ "\n",
+ "    # train the model using the components extracted from pca\n",
+ "    model = get_model(n_features)\n",
+ "    model.fit(X_train, y_train, epochs=epochs, verbose=0)\n",
+ "\n",
+ "    # project the test data with the pca fitted on the train data, then\n",
+ "    # threshold the sigmoid output at 0.5 to get class predictions\n",
+ "    X_test = pca.transform(X_test)\n",
+ "    y_pred = model.predict(X_test, verbose=0) > 0.5\n",
+ "\n",
+ "    # calculate the accuracy on the test data of the current fold\n",
+ "    # (y_pred and y_test are both (n, 1), so the comparison is element-wise)\n",
+ "    accuracy = (y_pred == y_test).mean()\n",
+ "    accuracies.append(accuracy)\n",
+ "    print(f'Accuracy of fold {i}: {accuracy}')\n",
+ "\n",
+ "# calculate the average accuracy over all folds\n",
+ "avg_accuracy = sum(accuracies) / len(accuracies)\n",
+ "print(f'Avg accuracy {avg_accuracy}')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Exploration.ipynb b/Exploration.ipynb
index 77413ab..be2a4a6 100644
--- a/Exploration.ipynb
+++ b/Exploration.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 3,
"id": "37d611da-6f56-46d8-905a-62026750150c",
"metadata": {
"tags": []
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 127,
+ "execution_count": 6,
"id": "ae26378f-c104-4664-a313-ed8d9edfed42",
"metadata": {
"tags": []
@@ -174,7 +174,7 @@
"4 0.0 3.0 0 "
]
},
- "execution_count": 127,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -186,6 +186,16 @@
"df.head()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "feef6121-af85-4bd5-a04f-f2ff38b3c556",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df.to_csv('./data/dataset_cleaned.csv', index=False)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 128,
@@ -496,7 +506,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.11.7"
}
},
"nbformat": 4,