In [8]:
import pandas as pd

In [25]:
df = pd.read_csv('./data/dataset_cleaned.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,goal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [60]:
# extract all columns except 'goal' --> X
X = df.loc[:, df.columns != 'goal']
# extract only the column 'goal' --> y
y = df.loc[:, 'goal']

# add new axis to y, new shape: (n, 1)
y = y.to_numpy()
y = y.reshape((len(y),1))

# binarize y
y[y>0] = 1

array([[0],
       [1],
       [1],
       [0],
       [0]], dtype=int64)

In [91]:
def get_model(n_features):
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(shape=(n_features,)),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name='test')
    model.compile(optimizer=tf.keras.optimizers.Adam(), 
                  loss=tf.keras.losses.BinaryCrossentropy())
    return model

In [97]:
from sklearn.model_selection import KFold
from sklearn import decomposition
import tensorflow as tf

# number of components extracted from the pca
n_features = 5 

epochs = 5

# used to split the dataset into k folds
kf = KFold(n_splits=5)

accuracies = []
for i, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f'Training fold {i} for {epochs} epochs')

    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f'Train samples:\t{len(X_train)}')
    print(f'Test samples:\t{len(X_test)}')

    # do pca based on the train data of the given fold to extract 'n_features'
    pca = decomposition.PCA(n_components=n_features)
    pca.fit(X_train)
    X_train = pca.transform(X_train)

    # train the model using the components extracted from pca
    model = get_model(n_features)
    model.fit(X_train, y_train, epochs=epochs, verbose=0)

    # transform test data using on the pca model trained on the train data
    X_test = pca.transform(X_test)
    y_pred = model.predict(X_test, verbose=0)
    y_pred = y_pred > 0.5

    # calculate the accuracy of the train data for the current fold
    accuracy = sum(y_pred == y_test)[0] / len(y_pred)
    accuracies.append(accuracy)
    print(f'Accuracy of fold {i}: {accuracy}')

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy}')

Training fold 0 for 5 epochs
Train samples:	237
Test samples:	60
Accuracy of fold 0: 0.6166666666666667
Training fold 1 for 5 epochs
Train samples:	237
Test samples:	60
Accuracy of fold 1: 0.75
Training fold 2 for 5 epochs
Train samples:	238
Test samples:	59
Accuracy of fold 2: 0.6949152542372882
Training fold 3 for 5 epochs
Train samples:	238
Test samples:	59
Accuracy of fold 3: 0.7457627118644068
Training fold 4 for 5 epochs
Train samples:	238
Test samples:	59
Accuracy of fold 4: 0.6610169491525424
Avg accuracy 0.6936723163841808
