In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold
from sklearn import decomposition

In [2]:
df = pd.read_csv('./data/dataset_cleaned.csv')
df.dropna(inplace=True)

# extract all columns except 'goal' --> X
X = df.loc[:, df.columns != 'goal']
# extract only the column 'goal' --> y
y = df.loc[:, 'goal']

# add new axis to y, new shape: (n, 1)
y = y.to_numpy()
y = y.reshape((len(y),1))

# binarize y
y[y>0] = 1

factor_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
numeric_columns = [column for column in X.columns if column not in factor_columns]

# transform factors into onehot vectors
X = pd.get_dummies(X, columns=factor_columns)

# min max scaling of numeric columns
scaler = MinMaxScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

X.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_1,cp_2,cp_3,...,slope_1,slope_2,slope_3,thal_3.0,thal_6.0,thal_7.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0
0,0.708333,0.481132,0.244292,0.603053,0.370968,False,True,True,False,False,...,False,False,True,False,True,False,True,False,False,False
1,0.791667,0.622642,0.365297,0.282443,0.241935,False,True,False,False,False,...,False,True,False,True,False,False,False,False,False,True
2,0.791667,0.245283,0.23516,0.442748,0.419355,False,True,False,False,False,...,False,True,False,False,False,True,False,False,True,False
3,0.166667,0.339623,0.283105,0.885496,0.564516,False,True,False,False,True,...,False,False,True,True,False,False,True,False,False,False
4,0.25,0.339623,0.178082,0.770992,0.225806,True,False,False,True,False,...,True,False,False,True,False,False,True,False,False,False


In [6]:
def get_model(n_features):
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(shape=(n_features,)),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name='test')
    model.compile(optimizer=tf.keras.optimizers.Adam(), 
                  loss=tf.keras.losses.BinaryCrossentropy())
    return model

In [20]:
import tensorflow as tf

use_pca = True
# number of components extracted from the pca
n_features = 8
n_features = n_features if use_pca else len(X.columns)

epochs = 20
k_folds = 10

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
for i, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f'Training fold {i} for {epochs} epochs')

    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f'\tTrain samples:\t{len(X_train)}')
    print(f'\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    # train the model using the components extracted from pca
    model = get_model(n_features)
    model.fit(X_train, y_train, epochs=epochs, verbose=0)

    if use_pca:
        # transform test data using on the pca model trained on the train data
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test, verbose=0)
    y_pred = y_pred > 0.5 # threshold to binarize

    # calculate the accuracy of the train data for the current fold
    accuracy = sum(y_pred == y_test)[0] / len(y_pred)
    accuracies.append(accuracy)
    print(f'\tAccuracy of fold {i}: {accuracy}')

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy}')

Training fold 0 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 0: 0.8666666666666667
Training fold 1 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 1: 0.8
Training fold 2 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 2: 0.9
Training fold 3 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 3: 0.9
Training fold 4 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 4: 0.8666666666666667
Training fold 5 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 5: 0.8
Training fold 6 for 20 epochs
	Train samples:	267
	Test samples:	30
	Accuracy of fold 6: 0.8333333333333334
Training fold 7 for 20 epochs
	Train samples:	268
	Test samples:	29
	Accuracy of fold 7: 0.8620689655172413
Training fold 8 for 20 epochs
	Train samples:	268
	Test samples:	29
	Accuracy of fold 8: 0.7241379310344828
Training fold 9 for 20 epochs
	Train samples:	268
	Test samples:	29
	Accuracy of fold 9:

In [22]:
from sklearn.cluster import KMeans

use_pca = True
# number of components extracted from the pca
n_features = 10

k_folds = 5

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
for i, (train_idx, test_idx) in enumerate(kf.split(X[numeric_columns])):
    print(f'Training fold {i} for {epochs} epochs')

    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f'\tTrain samples:\t{len(X_train)}')
    print(f'\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    model = KMeans(n_clusters=2, n_init=10)
    model.fit(X_train)

    if use_pca:
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test)

    # calculate the accuracy of the train data for the current fold
    accuracy1 = sum(y_pred == y_test)[0] / len(y_pred)
    accuracy2 = sum(y_pred != y_test)[0] / len(y_pred)
    accuracy = max(accuracy1, accuracy2)
    accuracies.append(accuracy)
    print(f'\tAccuracy of fold {i}: {accuracy}')

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy}')

Training fold 0 for 20 epochs
	Train samples:	237
	Test samples:	60




	Accuracy of fold 0: 0.5833333333333334
Training fold 1 for 20 epochs
	Train samples:	237
	Test samples:	60




	Accuracy of fold 1: 0.5
Training fold 2 for 20 epochs
	Train samples:	238
	Test samples:	59




	Accuracy of fold 2: 0.559322033898305
Training fold 3 for 20 epochs
	Train samples:	238
	Test samples:	59




	Accuracy of fold 3: 0.576271186440678
Training fold 4 for 20 epochs
	Train samples:	238
	Test samples:	59




	Accuracy of fold 4: 0.5254237288135594
Avg accuracy 0.5488700564971751


In [23]:
from sklearn.ensemble import RandomForestClassifier

use_pca = True
# number of components extracted from the pca
n_features = 10

k_folds = 5

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
for i, (train_idx, test_idx) in enumerate(kf.split(X[numeric_columns])):
    print(f'Training fold {i} for {epochs} epochs')

    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    y_train, y_test = y_train[:, 0], y_test[:, 0]

    print(f'\tTrain samples:\t{len(X_train)}')
    print(f'\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(X_train, y_train)

    if use_pca:
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test)

    # calculate the accuracy of the train data for the current fold
    accuracy = sum(y_pred == y_test) / len(y_pred)
    accuracies.append(accuracy)
    print(f'\tAccuracy of fold {i}: {accuracy}')

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy}')

Training fold 0 for 20 epochs
	Train samples:	237
	Test samples:	60
	Accuracy of fold 0: 0.85
Training fold 1 for 20 epochs
	Train samples:	237
	Test samples:	60
	Accuracy of fold 1: 0.9
Training fold 2 for 20 epochs
	Train samples:	238
	Test samples:	59
	Accuracy of fold 2: 0.847457627118644
Training fold 3 for 20 epochs
	Train samples:	238
	Test samples:	59
	Accuracy of fold 3: 0.7627118644067796
Training fold 4 for 20 epochs
	Train samples:	238
	Test samples:	59
	Accuracy of fold 4: 0.7796610169491526
Avg accuracy 0.8279661016949152
