In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold
from sklearn import decomposition

In [2]:
df = pd.read_csv('./data/dataset_cleaned.csv')
df.dropna(inplace=True)

# extract all columns except 'goal' --> X
X = df.loc[:, df.columns != 'goal']
# extract only the column 'goal' --> y
y = df.loc[:, 'goal']

# add new axis to y, new shape: (n, 1)
y = y.to_numpy()
y = y.reshape((len(y),1))

# binarize y
y[y>0] = 1

factor_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
numeric_columns = [column for column in X.columns if column not in factor_columns]

# transform factors into onehot vectors
X = pd.get_dummies(X, columns=factor_columns)

# min max scaling of numeric columns
scaler = MinMaxScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

X.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_1,cp_2,cp_3,...,slope_1,slope_2,slope_3,thal_3.0,thal_6.0,thal_7.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0
0,0.708333,0.481132,0.244292,0.603053,0.370968,False,True,True,False,False,...,False,False,True,False,True,False,True,False,False,False
1,0.791667,0.622642,0.365297,0.282443,0.241935,False,True,False,False,False,...,False,True,False,True,False,False,False,False,False,True
2,0.791667,0.245283,0.23516,0.442748,0.419355,False,True,False,False,False,...,False,True,False,False,False,True,False,False,True,False
3,0.166667,0.339623,0.283105,0.885496,0.564516,False,True,False,False,True,...,False,False,True,True,False,False,True,False,False,False
4,0.25,0.339623,0.178082,0.770992,0.225806,True,False,False,True,False,...,True,False,False,True,False,False,True,False,False,False


In [3]:
def get_model(n_features):
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(shape=(n_features,)),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name='test')
    model.compile(optimizer=tf.keras.optimizers.Adam(), 
                  loss=tf.keras.losses.BinaryCrossentropy())
    return model

In [4]:
import tensorflow as tf

use_pca = True
# number of components extracted from the pca
n_features = 8
n_features = n_features if use_pca else len(X.columns)

epochs = 20
k_folds = 10

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
print(f'Training {k_folds} folds for {epochs} epochs')
for i, (train_idx, test_idx) in enumerate(kf.split(X)):

    print(f'Fold {i}')
    
    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f'\tTrain samples:\t{len(X_train)}\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    # train the model using the components extracted from pca
    model = get_model(n_features)
    model.fit(X_train, y_train, epochs=epochs, verbose=0)

    if use_pca:
        # transform test data using on the pca model trained on the train data
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test, verbose=0)
    y_pred = y_pred > 0.5 # threshold to binarize

    # calculate the accuracy of the train data for the current fold
    accuracy = sum(y_pred == y_test)[0] / len(y_pred)
    accuracies.append(accuracy)
    print(f'\tAccuracy: {accuracy:.3%}')

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy:.3%}')

Training 10 folds for 20 epochs
Fold 0
	Train samples:	267	Test samples:	30
	Accuracy: 90.000%
Fold 1
	Train samples:	267	Test samples:	30
	Accuracy: 80.000%
Fold 2
	Train samples:	267	Test samples:	30
	Accuracy: 90.000%
Fold 3
	Train samples:	267	Test samples:	30
	Accuracy: 90.000%
Fold 4
	Train samples:	267	Test samples:	30
	Accuracy: 90.000%
Fold 5
	Train samples:	267	Test samples:	30
	Accuracy: 86.667%
Fold 6
	Train samples:	267	Test samples:	30
	Accuracy: 80.000%
Fold 7
	Train samples:	268	Test samples:	29
	Accuracy: 86.207%
Fold 8
	Train samples:	268	Test samples:	29
	Accuracy: 79.310%
Fold 9
	Train samples:	268	Test samples:	29
	Accuracy: 82.759%
Avg accuracy 85.494%


In [5]:
from sklearn.cluster import KMeans

use_pca = True
# number of components extracted from the pca
n_features = 10

k_folds = 5

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
print(f'Training {k_folds} folds')
for i, (train_idx, test_idx) in enumerate(kf.split(X[numeric_columns])):

    print(f'Fold {i}')
    
    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f'\tTrain samples:\t{len(X_train)}\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    model = KMeans(n_clusters=2, n_init=10)
    model.fit(X_train)

    if use_pca:
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test)

    # calculate the accuracy of the train data for the current fold
    accuracy1 = sum(y_pred == y_test)[0] / len(y_pred)
    accuracy2 = sum(y_pred != y_test)[0] / len(y_pred)
    accuracy = max(accuracy1, accuracy2)
    accuracies.append(accuracy)
    print(f'\tAccuracy {accuracy:.3%}')
    print()

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy:.3%}')

Training 5 folds
Fold 0
	Train samples:	237	Test samples:	60




	Accuracy 58.333%

Fold 1
	Train samples:	237	Test samples:	60




	Accuracy 50.000%

Fold 2
	Train samples:	238	Test samples:	59




	Accuracy 55.932%

Fold 3
	Train samples:	238	Test samples:	59




	Accuracy 57.627%

Fold 4
	Train samples:	238	Test samples:	59




	Accuracy 52.542%

Avg accuracy 54.887%


In [6]:
from sklearn.ensemble import RandomForestClassifier

use_pca = True
# number of components extracted from the pca
n_features = 10

k_folds = 5

# used to split the dataset into k folds
kf = KFold(n_splits=k_folds)

accuracies = []
print(f'Training {k_folds} folds')
for i, (train_idx, test_idx) in enumerate(kf.split(X[numeric_columns])):
    print(f'Fold {i}')

    # extract train and test data from the cleaned dataset
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    y_train, y_test = y_train[:, 0], y_test[:, 0]

    print(f'\tTrain samples:\t{len(X_train)}\tTest samples:\t{len(X_test)}')

    if use_pca:
        # do pca based on the train data of the given fold to extract 'n_features'
        pca = decomposition.PCA(n_components=n_features)
        pca.fit(X_train)
        X_train = pca.transform(X_train)

    model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(X_train, y_train)

    if use_pca:
        X_test = pca.transform(X_test)
        
    y_pred = model.predict(X_test)

    # calculate the accuracy of the train data for the current fold
    accuracy = sum(y_pred == y_test) / len(y_pred)
    accuracies.append(accuracy)
    print(f'\tAccuracy {accuracy:.3%}')
    print()

# calculate the average accuracy over all folds
avg_accuracy = sum(accuracies) / len(accuracies)
print(f'Avg accuracy {avg_accuracy:.3%}')

Training 5 folds
Fold 0
	Train samples:	237	Test samples:	60
	Accuracy 85.000%

Fold 1
	Train samples:	237	Test samples:	60
	Accuracy 90.000%

Fold 2
	Train samples:	238	Test samples:	59
	Accuracy 84.746%

Fold 3
	Train samples:	238	Test samples:	59
	Accuracy 76.271%

Fold 4
	Train samples:	238	Test samples:	59
	Accuracy 77.966%

Avg accuracy 82.797%
