DSA_SoSe_24/Experiments.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "initial_id",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "67503952-9074-4cdb-9d7e-d9142f7c319c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>trestbps</th>\n",
" <th>chol</th>\n",
" <th>thalach</th>\n",
" <th>oldpeak</th>\n",
" <th>sex_0</th>\n",
" <th>sex_1</th>\n",
" <th>cp_1</th>\n",
" <th>cp_2</th>\n",
" <th>cp_3</th>\n",
" <th>...</th>\n",
" <th>slope_1</th>\n",
" <th>slope_2</th>\n",
" <th>slope_3</th>\n",
" <th>thal_3.0</th>\n",
" <th>thal_6.0</th>\n",
" <th>thal_7.0</th>\n",
" <th>ca_0.0</th>\n",
" <th>ca_1.0</th>\n",
" <th>ca_2.0</th>\n",
" <th>ca_3.0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.708333</td>\n",
" <td>0.481132</td>\n",
" <td>0.244292</td>\n",
" <td>0.603053</td>\n",
" <td>0.370968</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.791667</td>\n",
" <td>0.622642</td>\n",
" <td>0.365297</td>\n",
" <td>0.282443</td>\n",
" <td>0.241935</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.791667</td>\n",
" <td>0.245283</td>\n",
" <td>0.235160</td>\n",
" <td>0.442748</td>\n",
" <td>0.419355</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.166667</td>\n",
" <td>0.339623</td>\n",
" <td>0.283105</td>\n",
" <td>0.885496</td>\n",
" <td>0.564516</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.250000</td>\n",
" <td>0.339623</td>\n",
" <td>0.178082</td>\n",
" <td>0.770992</td>\n",
" <td>0.225806</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" age trestbps chol thalach oldpeak sex_0 sex_1 cp_1 \\\n",
"0 0.708333 0.481132 0.244292 0.603053 0.370968 False True True \n",
"1 0.791667 0.622642 0.365297 0.282443 0.241935 False True False \n",
"2 0.791667 0.245283 0.235160 0.442748 0.419355 False True False \n",
"3 0.166667 0.339623 0.283105 0.885496 0.564516 False True False \n",
"4 0.250000 0.339623 0.178082 0.770992 0.225806 True False False \n",
"\n",
" cp_2 cp_3 ... slope_1 slope_2 slope_3 thal_3.0 thal_6.0 thal_7.0 \\\n",
"0 False False ... False False True False True False \n",
"1 False False ... False True False True False False \n",
"2 False False ... False True False False False True \n",
"3 False True ... False False True True False False \n",
"4 True False ... True False False True False False \n",
"\n",
" ca_0.0 ca_1.0 ca_2.0 ca_3.0 \n",
"0 True False False False \n",
"1 False False False True \n",
"2 False False True False \n",
"3 True False False False \n",
"4 True False False False \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('./data/dataset_cleaned.csv')\n",
"df.dropna(inplace=True)\n",
"\n",
"# extract all columns except 'goal' --> X\n",
"X = df.loc[:, df.columns != 'goal']\n",
"# extract only the column 'goal' --> y\n",
"y = df.loc[:, 'goal']\n",
"\n",
"# add new axis to y, new shape: (n, 1)\n",
"y = y.to_numpy()\n",
"y = y.reshape((len(y),1))\n",
"\n",
"# binarize y\n",
"y[y>0] = 1\n",
"\n",
"factor_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']\n",
"numeric_columns = [column for column in X.columns if column not in factor_columns]\n",
"\n",
"# transform factors into onehot vectors\n",
"X = pd.get_dummies(X, columns=factor_columns)\n",
"\n",
"# min max scaling of numeric columns\n",
"scaler = MinMaxScaler()\n",
"X[numeric_columns] = scaler.fit_transform(X[numeric_columns])\n",
"\n",
"X.head()"
]
},
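{
"cell_type": "code",
"execution_count": null,
"id": "added-sanity-check",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check (added sketch, not part of the original experiments):\n",
"# confirm the shapes produced by the preprocessing above and the class balance\n",
"# of the binarized target.\n",
"print('X shape:', X.shape)\n",
"print('y shape:', y.shape)\n",
"print('positive rate:', y.mean())"
]
},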
{
"cell_type": "code",
"execution_count": 18,
"id": "2bbee865-c000-43da-84d9-ce7e04874110",
"metadata": {},
"outputs": [],
"source": [
"def get_model(n_features):\n",
" model = tf.keras.models.Sequential([\n",
" tf.keras.layers.InputLayer(shape=(n_features,)),\n",
" tf.keras.layers.Dense(30, activation='relu'),\n",
" tf.keras.layers.Dense(30, activation='relu'),\n",
" tf.keras.layers.Dense(1, activation='sigmoid')\n",
" ], name='test')\n",
" model.compile(optimizer=tf.keras.optimizers.Adam(), \n",
" loss=tf.keras.losses.BinaryCrossentropy())\n",
" return model"
]
},
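{
"cell_type": "code",
"execution_count": null,
"id": "added-model-usage-example",
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (added for illustration, assumes the preprocessing cell above\n",
"# has produced X): build the classifier for the full one-hot feature width\n",
"# and print the layer shapes.\n",
"example_model = get_model(X.shape[1])\n",
"example_model.summary()"
]
},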
{
"cell_type": "code",
"execution_count": 41,
"id": "38eb4f87-ca3c-4ecf-a8ca-29422822d933",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training fold 0 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 0: 0.9\n",
"Training fold 1 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 1: 0.8666666666666667\n",
"Training fold 2 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 2: 0.8666666666666667\n",
"Training fold 3 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 3: 0.9\n",
"Training fold 4 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 4: 0.9\n",
"Training fold 5 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 5: 0.8333333333333334\n",
"Training fold 6 for 20 epochs\n",
"Train samples:\t267\n",
"Test samples:\t30\n",
"Accuracy of fold 6: 0.7666666666666667\n",
"Training fold 7 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 7: 0.8275862068965517\n",
"Training fold 8 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 8: 0.7586206896551724\n",
"Training fold 9 for 20 epochs\n",
"Train samples:\t268\n",
"Test samples:\t29\n",
"Accuracy of fold 9: 0.7586206896551724\n",
"Avg accuracy 0.837816091954023\n"
]
}
],
"source": [
"from sklearn.model_selection import KFold\n",
"from sklearn import decomposition\n",
"import tensorflow as tf\n",
"\n",
"# number of components extracted from the pca\n",
"n_features = 8\n",
"\n",
"epochs = 20\n",
"k_folds = 10\n",
"\n",
"# used to split the dataset into k folds\n",
"kf = KFold(n_splits=k_folds)\n",
"\n",
"accuracies = []\n",
"for i, (train_idx, test_idx) in enumerate(kf.split(X)):\n",
" print(f'Training fold {i} for {epochs} epochs')\n",
"\n",
" # extract train and test data from the cleaned dataset\n",
" X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
" y_train, y_test = y[train_idx], y[test_idx]\n",
"\n",
" print(f'Train samples:\\t{len(X_train)}')\n",
" print(f'Test samples:\\t{len(X_test)}')\n",
"\n",
" # do pca based on the train data of the given fold to extract 'n_features'\n",
" pca = decomposition.PCA(n_components=n_features)\n",
" pca.fit(X_train)\n",
" X_train = pca.transform(X_train)\n",
"\n",
" # train the model using the components extracted from pca\n",
" model = get_model(n_features)\n",
" model.fit(X_train, y_train, epochs=epochs, verbose=0)\n",
"\n",
" # transform test data using on the pca model trained on the train data\n",
" X_test = pca.transform(X_test)\n",
" y_pred = model.predict(X_test, verbose=0)\n",
" y_pred = y_pred > 0.5\n",
"\n",
" # calculate the accuracy of the train data for the current fold\n",
" accuracy = sum(y_pred == y_test)[0] / len(y_pred)\n",
" accuracies.append(accuracy)\n",
" print(f'Accuracy of fold {i}: {accuracy}')\n",
"\n",
"# calculate the average accuracy over all folds\n",
"avg_accuracy = sum(accuracies) / len(accuracies)\n",
"print(f'Avg accuracy {avg_accuracy}')"
]
},
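{
"cell_type": "code",
"execution_count": null,
"id": "added-pca-variance-check",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (added, assumes the imports from the previous cell): one\n",
"# way to sanity-check the choice of n_features = 8 is to fit a PCA on the full\n",
"# preprocessed matrix and look at the cumulative explained variance.\n",
"import numpy as np\n",
"\n",
"pca_check = decomposition.PCA().fit(X)\n",
"cumulative = np.cumsum(pca_check.explained_variance_ratio_)\n",
"for k in (2, 4, 8, 16):\n",
"    print(f'{k} components explain {cumulative[k - 1]:.2%} of the variance')"
]
},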
{
"cell_type": "code",
"execution_count": 42,
"id": "95215693-47c9-4202-92f5-efbc65bc32c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training fold 0 for 20 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1\n",
" 1 1 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 1]\n",
"Accuracy of fold 0: 0.5833333333333334\n",
"Training fold 1 for 20 epochs\n",
"Train samples:\t237\n",
"Test samples:\t60\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 0 1 0 1 1 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1\n",
" 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1]\n",
"Accuracy of fold 1: 0.5\n",
"Training fold 2 for 20 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0\n",
" 0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0]\n",
"Accuracy of fold 2: 0.559322033898305\n",
"Training fold 3 for 20 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1\n",
" 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 0 0 1 1 1]\n",
"Accuracy of fold 3: 0.576271186440678\n",
"Training fold 4 for 20 epochs\n",
"Train samples:\t238\n",
"Test samples:\t59\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\maxwi\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1\n",
" 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1]\n",
"Accuracy of fold 4: 0.5254237288135594\n",
"Avg accuracy 0.5488700564971751\n"
]
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"# number of components extracted from the pca\n",
"n_features = 10\n",
"\n",
"k_folds = 5\n",
"\n",
"# used to split the dataset into k folds\n",
"kf = KFold(n_splits=k_folds)\n",
"\n",
"accuracies = []\n",
"for i, (train_idx, test_idx) in enumerate(kf.split(X[numeric_columns])):\n",
" print(f'Training fold {i} for {epochs} epochs')\n",
"\n",
" # extract train and test data from the cleaned dataset\n",
" X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
" y_train, y_test = y[train_idx], y[test_idx]\n",
"\n",
" print(f'Train samples:\\t{len(X_train)}')\n",
" print(f'Test samples:\\t{len(X_test)}')\n",
"\n",
" # do pca based on the train data of the given fold to extract 'n_features'\n",
" #pca = decomposition.PCA(n_components=n_features)\n",
" #pca.fit(X_train)\n",
" #X_train = pca.transform(X_train)\n",
"\n",
" model = KMeans(n_clusters=2)\n",
" model.fit(X_train)\n",
"\n",
" #X_test = pca.transform(X_test)\n",
" y_pred = model.predict(X_test)\n",
" print(y_pred)\n",
" \n",
"\n",
" # calculate the accuracy of the train data for the current fold\n",
" accuracy1 = sum(y_pred == y_test)[0] / len(y_pred)\n",
" accuracy2 = sum(y_pred != y_test)[0] / len(y_pred)\n",
" accuracy = max(accuracy1, accuracy2)\n",
" accuracies.append(accuracy)\n",
" print(f'Accuracy of fold {i}: {accuracy}')\n",
"\n",
"# calculate the average accuracy over all folds\n",
"avg_accuracy = sum(accuracies) / len(accuracies)\n",
"print(f'Avg accuracy {avg_accuracy}')"
]
},
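{
"cell_type": "code",
"execution_count": null,
"id": "added-cluster-label-alignment",
"metadata": {},
"outputs": [],
"source": [
"# Alternative alignment sketch (added for illustration): instead of taking the\n",
"# better of the two possible label assignments on the test fold, map each\n",
"# cluster to the majority training label it contains and score the test fold\n",
"# with that fixed mapping. Uses model, X_train, y_train, X_test and y_test\n",
"# left over from the last fold of the loop above.\n",
"import numpy as np\n",
"\n",
"train_clusters = model.predict(X_train)\n",
"mapping = {c: int(round(y_train[train_clusters == c].mean())) for c in np.unique(train_clusters)}\n",
"aligned_pred = np.array([mapping[c] for c in model.predict(X_test)])\n",
"print('aligned test accuracy:', (aligned_pred == y_test.ravel()).mean())"
]
},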
{
"cell_type": "code",
"execution_count": null,
"id": "880302e4-82c1-47b9-9fe3-cb3567511639",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}