DSA_SS24/notebooks/cluster_features.ipynb

508 lines
4.3 MiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import matplotlib.cm as cm\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import sqlite3\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"from pandas.plotting import parallel_coordinates\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_samples, silhouette_score\n",
"from sklearn.metrics.pairwise import euclidean_distances\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"from sklearn.metrics import confusion_matrix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Data from Database"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# connect to the database\n",
"conn = sqlite3.connect('../features.db')\n",
"c = conn.cursor()\n",
"# get training, validation and test data\n",
"train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n",
"valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n",
"test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n",
"# close the connection\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature names: ['age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
"Label names: ['GSVT', 'AFIB', 'SR', 'SB']\n"
]
}
],
"source": [
"feature_names = train.columns[2:]\n",
"print('Feature names:', list(feature_names))\n",
"\n",
"with open('../settings.json', 'r') as f:\n",
" settings = json.load(f)\n",
"label_names = list(settings['labels'].keys())\n",
"print('Label names:', label_names)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare Data for Clustering"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train_x shape: (4378, 10)\n",
"features: ['id', 'age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
"number of classes: 4\n"
]
}
],
"source": [
"# get the target and features\n",
"train_y = train['y']\n",
"train_y = train_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
"train_x = train.drop(columns=['y'])\n",
"\n",
"valid_y = valid['y']\n",
"valid_y = valid_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
"valid_x = valid.drop(columns=['y'])\n",
"\n",
"test_y = test['y']\n",
"test_y = test_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
"test_x = test.drop(columns=['y'])\n",
"\n",
"# add train validation and test data wit concat\n",
"data_x = pd.concat([train_x, valid_x, test_x], axis=0)\n",
"data_y = pd.concat([train_y, valid_y, test_y], axis=0)\n",
"\n",
"# drop id column\n",
"data_x = data_x.drop(columns=['id'])\n",
"print('train_x shape:', data_x.shape)\n",
"\n",
"\n",
"# dealing with missing values\n",
"# Create an imputer object with a mean filling strategy\n",
"imputer = SimpleImputer(strategy='mean')\n",
"data_x = imputer.fit_transform(data_x)\n",
"\n",
"# Scale Data between 0 and 1\n",
"scaler = MinMaxScaler()\n",
"# Fit the scaler to your data and then transform it\n",
"data_x = scaler.fit_transform(data_x)\n",
"# convert to Series\n",
"data_x = pd.DataFrame(data_x)\n",
"\n",
"# print column names\n",
"print('features:', train_x.columns.to_list())\n",
"num_classes= len(set(valid_y.to_list()))\n",
"print('number of classes:', num_classes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cluster Data with K-means"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqUAAAHHCAYAAACGDCH+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOB0lEQVR4nOzdd1hT1xsH8G/CCBsEGaKCiDhQcaAibgVFxb1HFfcoah211rq1FfdedVSto+7Rah24F+6Fe4uDpcgQ2bm/P/iRGgEFDdwkfD8+eR5z7snNewfJm3PuOVciCIIAIiIiIiIRScUOgIiIiIiISSkRERERiY5JKRERERGJjkkpEREREYmOSSkRERERiY5JKRERERGJjkkpEREREYmOSSkRERERiY5JKRERERGJTuOS0ocPH6JJkyYwNzeHRCLBnj17VLr+Z8+eQSKRYN26dSpdryZr0KABGjRokO/vm5qaip9++gnFixeHVCpFmzZtvnmdJ06cgEQiwYkTJ755XRnWrVsHiUSCZ8+eKco+3Wd5dV5NnjwZEolEpevMSsY2Xr58Oc/fi+hj+XWOF1R5uX/F+u7Ia8wT8s5XJaWPHz/GwIEDUbJkSRgYGMDMzAy1a9fGwoULkZCQoOoYlfj5+SE4OBi//fYbNmzYgGrVquXp++WnXr16QSKRwMzMLMv9+PDhQ0gkEkgkEsyZMyfX63/9+jUmT56M69evqyDavPfHH39g9uzZ6NChA9avX48RI0ZkW1cul+PPP/+Eh4cHLC0tYWpqitKlS6Nnz544f/58PkYtvunTp6v8x1pe+PfffzF58mSxw8gzGV9cn/6tCoKAgQMHQiKRKLY/48eSRCLBxo0bs1xf7dq1IZFIUKFChbwOPV8kJiZi/vz58PDwgLm5OQwMDFC6dGkMGTIEDx48yLc4Nm/ejAULFuTb+2mqjO+njIeJiQlKliyJDh06YOfOnZDL5WKHqNFu3ryJ3r17w8nJCQYGBjAxMUHlypXx008/4cmTJ2KHl3+EXNq3b59gaGgoWFhYCMOGDRNWrlwpLFmyROjSpYugp6cn9O/fP7erzLEPHz4IAIRx48bl2XvI5XIhISFBSE1NzbP3yI6fn5+gq6sr6OjoCFu3bs20fNKkSYKBgYEAQJg9e3au13/p0iUBgLB27dpcvS4pKUlISkrK9ft9q86dOwtFixbNUV1/f38BgNC6dWth4cKFwtKlS4Vhw4YJLi4uwqRJkxT10tLShISEBCEtLU1lcaampgoJCQmCXC5XlNWvX1+oX7++4nlenVcpKSlCQkKCUpmxsbHg5+en0vdZu3atAEC4dOmSytaZccy01dOnTzP9rcrlcmHQoEECAGHChAmK8uPHjwsABAMDA6FZs2bZrsvAwEAoX758vsSflyIjIwV3d3cBgNCiRQthwYIFwurVq4XRo0cLxYsXF/T09BR1J02alKfnia+vr+Do6Jhn61d3Od2/fn5+gkwmEzZs2CBs2LBBWLlypTBu3DjBzc1NACA0aNBAiImJUXqNWN8deU3Vn+crV64UdHR0BFtbW2HkyJHCypUrhWXLlgnff/+9YGtrK+jp6YmSk4hBNzcJ7NOnT9GlSxc4Ojri2LFjKFKkiGKZv78/Hj16hP3796sgVc5aZGQkAMDCwiLP3kMikcDAwCDP1v8lMpkMtWvXxl9//YVOnTopLdu8eTN8fX2xc+fOfInlw4cPMDIygr6+fr6836ciIiJydKzDw8OxbNky9O/fHytXrlRatmDBAsV5AwBSqVTlx1dHRwc6OjqfraPq8yo+Ph7GxsbQ1dWFrm6u/oxJREOHDsWKFSswbtw4TJ06NdPy5s2b4++//8abN29QuHBhRfnmzZtha2sLFxcXvHv3Lj9DzhO9evXCtWvXsGPHDrRv315p2bRp0zBu3DiRIlMNuVyO5ORkUb9L8oKuri6+++47pbJff/0VM2bMwNixY9G/f39s3bpVsUys7468ps
rP83PnzmHw4MGoXbs29u3bB1NTU6Xlc+fOxW+//fbF9WR8X2u83GSwGb/wz549m6P6KSkpwtSpU4WSJUsK+vr6gqOjozB27FghMTFRqZ6jo6Pg6+srnD59Wqhevbogk8kEJycnYf369Yo6Gb/mPn5k/Lr18/PL8pduVr8ADx8+LNSuXVswNzcXjI2NhdKlSwtjx45VLM9okfi0NfHo0aNCnTp1BCMjI8Hc3Fxo1aqVcOfOnSzf7+HDh4Kfn59gbm4umJmZCb169RLi4+O/uL/8/PwEY2NjYd26dYJMJhPevXunWHbx4kUBgLBz585MrS9v374VRo0aJVSoUEEwNjYWTE1NhaZNmwrXr19X1Mloifn0kbGd9evXF8qXLy9cvnxZqFu3rmBoaCj88MMPimUft/r17NlTkMlkmba/SZMmgoWFhfDq1avPbuf79++FkSNHCsWKFRP09fWF0qVLC7Nnz1a0NGYcg08fx48fz3J9QUFBAgBh3bp1X9jD/+2Hj9eVse03btwQ6tWrJxgaGgrOzs7C9u3bBUEQhBMnTgg1atQQDAwMhNKlSwuBgYFK68xoRXz69KnSOj/eZ1mdVzdu3BD8/PwEJycnQSaTCba2tkLv3r2FN2/eKK0/47y6ffu20LVrV8HCwkKoXLmy0rIMWe03Pz8/4dixYwIAYdeuXZn2yaZNmwQAwrlz57LdbxnbePLkSWHAgAGCpaWlYGpqKvTo0UOIiorKVP/ff/9V/L2YmJgIzZs3F27duqVY7ufnl2WsgiAIVapUEdq2bau0vgoVKggAhBs3bijKtmzZIgBQOg9fvnwp9O7dW7CxsRH09fUFV1dXYc2aNZniS0xMFCZOnCg4OzsL+vr6QrFixYTRo0dn+mwCIPj7+wu7d+8Wypcvr1jngQMHst1XGT5tKR02bJgAQOnzJkPGebl+/XrB2NhYWLZsmdLy8uXLC0OHDlWcq5/asGGDULVqVcHAwEAoVKiQ0LlzZyEkJESpzqlTp4QOHToIxYsXV2zz8OHDhQ8fPijVy/gcevnypdC6dWvB2NhYKFy4sDBq1KhMrTV//fWXULVqVcHExEQwNTUVKlSoICxYsOCz++X8+fMCgBz3qn16jmf3GS0I6cfr456R2NhY4YcffhAcHR0FfX19wdraWvD29hauXLkiCEL632l23yuCkPvzZOPGjYKrq6ugq6sr7N69+6v3kSAIwuzZswVPT0/B0tJSMDAwEKpWrar4TMrqvXNyjp4+fVqoVq2aIJPJhJIlSworVqzIVUupsbFxtsubNGkiSCQS4f79+4qyTz8Hk5KShAkTJghVq1YVzMzMBCMjI6FOnTrCsWPHMq3vzZs3wnfffSeYmpoK5ubmQs+ePYXr169nOva5OV+/9N2T4WvyhNDQUKFXr15C0aJFBX19fcHOzk5o1aqV0vdCdvtNV1dXePHixWfrfexz39fh4eFCnz59BBsbG0Emkwlubm6Zvhuz+h7Mbrsy9u/jx4+FJk2aCEZGRkKRIkWEKVOmZNpvX3uufyxXTSz//PMPSpYsiVq1auWofr9+/bB+/Xp06NABo0aNwoULFxAQEIC7d+9i9+7dSnUfPXqEDh06oG/fvvDz88Mff/yBXr16wd3dHeXLl0e7du1gYWGBESNGoGvXrmjevDlMTExyEz5u376NFi1awM3NDVOnToVMJsOjR49w9uzZz77uyJEjaNasGUqWLInJkycjISEBixcvRu3atXH16lWUKFFCqX6nTp3g5OSEgIAAXL16FatXr4aNjQ1mzpyZozjbtWuHQYMGYdeuXejTpw+A9JaSsmXLomrVqpnqP3nyBHv27EHHjh3h5OSE8PBw/P7776hfvz7u3LkDe3t7lCtXDlOnTsXEiRMxYMAA1K1bFwCUjuXbt2/RrFkzdOnSBd999x1sbW2zjG/hwoU4duwY/Pz8EBQUBB0dHfz+++84fPgwNmzYAHt7+2y3TRAEtGrVCsePH0ffvn1RuXJlHD
p0CKNHj8arV68wf/58WFtbY8OGDfjtt9/w/v17BAQEAADKlSuX5TodHR0BANu3b0fHjh2/6tfiu3fv0KJFC3Tp0gU
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"kmeans = KMeans(n_clusters=num_classes, random_state=42)\n",
"clusters = kmeans.fit_predict(data_x)\n",
"\n",
"corr_ma = confusion_matrix(data_y, clusters)\n",
"sns.heatmap(corr_ma, annot=True, fmt=\"d\", cmap='Blues', yticklabels=label_names)\n",
"plt.xlabel('K-Means Clusters')\n",
"plt.ylabel('Diagnosis Group Labels')\n",
"plt.title('Confusion Matrix of Similiarity between KMeans Clusters and Diagnosis Groups')\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cluster 0: 1464 patients\n",
"Cluster 1: 1772 patients\n",
"Cluster 2: 579 patients\n",
"Cluster 3: 563 patients\n"
]
}
],
"source": [
"# Initialize a dictionary to count patients in each cluster\n",
"cluster_patient_count = {i: 0 for i in range(kmeans.n_clusters)}\n",
"\n",
"# Iterate over the assigned clusters and increment the count for each cluster\n",
"for cluster in clusters:\n",
" cluster_patient_count[cluster] += 1\n",
"\n",
"# Print the number of patients in each cluster\n",
"for cluster, count in cluster_patient_count.items():\n",
" print(f\"Cluster {cluster}: {count} patients\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABmkAAAJwCAYAAABicYUDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUVfvw8e+WbEnvDUIKCb2HXgSEEDoqiFhoAoogiIoK+lOKII9KUZCiIh0boKigiCA8VEU6SoeElpCEVFI3uzvvH3mzD0sSCJgQyv25rlywM2dmzpnZmZ17zpxzVIqiKAghhBBCCCGEEEIIIYQQQog7Sl3RGRBCCCGEEEIIIYQQQgghhHgQSSWNEEIIIYQQQgghhBBCCCFEBZBKGiGEEEIIIYQQQgghhBBCiAoglTRCCCGEEEIIIYQQQgghhBAVQCpphBBCCCGEEEIIIYQQQgghKoBU0gghhBBCCCGEEEIIIYQQQlQAqaQRQgghhBBCCCGEEEIIIYSoAFJJI4QQQgghhBBCCCGEEEIIUQGkkkYIIYQQQgghhBBCCCGEEKICSCWNEGVkyZIlqFQqYmNjKzor5ebflHHr1q2oVCq2bt1a5vm6lkqlYuLEieW6jeuFhITQvXv3Ml1nRZTj1KlTdOrUCTc3N1QqFWvXrr2j2/83QkJCGDRoUEVn41+bOHEiKpWqorMBQGxsLCqViiVLltim3U35Kyt36tpUnIrenxcuXMBgMLBz584Ky0Nx+vXrR9++fSs6G0IIIW7RgxAPiQIVfQ9ztzObzbz++usEBQWhVqt55JFHKjpLpXa/HNviYpmK1K5dO9q1a2f7fLflr6xUVFx+N+zPrl27MmzYsArbfnE2bNiAs7MzSUlJFZ0VUQpSSSPue4XBQuGfwWCgWrVqvPjiiyQkJNzy+t57770Ke3idnZ3NxIkTb/lh4vnz5xk+fDghISHo9Xp8fX155JFH7roHc3dK4Q/49OnTKzord5WBAwdy5MgRpk6dyvLly2ncuDFffvklH3300R3NR7t27Wznq1qtxtXVlerVq9O/f39+++23O5oXcXPlcU0srDgp/NNoNPj6+tKnTx+OHTtWptsqb7d73b4TJk+eTLNmzWjVqpVt2qBBg+z2vaurK/Xr12fGjBnk5eUVWcfBgwd55plnCAoKQq/X4+npSceOHVm8eDEWi6VI+rS0NAwGAyqVqsRj+cYbb7BmzRoOHTpUdoUVQogH2IMcDxXeU6xevdpuuslkonv37qjVahYtWnTDZVUqFStWrCg2TatWrVCpVNSpU+eWynG3ys3NZdasWTRr1gw3Nze778rJkyfvWD4qIgYpS4sWLeLDDz+kT58+LF26lJdffpmjR48yceLEO1qJWdy5HxgYSHR0NLNnz+bq1at3LC/i5n7++edyeVEzJCTE7nvg5ORE06ZNWbZsWZlvq7zdrdeGnTt3snHjRt544w3btOtjWgcHB8LCwhgwYABnz54tso6MjAwmTZpE/fr1cXZ2xmg0UqdOHd544w3i4uKK3W7fvn1RqVR2271W586dCQ8PZ9q0aWVTUFG+FCHuc4sXL1YAZfLkycry5cuVzz//XBk4cKCiVquV0NBQJSsr65bW5+TkpAwcOLDIdLPZrOTk5ChWq7WMcl5UUlKSAigTJkwo9TI7duxQXF1dFVdXV+WVV15RFi5cqEyZMkUJDw9XVCqVMnv27FKv69+U0WKxKDk5OYrFYrnlZW9FafZPTEyMAigffvhhmWwzODhY6datW5msq9CtHud/Kzs7WwGUt956y256t27dlODg4DuWD0VRlLZt2yqVK1dWli9frixfvlxZsGCBMnbsWCUsLEwBlL59+yomk8lumdzc3CLT7kX5+flKTk5ORWdDUZT/nSeLFy+2TSsufyVdE/+NLVu2KIAyevRoZfny5cqiRYuUMWPGKAaDQfHy8lLi4+PLfFtbtmwps3Ve60bX7Yo83omJiYqDg4
Py5Zdf2k0fOHCgotfrbeffnDlzlHbt2imA8sQTT9il/fzzzxWNRqMEBgYqb7zxhrJw4UJl1qxZSvfu3RWVSqVMnTq1yHY/++wzxWAwKP7+/kWuN9dq2rSp0r9//7IprBBCPOAe5Hio8Hd+1apVtmkmk0np0aOHolKplIULF950WYPBoHTp0qXI/MJ7JYPBoNSuXfuWy3K3SUpKUiIjIxVA6d69u/LRRx8pCxcuVF577TUlKChIcXBwsKWdMGGCUp6PkyoiBilLTzzxhFKpUiW7aatWrSrXe87iXH/uL1q0SHnvvfeUTp06KSqVSgkODlYOHTpkt8zdFI/8G1arVcnJyVHMZnNFZ0VRlIIYt23btrbPxeVv5MiR5XJeBQcHKw0aNLDd33/wwQdKtWrVFED57LPPynxbZR0bXquka0NFH+9evXopnTp1sptWXEz74osvKjqdTvH09FQuXbpkS3vmzBklNDRU0Wg0Sr9+/ZRPPvlE+eyzz5QXX3xR8fLyUiIiIopsMz09XTEYDEpISIgSFBRU4m/vvHnzFEdHRyUjI6NsCy3KnPbOVQcJUbG6dOlC48aNARg6dCheXl7MnDmTH374gSeffPJfr1+j0aDRaP71espSamoqffr0wWg0snPnTqpWrWqb98orrxAdHc2YMWOIjIykZcuWJa4nKysLJyenf1VGtVqNwWC4rWVF+Sts/uru7l7u27JarZhMpht+H9zc3HjmmWfspv3nP/9h9OjRzJs3j5CQEN5//33bPL1eX275vZO0Wi1a7d3703yn89emTRv69Olj+1y9enVeeOEFli1bxuuvv37H8lFeKvJ4r1ixAq1WS48ePYrM02q1duffiBEjaNasGd988w0zZ84kMDCQP/74g+HDh9OiRQt+/vlnXFxcbOnHjBnD3r17+fvvv4vdbteuXQkODubLL79kypQpxeavb9++TJgwgXnz5uHs7FwGJRZCCPEgxkPXy8/Pp2/fvqxbt45PP/2UIUOG3HSZrl278uOPP3LlyhW8vb1t07/88kv8/PyIiIggNTW1PLN9RwwaNIgDBw6wevVqevfubTfv3Xff5a233qqgnJWN0sQgZSUxMfGOxFXwv1j9Rq499wHGjx/P77//Tvfu3enZsyfHjh3DaDQCd388UlqFLYfuVnc6f5UqVbK7vx80aBBhYWHMmjXrruui63ZU5PFOTExk/fr1LFiwoNj518a0gwcPplq1aowePZqlS5cyfvx4zGYzjz32GAkJCWzdupXWrVvbLT916lS7Zx+F1qxZg8ViYdGiRTz88MNs27aNtm3bFknXu3dvRo0axapVq3j22WfLoMSivEh3Z+KB9fDDDwMQExMDwPTp02nZsiVeXl4YjUYiIyOLNItXqVRkZWWxdOlSW5PFwv42S+qD+ZdffqFNmzY4OTnh4uJCt27d+Oeff+zSDBo0CGdnZy5dusQjjzyCs7MzPj4+jB071tZdTGxsLD4+PgBMmjTJtv0bNYf99NNPuXz5Mh9++KFdBQ2A0Wi0lWPy5Mm26YXl+O9//8uIESPw9fWlcuXKJZbRarUyceJEAgMDcXR0pH379hw9erRIX6TFjfvQrl076tSpw9GjR2nfvj2Ojo5UqlSJDz74wC6vJpOJd955h8jISNzc3HBycqJNmzZs2bKlxLKXhcWLF/Pwww/j6+uLXq+nVq1azJ8/v8T0GzdupEGDBhgMBmrVqsV3331XJE1aWhpjxoyxdQ0UHh7O+++/j9VqvWFerl69ypgxY+y6rIuKimL//v03XO7cuXOMGDGC6tWrYzQa8fLy4vHHH7c7hhMnTiQ4OBiA1157DZVKRUhICO3atWP9+vWcO3fO9n0LCQmxLZeXl8eECRMIDw9Hr9cTFBTE66+/XqRLJJVKxYsvvsjKlSupXbs2er2eDRs23DDfxdFoNMyePZtatWrxySefkJ6ebpt3/fet8Lu6Y8cORo8ejY+PD+7u7jz//POYTCbS0tIYMGAAHh
4eeHh48Prrr6Moit32rFYrH330EbVr18ZgMODn58fzzz9fJAgvHJNox44dNG3aFIPBQFhYWJHm4/n5+UyaNImIiAg
"text/plain": [
"<Figure size 2000x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Apply PCA to reduce to 2 dimensions\n",
"pca = PCA(n_components=2)\n",
"reduced_data = pca.fit_transform(data_x)\n",
"\n",
"# Create a figure with two subplots\n",
"fig, axs = plt.subplots(1, 2, figsize=(20, 7))\n",
"\n",
"# First subplot for patient labels after PCA reduction\n",
"for i in range(len(set(data_y))):\n",
" cluster_data = reduced_data[data_y == i]\n",
" axs[0].scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Label {label_names[i]}', alpha=0.7, edgecolors='w')\n",
"\n",
"axs[0].set_title('Patient Original Labels after Dimensionality Reduction (PCA)')\n",
"axs[0].set_xlabel('Reduced Dimension 1')\n",
"axs[0].set_ylabel('Reduced Dimension 2')\n",
"axs[0].legend()\n",
"\n",
"# Second subplot for patient clusters after PCA reduction\n",
"for i in range(kmeans.n_clusters):\n",
" cluster_data = reduced_data[clusters == i]\n",
" axs[1].scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {i}', alpha=0.7, edgecolors='w')\n",
"\n",
"axs[1].set_title('Patient K-Means Clusters after Dimensionality Reduction (PCA)')\n",
"axs[1].set_xlabel('Reduced Dimension 1')\n",
"axs[1].set_ylabel('Reduced Dimension 2')\n",
"axs[1].legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABkgAAAKyCAYAAACezlJSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVxU1f8/8NcdGGaGZRhWWUQWAVHcEsUFTc0Fd/vkki1uqeWSpmWa9s0dyXIpzbTS1NQ2taw0Sy39uLSYqYgbAgoqoCgw7DDb+f3B797PDDPAgDOs7+fj4aOYuXPvubOe932f8z4cY4yBEEIIIYQQQgghhBBCCCGkCRHVdQMIIYQQQgghhBBCCCGEEEJqGyVICCGEEEIIIYQQQgghhBDS5FCChBBCCCGEEEIIIYQQQgghTQ4lSAghhBBCCCGEEEIIIYQQ0uRQgoQQQgghhBBCCCGEEEIIIU0OJUgIIYQQQgghhBBCCCGEENLkUIKEEEIIIYQQQgghhBBCCCFNDiVICCGEEEIIIYQQQgghhBDS5FCChBBCCCGEEEIIIYQQQgghTQ4lSOqpnTt3guM4pKSk1HVTiJUtW7YMHMfVdTPqLY1GgwULFsDPzw8ikQhPP/10XTfJbI3ltU1JSQHHcdi5c2ddNwUA0KdPH/Tp00f4u761z1ICAgIwadKkWj9ufXg+hwwZgmnTptXZ8U355Zdf4OjoiIcPH9Z1UwghhBCzNYW48nHO8eTJk+A4DidPnrR4u/RxHIdly5ZZ9RjlBQQEYNiwYRbdZ12cR2JiIgYOHAhnZ2dwHIeDBw/W6vEfR1315y2tPsW1pmKV+tQ+S6mt7yZT6vr5vHv3LqRSKc6ePVtnbTBl3LhxGDt2bF03g1gJJUiqie988f+kUilCQ0Px6quv4sGDB9Xe3+rVq+vsB76oqAjLli0z+wuX/4Lev3+/we0qlQrDhg2DSCTC559/XuljOY7Dnj17TG4TFRUFjuPQtm3bap1HfVVSUoINGzaga9eucHZ2Nniv3Lx5s9ba8eWXX+KDDz6oteNZ2ueff473338fo0ePxq5duzBv3jxcu3YNy5Ytq9VAz9Rn38fHB9HR0di4cSPy8/NrrS2kaj///LNVgreAgACD94GDgwMiIyPxxRdfWPxY1lZfvxvOnj2Lo0ePYuHChcJt+r8hHMdBLBYjKCgIEyZMwK1bt4z2kZeXh+XLl6NDhw5wdHSETCZD27ZtsXDhQqSnp5s87tixY8FxnMFx9Q0aNAjBwcGIjY21zIkSQghp0ppyXMm7c+cOpk+fjoCAAEgkEnh6euLpp5+udxfFagt/4Xft2rV13ZR6ZeLEiYiPj0dMTAx2796Nzp0710k/tk+fPsLnVSQSQS6Xo1WrVhg/fjyOHTtWq20hVbPGd2L5mMTGxgaenp4YPXo0rl+/btFjWVtNv7drw4oVK9C1a1dERUUJt02aNMnguZfL5ejQoQPWrVuH0tJSo31cunQJL774Ivz8/CCRSODq6or+/ftjx44d0Gq1RtsrlUpIpVJwHFfha7lw4UIcOHAAcXFxljtZUn8wUi07duxgANiKFSvY7t272WeffcYmTpzIRCIRCwwMZIWFhdXan4ODA5s4caLR7RqNhhUXFzOdTmehlht7+PAhA8CWLl1q1vYnTpxgANi+ffuE21QqFRs+fDjjOI5t27atysdKpVI2ePBgo/tv374t3B8eHl7tc6lvHj58yCIiIhgANmzYMPbBBx+wbdu2sTfffJP5+fkxsVgsbLt06VJmzY/i0KFDmb+/v9X2b23PPvss8/X1Nbht3759DAA7ceJErbWj/Gf/888/Z6tXr2YDBw5kHMcxf39/FhcXZ/AYtVrNiouLa62N1qLT6VhxcTHTaDR13RTGGGO9e/dmvXv3Fv421b5Zs2ZZ5XPl7+/POnbsyHbv3s12797N3nvvPRYaGsoAsE8//dTixzL1+2ApFX031PXrPXLkSDZw4E
CD2/jfkDlz5gifv1dffZXZ2dkxV1dXlpaWJmybnJzMAgMDmY2NDRs3bhz76KOP2KeffspeffVV5ubmxkJCQoyOmZuby6RSKQsICGB+fn4V/vZ+/PHHzN7enuXl5Vn2pAkhhDQ5TTmuZIyxM2fOMLlczuRyOXv99dfZtm3b2KpVq1hwcDDjOI5t3LjR7H09zjlqtVpWXFzMtFpttR9bHeY8P3xM/P7771vkmP7+/mzo0KEW2Revuq/z4yoqKmIA2Ntvv21we13EuL1792bNmzcX4oCtW7ey+fPns6CgIAaAjR07lqlUKoPHlJSUGN3WENWnuJb/nOzYsUO4zVT7KvpOfBymYpK5c+cyqVTK3NzcWEZGhsWPZa1rHpV9b9fl652ZmcnEYjH78ssvDW6fOHEik0gkwudv06ZNrE+fPgwAe/bZZw22/eyzz5iNjQ3z8fFhCxcuZNu2bWMbNmxgw4YNYxzHsZiYGKPjfvrpp0wqlTIvLy+j7xt9kZGRbPz48ZY5WVKv2NZeKqZxGTx4MDp37gwAmDp1Ktzc3LB+/Xr88MMPeO655x57/zY2NrCxsXns/ViTWq3G2LFjcejQIXzyySeYMmVKlY8ZMmQIfvzxRzx69Aju7u7C7V9++SWaNWuGkJAQ5OTkWLPZtWLSpEm4ePEi9u/fj1GjRhnct3LlSrz99tt11DLL0Ol0UKlUkEqlVj9WZmYmFAqF1Y8DAIWFhXBwcKh0G/3PPgAsWrQIv//+O4YNG4YRI0bg+vXrkMlkAABbW1vY2jb8r1l+VGN9Vdvt8/X1xYsvvij8PWnSJAQFBWHDhg31rixUTdTl652ZmYnDhw9j69atJu/v1asXRo8eDQCYPHkyQkNDMWfOHOzatQuLFi2CRqPBM888gwcPHuDkyZPo2bOnweNjYmKwZs0ao/0eOHAAWq0Wn3/+OZ566imcOnUKvXv3Ntpu1KhRmD17Nvbt24eXXnrJAmdMCCGkqWuKcWVOTg5Gjx4NmUyGs2fPomXLlsJ9r7/+OqKjozF37lxERESgR48eFe6H77s/zjmKRKJ63c9t6vjSprURD5oT4zo7OxvEAQDw7rvvYs6cOfj4448REBBg0NeUSCRWa29tqu9xbW23Tz8mAYBWrVphxowZ+OKLL7BgwYJaa4e11OXrvWfPHtja2mL48OFG99na2hp8/mbOnImuXbvim2++wfr16+Hj44O//voL06dPR/fu3fHzzz/DyclJ2H7u3Lk4f/48rly5YvK4Q4YMgb+/P7788kusWrXKZPvGjh2LpUuX4uOPP4ajo6MFzpjUF1Riy0KeeuopAMDt27cBAGvXrkWPHj3g5uYGmUyGiIgIo9JUHMehsLAQu3btEqaJ8fUpK6qjeuTIEfTq1QsODg5wcnLC0KFDcfXqVYNtJk2aBEdHR6SlpeHpp5+Go6MjPDw8MH/+fGEqWUpKCjw8PAAAy5cvF45vbkkajUaDcePG4YcffsCWLVvMvig4cuRISCQS7Nu3z+D2L7/8EmPHjq2wY7tnzx5ERERAJpPB1dUV48aNw927dw22OX36NMaMGYMWLVpAIpHAz88P8+bNQ3FxscF25jw/vK+//hoRERFwcnKCXC5Hu3bt8OGHH1Z6jn///TcOHz6MKVOmGCVHgLJOUmXTpiur/1/+NcrPz8fcuXMNpqUPGDAAFy5cAFA2Dfjw4cNITU0VXuOAgADh8aWlpVi6dCmCg4OF52zBggVGUxQ5jsOrr76KvXv3Ijw8HBKJBL/88kuNnyOg6s8I/zycOHECV69eFdq/c+dOjBkzBgDQt29f4Xb9qaHV+ZwkJydjyJAhcHJywgsvvFBlu0156qmn8M477yA1NdWghJyp2p38c7lv3z60adMGMpkM3bt3R3x8PADgk08+QXBwMKRSKfr06WOyjNjff/+NQYMGwdnZGfb29ujdu7dRKQL+2ElJSZg0aRIUCgWcnZ0xefJkFBUVGWx77Ngx9OzZEwqFAo6OjmjVqh
UWL14s3F/Re/L3338XnmeFQoGRI0caTUetTjt27NiBp556Cp6enpBIJGjTpg22bNlS+ZNvon2TJk3C5s2bheeb/8c
"text/plain": [
"<Figure size 2000x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pca = PCA(n_components=3)\n",
"reduced_data = pca.fit_transform(data_x)\n",
"\n",
"fig = plt.figure(figsize=(20, 7))\n",
"\n",
"# First subplot\n",
"ax1 = fig.add_subplot(121, projection='3d')\n",
"for i in range(kmeans.n_clusters):\n",
" cluster_data = reduced_data[clusters == i]\n",
" ax1.scatter(cluster_data[:, 0], cluster_data[:, 1], cluster_data[:, 2], label=f'Cluster {i}', alpha=0.7, edgecolors='w')\n",
"ax1.set_title('Patient K-Means Clusters after Dimensionality Reduction (PCA)')\n",
"ax1.set_xlabel('Reduced Dimension 1')\n",
"ax1.set_ylabel('Reduced Dimension 2')\n",
"ax1.set_zlabel('Reduced Dimension 3')\n",
"ax1.legend()\n",
"\n",
"# Second subplot\n",
"ax2 = fig.add_subplot(122, projection='3d')\n",
"for i in range(len(set(data_y))):\n",
" cluster_data = reduced_data[data_y == i]\n",
" ax2.scatter(cluster_data[:, 0], cluster_data[:, 1], cluster_data[:, 2], label=f'Label {label_names[i]}', alpha=0.7, edgecolors='w') \n",
"ax2.set_title('Patient Original Labels after Dimensionality Reduction (PCA)')\n",
"ax2.set_xlabel('Reduced Dimension 1')\n",
"ax2.set_ylabel('Reduced Dimension 2')\n",
"ax2.set_zlabel('Reduced Dimension 3')\n",
"ax2.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Adjusted Rand Index (ARI): 0.15412707550646423\n",
"Normalized Mutual Information (NMI): 0.24282003848756695\n",
"Silhouette Score: 0.46722973644820026\n"
]
}
],
"source": [
"# Calculate Adjusted Rand Index (ARI)\n",
"ari = adjusted_rand_score(data_y, clusters)\n",
"print(f\"Adjusted Rand Index (ARI): {ari}\")\n",
"\n",
"# Calculate Normalized Mutual Information (NMI)\n",
"nmi = normalized_mutual_info_score(data_y, clusters)\n",
"print(f\"Normalized Mutual Information (NMI): {nmi}\")\n",
"\n",
"# Calculate Silhouette Score\n",
"silhouette_avg = silhouette_score(data_x, clusters)\n",
"print(f\"Silhouette Score: {silhouette_avg}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- The ARI and NMI scores (about 0.15 and 0.24 in the run above) show that the clusters only weakly mirror the true label structure. There is some alignment with the diagnosis labels, but the clustering does not capture the underlying groupings well.\n",
"\n",
"- The Silhouette Score indicates that the clustering has identified groups that are somewhat cohesive internally and separated from each other. This suggests that the clustering algorithm has been somewhat successful in identifying meaningful structures within the data, even if those structures don't perfectly align with the true labels."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAyUAAALSCAYAAADDdH6KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd7idRbX/P2/dvZx9em/poQQCBAIh9IB0BKQpIAgXC/4sKFyRKnCxo4DoRRGxgHSkd+m9Q+qpSU7vu79tfn/MOZsckkBAEK/ub579PCfvnv3OzJo1a9pa31GEEIIiiiiiiCKKKKKIIooooohPCeqnXYAiiiiiiCKKKKKIIooo4j8bxUVJEUUUUUQRRRRRRBFFFPGporgoKaKIIooooogiiiiiiCI+VRQXJUUUUUQRRRRRRBFFFFHEp4rioqSIIooooogiiiiiiCKK+FRRXJQUUUQRRRRRRBFFFFFEEZ8qiouSIooooogiiiiiiCKKKOJTRXFRUkQRRRRRRBFFFFFEEUV8qiguSooooogiiiiiiCKKKKKITxXFRUkR/xb40Y9+REtLC5qmsWDBgk+7OBvhpJNOoqmpadozRVG44IILPvC3F1xwAYqifDIF+xTR1NTESSed9GkXo4gi/i3xr2g3UqkUFRUV/OlPf/qn5/2vKI8iNg/btqmvr+fqq6/+tItSxD8RxUXJJ4Df//73KIrCSy+9tMnv99hjD7baaqtPtAz33nvvFk14/x3w4IMP8p3vfIddd92V6667jksvvXSzaU866SQURdnkx+/3/xNL/e+FPfbYoyBHVVWJRqPMnj2bz3/+8zz00EMfWz7/SXo9hffqaSgUYt68efzgBz8gk8l8pHc+88wzXHDBBYyNjX28hf0XwoYy03WdRCLBwoUL+frXv84777zzkd+byWS44IILePzxxz++wgKdnZ3TyqxpGg0NDRx++OG89tprH2teH4RPSj+uuOIKIpEIxxxzTOHZ1GJh6hMMBpk3bx7nnnsuExMTH+r9n1TbbCl6enq44IILtri9puYKm/qcffbZn0gZ/6/0fcMw+OY3v8kll1xCLpf7tItTxD8J+qddgCI+Gdx7771cddVV/xETuEcffRRVVfntb3+LaZofmN7n83Httddu9FzTtE+ieJtFNptF1/99umBdXR2XXXYZAOl0mjVr1nDbbbfxxz/+kaOPPpo//vGPGIZRSL9y5UpU9cPti/wn6fWG2HffffnCF74AyN3mJ598ku9///u8/vrr3HzzzR/6fc888wwXXnghJ510EvF4/GMu7b8OpuQmhGB8fJzXX3+d66+/nquvvprLL7+cb37zmx/6nZlMhgsvvBCQi/GPG8ceeyyf+cxncF2X5cuX86tf/Yr77ruP55577kOdAp977rkfeWL7SeiHbdtcccUVfOMb39ikrf3Vr35FOBwmlUrx4IMPcskll/Doo4/y9NNPb/EJx/u1zT8ijy1FT08PF154IU1NTR+qrS666CKam5unPfukNi7/L/X9k08+mbPPPps///nPfPGLX/y0i1PEPwH/PjOiIv5jMTAwQCAQ2KIFCYCu65xwwgmfcKk+GP9uJzOxWGwjuf7P//wPZ555JldffTVNTU1cfvnlhe98Pt8/u4j/ZzFr1qxpsv2v//ovLMvitttuI5fL/dvp0seF98oNpE4efPDBfOtb32LOnDl85jOf+ZRKt2lsv/3208q86667csghh/CrX/2KX//611v8Hl3X/6U2Pe6++24GBwc5+uijN/n9kUceSVlZGSD1+7Of/Sy33XYbzz33HLvssss/nP+/mjw2xAEHHMAOO+zwaRfjH0I6nSYUCn2s74zH4+y33378/ve/Ly5K/kNQdN/6F8If//hHFi5cSCAQIJFIcMwxx7B27dppaZ588kmOOuooGhoa8Pl81NfX841vfINsNltIc9JJJ3HVVVcB010Y4F0XgR//+MdcddVVtLS0EAwG2W+//V
i7di1CCC6++GLq6uoIBAIceuihjIyMTCvDnXfeyYEHHkhNTQ0+n4/W1lYuvvhiXNedlm7KTe3ll19m8eLFBAIBmpubueaaa7ZIHo7jcPHFF9Pa2orP56OpqYn//u//Jp/PF9IoisJ1111HOp0u1PP3v//9Fst8c9ic//HUcXtnZ+e05/fddx9Lly4lEokQjUbZcccd+fOf//y+eWwqpuSpp55ixx13xO/309ra+r6TkI9LX0DqTDgcZv369Rx22GGEw2HKy8v59re/vVG7fhhomsYvfvEL5s2bx5VXXsn4+Hjhu/fGlNi2zYUXXsjMmTPx+/2Ulpay2267Fdy/3k+vAX784x+zePFiSktLCQQCLFy4kFtuuWWjMimKwle/+lXuuOMOttpqK3w+H/Pnz+f+++/fKO369es55ZRTCrre3NzMGWecgWVZhTRjY2P8v//3/6ivr8fn8zFjxgwuv/xyPM+b9q4bb7yRhQsXFnRk66235oorrvhoggWqqqoKrkkb4vnnn2f//fcnFosRDAZZunQpTz/9dOH7Cy64gLPOOguA5ubmghw7Ozs54ogj2H777ae97+CDD0ZRFO66665peSiKwn333feh5eB5Hj//+c+ZP38+fr+fyspKTj/9dEZHR6ela2pq4qCDDuKpp55ip512wu/309LSwh/+8IePLDOA0tJSbrzxRnRd55JLLik8tyyL8847j4ULFxKLxQiFQixZsoTHHnuskKazs5Py8nIALrzwwoLspvrxG2+8wUknnURLSwt+v5+qqiq++MUvMjw8/JHLu9deewHQ0dFReHbzzTcX+n5ZWRknnHAC69evn/a7TdmwLdH999MPgIceeojddtuNeDxOOBxm9uzZ/Pd///cH1uOOO+6gqamJ1tbWD13vj6NtNmfTt8SOTo1l77zzDnvuuSfBYJDa2lp++MMfFtI8/vjj7LjjjoDc4f84x6P77ruPJUuWEAqFiEQiHHjggbz99tvT0myJ7r1f207NDTZV3veOVVOyfOeddzjuuOMoKSlht912K3y/JTJdvXo1n/3sZ6mqqsLv91NXV8cxxxwzbYwAedr51FNPbTQPKeLfE/+a2wb/JhgfH2doaGij57Ztb/Tskksu4fvf/z5HH300p556KoODg/zyl79k991359VXXy0cs958881kMhnOOOMMSktLeeGFF/jlL3/JunXrCm4cp59+Oj09PTz00EPccMMNmyzbn/70JyzL4mtf+xojIyP88Ic/5Oijj2avvfbi8ccf57vf/S5r1qzhl7/8Jd/+9rf53e9+V/jt73//e8LhMN/85jcJh8M8+uijnHfeeUxMTPCjH/1oWj6jo6N85jOf4eijj+bYY4/lr3/9K2eccQamaX7gzsepp57K9ddfz5FHHsm3vvUtnn/+eS677DKWL1/O7bffDsANN9zAb37zG1544YWCS9bixYvf973AJtvFNE2i0egH/va9mNrFmT9/Pueccw7xeJxXX32V+++/n+OOO26L3/Pmm2+y3377UV5ezgUXXIDjOJx//vlUVlZulPbj1JcpuK7LsmXLWLRoET/+8Y95+OGH+clPfkJraytnnHHGh5bLFDRN49hjj+X73/8+Tz31FAceeOAm011wwQVcdtllnHrqqey0005MTEzw0ksv8corr7Dvvvt+oF5fccUVHHLIIRx//PFYlsWNN97IUUcdxd13371Rnk899RS33XYbX/7yl4lEIvziF7/gs5/9LN3d3ZSWlgLSFWOnnXZibGyM0047jTlz5rB+/XpuueUWMpkMpmmSyWRYunQp69ev5/TTT6ehoYFnnnmGc845h97eXn7+858DciJ37LHHsvfeexdOi5YvX87TTz/N17/+9Q+UYS6XK+hsOp3m6aef5vrrr+e4446btih59NFHOeCAA1i4cCHnn38+qqpy3XXXsddee/Hkk0+y0047ccQRR7Bq1Sr+8pe/8LOf/aywO11eXs6SJUu48847mZiYIBqNIo
Tg6aefRlVVnnzySQ455BBALnZVVWXXXXcF2GI5gLRPv//97zn55JM588wz6ejo4Morr+TVV1/l6aefnubit2bNGo4
"text/plain": [
"<Figure size 1000x800 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 9.38 s\n",
"Wall time: 10.6 s\n"
]
}
],
"source": [
"%%time\n",
"# Compute the distance matrix based on the Euclidean distance between data points\n",
"distance_matrix = euclidean_distances(data_x, data_x)\n",
"\n",
"# Create the heatmap\n",
"plt.figure(figsize=(10, 8))\n",
"sns.heatmap(distance_matrix, cmap='viridis')\n",
"plt.title('Heatmap of Euclidean Distances Between Data Points (Patient Features)')\n",
"plt.xlabel('Data Point Index')\n",
"plt.ylabel('Data Point Index')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAggAAAK9CAYAAABB+5SlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACA+ElEQVR4nO3dd3wUdf4G8Ge27ybZTUJ6IYWS0DsISBUEKYoiIKKCYgcs2Cv2dp7iKWfhVDzPOz0VPX+Kep6C/TxUQFSKQBCEQALpm2TLzOf3x8JCCGASksxu9nn7ysvZyezsJ0OSffKdb1FEREBERER0GIPeBRAREVHoYUAgIiKiehgQiIiIqB4GBCIiIqqHAYGIiIjqYUAgIiKiehgQiIiIqB4GBCIiIqqHAYGIiIjqYUAgagXZ2dmYM2dO8PGqVaugKApWrVoV3Ddy5Eh079699YtrI+bMmYPs7Gy9yyBqMxgQiE7A+vXrcfbZZyMrKws2mw3p6ekYO3YsnnzySb1LaxE///wz7rrrLmzfvr3e5/785z9j2bJlzf6aI0eOhKIowY/4+HgMGDAAL7zwAjRNa5bXeOCBB/D22283y7mI2goGBKIm+uqrr9C/f3+sW7cOl1xyCZ566ilcfPHFMBgMeOKJJ+ocu2nTJixdulSnSpvPzz//jLvvvrtVAwIAZGRk4OWXX8bLL7+MO+64A36/H3PnzsWtt97aLOdnQCCqz6R3AUTh6v7774fL5cLq1asRGxtb53NFRUV1Hlut1lasrO1xuVw477zzgo8vu+wy5OXl4amnnsK9994Ls9msY3VEbRNbEIiaaOvWrejWrVu9cAAASUlJdR4f2QfheH7++WeMGjUKDocD6enpeOSRR+odU1RUhLlz5yI5ORk2mw29evXCSy+9VOeYo/VzAIDt27dDUZR6f+1v3LgRZ599NuLj42Gz2dC/f3+88847wc8vW7YM06ZNAwCMGjUq2OS/atUqZGdn46effsKnn34a3D9y5Mjgc8vKynDNNdcgMzMTVqsVHTt2xMMPP9zkWwQOhwMnnXQS3G43iouLj3mc2+3GddddF3zdvLw8PProozh8EVtFUeB2u/HSSy8Fa2/ovxVRW8YWBKImysrKwtdff40ff/yx2ToXlpaWYvz48TjrrLMwffp0vPHGG7jpppvQo0cPnHbaaQCAmpoajBw5Elu2bMH8+fORk5OD119/HXPmzEFZWRmuvvrqRr/uTz/9hKFDhyI9PR0333wzoqKi8M9//hNTpkzBm2++iTPPPBPDhw/HVVddhT/96U+49dZb0aVLFwBAly5dsHjxYixYsADR0dG47bbbAADJyckAgOrqaowYMQK7du3CZZddhvbt2+Orr77CLbfcgsLCQixevLhJ12rbtm0wGo1HDWgAICI4/fTTsXLlSsydOxe9e/fGhx9+iBtuuAG7du3C448/DgB4+eWXcfHFF2PgwIG49NJLAQAdOnRoUk1EbYoQUZP8+9//FqPRKEajUQYPHiw33nijfPjhh+L1eusdm5WVJbNnzw4+XrlypQCQlStXBveNGDFCAMhf//rX4D6PxyMpKSkyderU4L7FixcLAPnb3/4W3Of1emXw4MESHR0tFRUVx3wNEZGCggIBIC+++GJw3ymnnCI9evSQ2tra4D5N02TIkCHSqVOn4L7XX3/9qOcUEenWrZuMGDGi3v57771XoqKiZPPmzXX233zzzWI0GmXHjh31nnO4ESNGSH5+vhQXF0txcbFs2LBBrrrqKgEgkydPDh43e/ZsycrKCj5+++23BYDcd999dc539tlni6IosmXLluC+qKioOv8+RCTCWwxETTR27Fh8/fXXOP3007Fu3To88sgjGDduHNLT0+s0zTdGdHR0nXvtFosFAwcOxLZt24L7VqxYgZSUFMycOTO4z2w246qrrkJVVRU+/fTTRr1mSUkJPvnkE0yfPh2VlZXYt28f9u3bh/3792PcuHH45Z
dfsGvXriZ9PQDw+uuvY9iwYYiLiwuee9++fRgzZgxUVcVnn332u+fYuHEjEhMTkZiYiC5duuDJJ5/ExIkT8cILLxzzOStWrIDRaMRVV11VZ/91110HEcH777/f5K+JKBLwFgPRCRgwYACWL18Or9eLdevW4a233sLjjz+Os88+G2vXrkXXrl0bdb6MjAwoilJnX1xcHH744Yfg419//RWdOnWCwVA33x9s8v/1118b9ZpbtmyBiOCOO+7AHXfccdRjioqKkJ6e3qjzHvTLL7/ghx9+QGJi4jHP/Xuys7OxdOlSKIoCm82GTp061evncaRff/0VaWlpiImJqbO/qdeJKNIwIBA1A4vFggEDBmDAgAHo3LkzLrzwQrz++utYtGhRo85jNBqPul8O61TXUEcGjYNUVa3z+GBHweuvvx7jxo076nM6duzY6Nc//Pxjx47FjTfeeNTPd+7c+XfPERUVhTFjxjS5BiJqPAYEombWv39/AEBhYWGLnD8rKws//PADNE2r04qwcePG4OeBQMsDEBhBcLgj/3LOzc0FELhN8XtvwscKHcf7XIcOHVBVVdXqb/BZWVn4z3/+g8rKyjqtCEdeJ+D4XxdRpGIfBKImWrly5VH/sl+xYgUAIC8vr0Ved8KECdizZw9ee+214D6/348nn3wS0dHRGDFiBIDAG6DRaKx3j//Pf/5zncdJSUkYOXIknn322aOGmsOHEUZFRQGoHzoOfu5o+6dPn46vv/4aH374Yb3PlZWVwe/3H/uLPQETJkyAqqp46qmn6ux//PHHoShKcFQIcOzaiSIZWxCImmjBggWorq7GmWeeifz8fHi9Xnz11Vd47bXXkJ2djQsvvLBFXvfSSy/Fs88+izlz5uC7775DdnY23njjDXz55ZdYvHhx8K9ll8uFadOm4cknn4SiKOjQoQPefffdo97zX7JkCU4++WT06NEDl1xyCXJzc7F37158/fXX+O2337Bu3ToAQO/evWE0GvHwww+jvLwcVqsVo0ePRlJSEvr164enn34a9913Hzp27IikpCSMHj0aN9xwA9555x1MmjQJc+bMQb9+/eB2u7F+/Xq88cYb2L59OxISEpr9Ok2ePBmjRo3Cbbfdhu3bt6NXr17497//jX/961+45ppr6gxl7NevH/7zn//gscceQ1paGnJycjBo0KBmr4korOg7iIIofL3//vty0UUXSX5+vkRHR4vFYpGOHTvKggULZO/evXWObegwx27dutV7nSOH74mI7N27Vy688EJJSEgQi8UiPXr0qDNs8aDi4mKZOnWqOBwOiYuLk8suu0x+/PHHesMcRUS2bt0qF1xwgaSkpIjZbJb09HSZNGmSvPHGG3WOW7p0qeTm5orRaKzzNezZs0cmTpwoMTExAqDOkMfKykq55ZZbpGPHjmKxWCQhIUGGDBkijz766FGHhR7uWNelIdepsrJSrr32WklLSxOz2SydOnWSP/zhD6JpWp3jNm7cKMOHDxe73S4AOOSRSEQUkSb0fiIiIqI2jX0QiIiIqB4GBCIiIqqHAYGIiIjqYUAgIiKiehgQiIiIqB4GBCIiIqonJCZK0jQNu3fvRkxMDKc8JSIiagQRQWVlJdLS0uot4nYiQiIg7N69G5mZmXqXQUREFLZ27tyJjIyMZjtfSASEg1PD7ty5E06nU+dqiEgXbjeQlhbY3r0bOLDuAxEdX0VFBTIzM+stbX6iQiIgHLyt4HQ6GRCIItXhS107nQwIRI3U3Lfo2UmRiIiI6mFAICIionoYEIiIiKgeBgQiIiKqhwGBiIiI6gmJUQxERDAagQkTDm0Tka4YEIgoNNhswHvv6V0FER3AWwxERERUDwMCERER1cOAQEShwe0OzJ4YFRXYJiJdsQ8CEYWO6mq9KyCiA9iCQERERPUwIBAREVE9DAhERERUDwMCERER1cOAQERERPVwFAMRhQaDARgx4tA2EemKP4XNbOTIkbjmmmuCj7Ozs7F48eLgY0
VR8Pbbb7d6XUQhz24HVq0KfNjteldDFPHCOiB8/fXXMBqNmDhxYqu8nqqqeOihh5Cfnw+73Y74+HgMGjQIf/nLX4L
"text/plain": [
"<Figure size 600x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"n_clusters = kmeans.n_clusters\n",
"# Compute the silhouette scores for each sample\n",
"silhouette_vals = silhouette_samples(data_x, clusters)\n",
"\n",
"# Start plotting\n",
"fig, ax = plt.subplots(figsize=(6, 8))\n",
"y_lower, y_upper = 0, 0\n",
"yticks = []\n",
"\n",
"for i, cluster in enumerate(np.unique(clusters)):\n",
" cluster_silhouette_vals = silhouette_vals[clusters == cluster]\n",
" cluster_silhouette_vals.sort()\n",
" y_upper += len(cluster_silhouette_vals)\n",
" \n",
" color = cm.nipy_spectral(float(i) / n_clusters)\n",
" # Ensure the y-axis range matches the length of cluster_silhouette_vals\n",
" y_range = np.arange(y_lower, y_lower + len(cluster_silhouette_vals))\n",
" ax.fill_betweenx(y_range,\n",
" 0, cluster_silhouette_vals,\n",
" facecolor=color, edgecolor=color, alpha=0.7)\n",
" \n",
" # Label the silhouette plots with their cluster numbers at the middle\n",
" ax.text(-0.05, y_lower + 0.5 * len(cluster_silhouette_vals), str(cluster))\n",
" \n",
" # Compute the new y_lower for next plot\n",
" y_lower = y_upper + 10 # 10 for the 0 samples\n",
"\n",
"# The vertical line for average silhouette score of all the values\n",
"average_score = silhouette_score(data_x, clusters)\n",
"ax.axvline(x=average_score, color=\"red\", linestyle=\"--\")\n",
"\n",
"ax.set_yticks([]) # Clear the yaxis clusters / ticks\n",
"ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n",
"plt.title('Silhouette Plot')\n",
"plt.xlabel('Silhouette Coefficient Values')\n",
"plt.ylabel('Cluster Label')\n",
"\n",
"# Add the silhouette score to the plot\n",
"plt.text(0.02, 0.95, f'Avg Silhouette Score: {average_score:.2f}', transform=ax.transAxes)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAACVUAAAJOCAYAAAC9NnWuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hU1dbA4d85UzPpkEDoCUV6EwQUadJE7IoXK1ixfKJe27WL9dqxX2yo2BGwI0gREJHee0no6X162d8fkwwZkkACgVDWyzMPM6fuUzNnzdp7a0ophRBCCCGEEEIIIYQQQgghhBBCCCGEEEIIAPTaLoAQQgghhBBCCCGEEEIIIYQQQgghhBBCnEgkqUoIIYQQQgghhBBCCCGEEEIIIYQQQgghypCkKiGEEEIIIYQQQgghhBBCCCGEEEIIIYQoQ5KqhBBCCCGEEEIIIYQQQgghhBBCCCGEEKIMSaoSQgghhBBCCCGEEEIIIYQQQgghhBBCiDIkqUoIIYQQQgghhBBCCCGEEEIIIYQQQgghypCkKiGEEEIIIYQQQgghhBBCCCGEEEIIIYQoQ5KqhBBCCCGEEEIIIYQQQgghhBBCCCGEEKIMSaoSQgghhBBCCCGEEEIIIYQQQgghhBBCiDIkqUoIIYQQJ43Ro0eTnJxc28WoNa+88grNmzfHYDDQpUuX2i5OmD///BNN0/j+++9ruygnreTkZEaPHn3c1rd7926sVisLFy48bussVdvXcq9evXjooYdqbf1CCCGEEEIIcSp7+umn0TTtiOb99NNP0TSNtLS0mi1UGWlpaWiaxqeffnrM1lERTdP4v//7vxpb3rHYjoyMDK688krq1q2LpmmMHz++xpZdE0aPHk1UVFRtF+OkVRq/+/PPP4/bOl9++WXatGlDIBA4busspWkaTz/99HFfL8CGDRswGo2sW7euVtYvhBCi5khSlRBCiBpRGvCo6PWf//znmKzz77//5umnnyY/P/+YLP9olO6PZcuWhQ0vKCigR48eWK1Wfv/990POq2kaf/31V7nxSimaNGmCpmlceOGFx6T8x1thYSHjxo2jc+fOREVFERERQYcOHXj44YfZt2/fcSvHe++9d9wDalU1c+ZMHnroIXr37s3EiRN54YUXartINerge4jVaqVhw4YMHTqUt956i6KioiNe9rG6V5QGiUtfNpuNdu3a8fjjj1NYWFij6zqcIzl3n3nmGXr27Env3r1Dw0aPHh22TTExMXTu3JnXXnsNt9tdreXv27ePp59+mlWrVlVrvpqyYcMGnn766QoD8Q8//DDvvvsu6enpx79gQgghhBBCiFoncayKrV+/nuuuu45GjRphsVho2LAh1157LevXr6/totWK07EC2X333ceMGTN45JFHmDRpEueff35tF6lG9e/fP3St67pOTEwMrVu35vrrr+ePP/44qmUfq7hicnJy2D2qXr169OnTh2nTptX4ug7lSOI8hYWFvPTSSzz88MPo+oGfpMtuj67rNGzYkCFDhhxRstdvv/1Wa4lTAF999VWFyYft2rVj+PDhPPnkk8e/UEIIIWqUsbYLIIQQ4tTyzDPPkJKSEjasQ4cOx2Rdf//9N+PGjWP06NHExcUdk3XUpMLCQoYMGcKaNWuYNm3aYYMSVquVr776inPPPTds+Lx589izZw8Wi+VYFve42bFjB4MGDWLXrl2MGDGC2267DbPZzJo1a/j444+ZNm0aW7ZsOS5lee+990hISDiurQVV1Zw5c9B1nY8//hiz2VzbxTlmSu8hXq+X9PR0/vzzT+69915ef/11fvrpJzp16lTtZR7re8X7779PVFQUxcXFzJw5k+eff545c+awcOHCatXM3bx5c1iAqTqqe+5mZWXx2Wef8dlnn5UbZ7FY+OijjwDIz89nypQpPPDAAyxdupRvvvmmymXat28f48aNIzk5uVzLah9++OExr6G4YcMGxo0bR//+/cu1inXJJZ
cQExPDe++9xzPPPHNMyyGEEEIIIYQ4cUkc64CpU6dy9dVXU6dOHW6++WZSUlJIS0vj448/5vvvv+ebb77hsssuq9KyHn/88SNOTrv++usZOXLkKRP3OtnMmTOHSy65hAceeKC2i3LMNG7cmBdffBEAu93Otm3bmDp1Kl988QVXXXUVX3zxBSaTqdrLPZZxxS5dunD//fcDwXjLhAkTuPzyy3n//fe5/fbbq7ycvn374nQ6jyi2eKg4T2U++eQTfD4fV199dblxgwcP5oYbbkApRWpqKu+99x7nnXcev/76K8OGDatyuX777TfefffdChOrnE4nRuOx/Sn8q6++Yt26ddx7773lxt1+++1ccMEFbN++nRYtWhzTcgghhDh2JKlKCCFEjRo2bBjdu3ev7WIcFbvdTmRkZI0us6ioiKFDh7Jq1SqmTp1apQfDCy64gMmTJ/PWW2+FPfx99dVXdOvWjezs7BotY23w+XxcfvnlZGRk8Oeff5ZLIHv++ed56aWXaql0NcPn8xEIBI46ESozM5OIiIhTOqEKyt9DHnnkEebMmcOFF17IxRdfzMaNG4mIiKjFEpZ35ZVXkpCQAASDJVdccQVTp07ln3/+4eyzz67yco5nwPiLL77AaDRy0UUXlRtnNBq57rrrQp/vvPNOevbsybfffsvrr79Ow4YNj3r9RxKcrEm6rnPllVfy+eefM27cuCPulkIIIYQQQghxcpM4VtD27du5/vrrad68OfPnzycxMTE07p577qFPnz5cf/31rFmzhubNmx+2LEaj8YgTGQwGAwaD4YjmFUcvMzPzhEz6q0mxsbFhcQ+A//73v4wdO5b33nuP5OTkEy4e2ahRo7Ay33DDDbRs2ZI33nijWklVuq5jtVqPRRErNHHiRC6++OIK13nGGWeEbdNll11Gp06dGD9+fLWSqg7leG5rRQYNGkR8fDyfffaZVOoTQoiTmHT/J4QQ4riaPn06ffr0ITIykujoaIYPH16uCfE1a9YwevRomjdvjtVqJSkpiZtuuomcnJzQNE8//TQPPvggACkpKaHmgtPS0khLS0PTtAqbWz64H/XS7rs2bNjANddcQ3x8fFhizxdffEG3bt2IiIigTp06jBw5kt27d1drm4uLizn//PNZsWIFU6ZMYfjw4VWa7+qrryYnJyes6WmPx8P333/PNddcU+E8gUCA8ePH0759e6xWK/Xr12fMmDHk5eWFTffjjz8yfPhwGjZsiMVioUWLFjz77LP4/f6w6fr370+HDh3YsGEDAwYMwGaz0ahRI15++eVy63777bdp3749NpuN+Ph4unfvzldffXXIbZwyZQqrV6/mscceK5dQBRATE8Pzzz9f6fylzbAf3DR0RedAeno6N954I40bN8ZisdCgQQMuueSSUNdgycnJrF+/nnnz5oXOp/79+4fmz8/P595776VJkyZYLBZatmzJSy+9FNbaTul6X331VcaPH0+LFi2wWCxs2LCh0m3w+Xw8++yzoWmTk5N59NFHw7pa0zSNiRMnYrfbQ2U7XHPiixcv5vzzzyc2NhabzUa/fv1YuHBh2DQ7d+7kzjvvpHXr1kRERFC3bl1GjBhRYXdp+fn53HfffSQnJ2OxWGjcuDE33HBDueS+QCDA888/T+PGjbFarQwcOJBt27YdsqyHc9555/HEE0+wc+dOvvjii9Dwo71XQDC4c95551GvXj0sFgvt2rXj/fffP+ryAqSmpgLBoPL9998fOndat27Nq6++ilIqbL7k5OSw2oyl3VEsXLiQf//73yQmJhIZGclll11GVlZW2HyHOncr8sMPP9CzZ0+ioqIOuz26roeWl5aWRm5uLg888AAdO3YkKiqKmJgYhg0bxurVq0Pz/Pnnn5x11lkA3HjjjeXO29GjR5drPaqq96/k5GQuvPBC/vrrr1B3qs2bN+fzzz8P23cjRowAYMCAAaH1l71XDB48mJ07d9Za94RCCCGEEEKIE9/pEsd65ZVXcD
gcfPDBB2EJVQAJCQlMmDABu90eFg86VFlKx5XldDoZO3YsCQkJREdHc/HFF7N3795y21j6LFw2NlGV50CgSs+rx8K
"text/plain": [
"<Figure size 2400x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create a figure with two subplots side by side\n",
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 6))\n",
"\n",
"# First subplot for clusters\n",
"plot_data_clusters = pd.DataFrame(data_x.copy())\n",
"plot_data_clusters['Cluster'] = clusters\n",
"new_columns_clusters = plot_data_clusters.columns.tolist()\n",
"new_columns_clusters[0:10] = feature_names\n",
"plot_data_clusters.columns = new_columns_clusters\n",
"parallel_coordinates(plot_data_clusters, 'Cluster', colormap='viridis', ax=axes[0])\n",
"axes[0].set_title('Feature K-Means Cluster of each Data Point (Patient)')\n",
"axes[0].set_xlabel('Feature')\n",
"axes[0].set_ylabel('Feature Value')\n",
"axes[0].tick_params(axis='x', rotation=90)\n",
"\n",
"# Second subplot for labels\n",
"plot_data_labels = pd.DataFrame(data_x.copy())\n",
"label_data = data_y.reset_index(drop=True)\n",
"plot_data_labels['Cluster'] = label_data\n",
"\n",
"new_columns_labels = plot_data_labels.columns.tolist()\n",
"new_columns_labels[0:10] = feature_names\n",
"plot_data_labels.columns = new_columns_labels\n",
"parallel_coordinates(plot_data_labels, 'Cluster', colormap='viridis', ax=axes[1])\n",
"axes[1].set_title('Feature Original Label of each Data Point (Patient)')\n",
"axes[1].set_xlabel('Feature')\n",
"axes[1].set_ylabel('Feature Value')\n",
"axes[1].tick_params(axis='x', rotation=90)\n",
"# set the legend\n",
"axes[1].legend(loc='upper right', labels=label_names)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}