DSA_SS24/notebooks/cluster_features.ipynb

486 lines
4.3 MiB
Plaintext
Raw Normal View History

2024-06-24 20:27:46 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import sqlite3\n",
"\n",
"import matplotlib.cm as cm\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"from pandas.plotting import parallel_coordinates\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.metrics import adjusted_rand_score, confusion_matrix, normalized_mutual_info_score, silhouette_samples, silhouette_score\n",
"from sklearn.metrics.pairwise import euclidean_distances\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Load the pre-split feature tables from the SQLite feature database.\n",
"# try/finally guarantees the connection is closed even if one of the\n",
"# queries raises (e.g. a missing table), so no open handle is leaked.\n",
"conn = sqlite3.connect('../features.db')\n",
"try:\n",
"    train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n",
"    valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n",
"    test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n",
"finally:\n",
"    conn.close()"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature names: ['age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
"Label names: ['GSVT', 'AFIB', 'SR', 'SB']\n"
]
}
],
"source": [
"# The first two columns of `train` are the non-feature columns (`id` and `y`);\n",
"# everything after them is a feature.\n",
"feature_names = train.columns[2:]\n",
"print('Feature names:', [*feature_names])\n",
"\n",
"# The diagnosis label names are the keys of the 'labels' entry in the settings file.\n",
"with open('../settings.json', 'r') as f:\n",
"    settings = json.load(f)\n",
"label_names = [*settings['labels'].keys()]\n",
"print('Label names:', label_names)"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train_x shape: (4378, 10)\n",
"features: ['id', 'age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
"number of classes: 4\n"
]
}
],
"source": [
"# Integer encoding of the diagnosis labels. The order matches `label_names`\n",
"# (used for the plot legends), so class i is always label_names[i].\n",
"LABEL_TO_ID = {'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3}\n",
"\n",
"\n",
"def split_target(frame):\n",
"    \"\"\"Split a raw table into (features, integer-encoded target).\n",
"\n",
"    Returns the frame without its 'y' column and the 'y' column mapped\n",
"    through LABEL_TO_ID. Labels missing from the mapping become NaN.\n",
"    \"\"\"\n",
"    return frame.drop(columns=['y']), frame['y'].map(LABEL_TO_ID)\n",
"\n",
"\n",
"train_x, train_y = split_target(train)\n",
"valid_x, valid_y = split_target(valid)\n",
"test_x, test_y = split_target(test)\n",
"\n",
"# Pool the train, validation and test splits into one data set for clustering.\n",
"# The patient id is an identifier, not a feature, so it is dropped.\n",
"data_x = pd.concat([train_x, valid_x, test_x], axis=0).drop(columns=['id'])\n",
"data_y = pd.concat([train_y, valid_y, test_y], axis=0)\n",
"# .map() silently yields NaN for unknown labels; fail loudly instead.\n",
"assert data_y.notna().all(), 'found diagnosis labels missing from LABEL_TO_ID'\n",
"\n",
"feature_columns = data_x.columns.to_list()\n",
"print('data_x shape:', data_x.shape)\n",
"\n",
"# Fill missing values with the column mean, then scale every feature to [0, 1].\n",
"imputer = SimpleImputer(strategy='mean')\n",
"scaler = MinMaxScaler()\n",
"# The transformers return plain arrays; wrap the result in a DataFrame again.\n",
"# Its columns are positional (0, 1, ...) in the order of `feature_columns`.\n",
"data_x = pd.DataFrame(scaler.fit_transform(imputer.fit_transform(data_x)))\n",
"\n",
"print('features:', feature_columns)\n",
"# Count the classes over the pooled labels, not over a single split.\n",
"num_classes = data_y.nunique()\n",
"print('number of classes:', num_classes)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqUAAAHHCAYAAACGDCH+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOMElEQVR4nOzdd1hT1xsH8G/CCBsEGSJDRByoOFARt6Ki4t7WgXsUtYq11rq1FVfdq46qddQ9Wq0DR7Uq7oV7S1WWIkNk5/7+4EdqBBQ0cJPw/fjkecy5JzfvHSRvzrnnXIkgCAKIiIiIiEQkFTsAIiIiIiImpUREREQkOialRERERCQ6JqVEREREJDompUREREQkOialRERERCQ6JqVEREREJDompUREREQkOialRERERCQ6jUtKHzx4gObNm8Pc3BwSiQR79+5V6fqfPn0KiUSC9evXq3S9mqxRo0Zo1KhRob9veno6vvvuOzg6OkIqlaJ9+/ZfvM6///4bEokEf//99xevK8v69eshkUjw9OlTRdmH+6ygzqupU6dCIpGodJ05ydrGS5cuFfh7Eb2vsM7xoqog969Y3x0FjXlCwfmspPTRo0cYMmQISpcuDQMDA5iZmaFu3bpYtGgRkpKSVB2jEn9/f4SGhuKnn37Cxo0bUaNGjQJ9v8LUt29fSCQSmJmZ5bgfHzx4AIlEAolEgnnz5uV7/S9fvsTUqVNx7do1FURb8H799VfMnTsXnTt3xoYNGzB69Ohc68rlcvz222/w8vKCpaUlTE1NUbZsWfTp0wfnzp0rxKjFN3PmTJX/WCsIf/31F6ZOnSp2GAUm64vrw79VQRAwZMgQSCQSxfZn/ViSSCTYtGlTjuurW7cuJBIJKlWqVNChF4rk5GQsWLAAXl5eMDc3h4GBAcqWLYvhw4fj/v37hRbHli1bsHDhwkJ7P02V9f2U9TAxMUHp0qXRuXNn7Nq1C3K5XOwQNdqNGzfQr18/uLi4wMDAACYmJqhatSq+++47PH78WOzwCo+QT/v37xcMDQ0FCwsLYeTIkcKqVauEpUuXCt27dxf09PSEQYMG5XeVefbu3TsBgDBhwoQCew+5XC4kJSUJ6enpBfYeufH39xd0dXUFHR0dYdu2bdmWT5kyRTAwMBAACHPnzs33+i9evCgAENatW5ev16WkpAgpKSn5fr8v1a1bN6FkyZJ5qhsQECAAENq1aycsWrRIWLZsmTBy5EjBzc1NmDJliqJeRkaGkJSUJGRkZKgszvT0dCEpKUmQy+WKsoYNGwoNGzZUPC+o8yotLU1ISkpSKjM2Nhb8/f1V+j7r1q0TAAgXL15U2Tqzjpm2evLkSba/VblcLgwdOlQAIEyaNElRfuLECQGAYGBgILRs2TLXdRkYGAgVK1YslPgLUnR0tODp6SkAEFq3bi0sXLhQWLNmjTB27FjB0dFR0NPTU9SdMmVKgZ4nfn5+grOzc4GtX93ldf/6+/sLMplM2Lhxo7Bx40Zh1apVwoQJEwQPDw8BgNCoUSMhLi5O6TVifXcUNFV/nq9atUrQ0dERbG1thcDAQGHVqlXC8uXLha+//lqwtbUV9PT0RMlJxKCbnwT2yZMn6N69O5ydnXH8+HGUKFFCsSwgIAAPHz7EgQMHVJAq5yw6OhoAYGFhUWDvIZFIYGBgUGDr/xSZTIa6devi999/R9euXZWWbdmyBX5+fti1a1ehxPLu3TsYGRlBX1+/UN7vQ1FRUXk61pGRkVi+fDkGDRqEVatWKS1buHCh4rwBAKlUqvLjq6OjAx0dnY/WUfV5lZiYCGNjY+jq6kJXN19/xiSiESNGYOXKlZgwYQKmT5+ebXmrVq3wxx9/4NWrVyhevLiifMuWLbC1tYWbmxvevHlTmCEXiL59++Lq1avYuXMnOnXqpLRsxowZmDBhgkiRqYZcLkdqaqqo3yUFQVdXF7169VIq+/HHHzFr1iyMHz8egwYNwrZt2xTLxPruKG
iq/Dw/e/Yshg0bhrp162L//v0wNTVVWv7zzz/jp59++uR6sr6vNV5+MtisX/hnzpzJU/20tDRh+vTpQunSpQV9fX3B2dlZGD9+vJCcnKxUz9nZWfDz8xP++ecfoWbNmoJMJhNcXFyEDRs2KOpk/Zp7/5H169bf3z/HX7o5/QI8cuSIULduXcHc3FwwNjYWypYtK4wfP16xPKtF4sPWxGPHjgn16tUTjIyMBHNzc6Ft27bC7du3c3y/Bw8eCP7+/oK5ublgZmYm9O3bV0hMTPzk/vL39xeMjY2F9evXCzKZTHjz5o1i2YULFwQAwq5du7K1vrx+/VoYM2aMUKlSJcHY2FgwNTUVWrRoIVy7dk1RJ6sl5sNH1nY2bNhQqFixonDp0iWhfv36gqGhofDNN98olr3f6tenTx9BJpNl2/7mzZsLFhYWwosXLz66nW/fvhUCAwMFBwcHQV9fXyhbtqwwd+5cRUtj1jH48HHixIkc1xcSEiIAENavX/+JPfzffnh/XVnbfv36daFBgwaCoaGh4OrqKuzYsUMQBEH4+++/hVq1agkGBgZC2bJlheDgYKV1ZrUiPnnyRGmd7++znM6r69evC/7+/oKLi4sgk8kEW1tboV+/fsKrV6+U1p91Xt26dUvo0aOHYGFhIVStWlVpWZac9pu/v79w/PhxAYCwe/fubPtk8+bNAgDh7Nmzue63rG08efKkMHjwYMHS0lIwNTUVevfuLcTExGSr/9dffyn+XkxMTIRWrVoJN2/eVCz39/fPMVZBEIRq1aoJHTp0UFpfpUqVBADC9evXFWVbt24VACidh8+fPxf69esn2NjYCPr6+oK7u7uwdu3abPElJycLkydPFlxdXQV9fX3BwcFBGDt2bLbPJgBCQECAsGfPHqFixYqKdR48eDDXfZXlw5bSkSNHCgCUPm+yZJ2XGzZsEIyNjYXly5crLa9YsaIwYsQIxbn6oY0bNwrVq1cXDAwMhGLFigndunUTwsLClOqcOnVK6Ny5s+Do6KjY5lGjRgnv3r1Tqpf1OfT8+XOhXbt2grGxsVC8eHFhzJgx2Vprfv/9d6F69eqCiYmJYGpqKlSqVElYuHDhR/fLuXPnBAB57lX78BzP7TNaEDKP1/s9I/Hx8cI333wjODs7C/r6+oK1tbXQtGlT4fLly4IgZP6d5va9Igj5P082bdokuLu7C7q6usKePXs+ex8JgiDMnTtX8Pb2FiwtLQUDAwOhevXqis+knN47L+foP//8I9SoUUOQyWRC6dKlhZUrV+arpdTY2DjX5c2bNxckEolw7949RdmHn4MpKSnCpEmThOrVqwtmZmaCkZGRUK9ePeH48ePZ1vfq1SuhV69egqmpqWBubi706dNHuHbtWrZjn5/z9VPfPVk+J08IDw8X+vbtK5QsWVLQ19cX7OzshLZt2yp9L+S233R1dYV///33o/Xe97Hv68jISKF///6CjY2NIJPJBA8Pj2zfjTl9D+a2XVn799GjR0Lz5s0FIyMjoUSJEsK0adOy7bfPPdffl68mlj///BOlS5dGnTp18lR/4MCB2LBhAzp37owxY8bg/PnzCAoKwp07d7Bnzx6lug8fPkTnzp0xYMAA+Pv749dff0Xfvn3h6emJihUromPHjrCwsMDo0aPRo0cPtGrVCiYmJvkJH7du3ULr1q3h4eGB6dOnQyaT4eHDhzhz5sxHX3f06FG0bNkSpUuXxtSpU5GUlIQlS5agbt26uHLlCkqVKqVUv2vXrnBxcUFQUBCuXLmCNWvWwMbGBrNnz85TnB07dsTQoUOxe/du9O/fH0BmS0n58uVRvXr1bPUfP36MvXv3okuXLnBxcUFkZCR++eUXNGzYELdv34a9vT0qVKiA6dOnY/LkyRg8eDDq168PAErH8vXr12jZsiW6d++OXr16wdbWNsf4Fi1ahOPHj8Pf3x8hISHQ0dHBL7/8giNHjmDjxo2wt7fPddsEQUDbtm1x4sQJDBgwAFWrVs
Xhw4cxduxYvHjxAgsWLIC1tTU2btyIn376CW/fvkVQUBAAoEKFCjmu09nZGQCwY8cOdOnS5bN+Lb558watW7dG9+7
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Cluster the pooled data into as many groups as there are diagnosis classes.\n",
"kmeans = KMeans(n_clusters=num_classes, random_state=42)\n",
"clusters = kmeans.fit_predict(data_x)\n",
"\n",
"# Cross-tabulate diagnosis labels (rows) against cluster ids (columns).\n",
"# Cluster ids are arbitrary, so a good match need not lie on the diagonal.\n",
"corr_ma = confusion_matrix(data_y, clusters)\n",
"\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(corr_ma, annot=True, fmt='d', cmap='Blues', yticklabels=label_names, ax=ax)\n",
"ax.set_xlabel('KMeans Clusters')\n",
"ax.set_ylabel('Diagnosis Group Labels')\n",
"ax.set_title('Confusion Matrix of Similarity between KMeans Clusters and Diagnosis Groups')\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cluster 0: 1464 patients\n",
"Cluster 1: 1772 patients\n",
"Cluster 2: 579 patients\n",
"Cluster 3: 563 patients\n"
]
}
],
"source": [
"# Number of patients assigned to each cluster. `minlength` keeps a cluster\n",
"# that received no patients in the table with a count of 0.\n",
"cluster_sizes = np.bincount(clusters, minlength=kmeans.n_clusters)\n",
"cluster_patient_count = dict(enumerate(cluster_sizes.tolist()))\n",
"\n",
"# Print the number of patients in each cluster\n",
"for cluster, count in cluster_patient_count.items():\n",
"    print(f\"Cluster {cluster}: {count} patients\")"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABmkAAAJwCAYAAABicYUDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hUVfrA8e+UTEnPJCGNkEroNfQqAgYULAu4qFQBRRAWFRXcVaTJsgJWEAsKKj8soKiAVEGkKM0A0ktCSyVh0qff3x9sZokJEBAI5f08zzyae8899z13ZvS+c+45R6UoioIQQgghhBBCCCGEEEIIIYS4odRVHYAQQgghhBBCCCGEEEIIIcSdSDpphBBCCCGEEEIIIYQQQgghqoB00gghhBBCCCGEEEIIIYQQQlQB6aQRQgghhBBCCCGEEEIIIYSoAtJJI4QQQgghhBBCCCGEEEIIUQWkk0YIIYQQQgghhBBCCCGEEKIKSCeNEEIIIYQQQgghhBBCCCFEFZBOGiGEEEIIIYQQQgghhBBCiCognTRCCCGEEEIIIYQQQgghhBBVQDpphBB3lPnz56NSqUhNTa3qUK4blUrF008/fc3qS01NRaVSMX/+/GtWZ2Vs376dNm3a4OXlhUqlIjk5+YaeX/zPoEGDiI6OrpJzR0dHM2jQoCo5txBCCCHE7ep2yIvuuusu7rrrrqoO46aVmZlJ7969CQwMRKVS8eabb1Z1SHesqvy+VWUuJ4SoPOmkEULcFEpvWkpfBoOBhIQEnn76aTIzM6+4vtdee42lS5de+0Arobi4mFdffZUNGzZUqvyGDRtQqVQsXrz4+gZ2C7Hb7fTp04fc3FzeeOMNPvvsM6KiopgzZ84N7yyKjo4u89msVq0a7du359tvv62w/Lfffkv37t0JCgpCp9MRHh7Oww8/zE8//VRh+RUrVqBSqQgPD8flclU6rkGDBpWJS6/Xk5CQwCuvvILFYrmqtlaVLVu28Oqrr2I2m6s6FCGEEEKIKnUn50WlMjMzGTt2LLVr18bT0xMvLy8SExOZMmXKDb1frMprdy0888wzrFq1ivHjx/PZZ5/RrVs3VqxYwauvvnpD4/hz3uLr60ujRo2YOXMmVqu1XPnk5GT69etHZGQker0ek8lEly5d+OSTT3A6neXKm81mDAYDKpWKAwcOVDquP3/XtFotERERDBo0iDNnzvylNt9oaWlpvPrqq/JgoxC3MG1VByCEEBeaNGkSMTExWCwWNm3axHvvvceKFSv4448/8PT0rHQ9r732Gr179+bBBx8ss71///707dsXvV5/jSP/n+LiYiZOnAggT3ZdpWPHjnHixAk+/PBDhg4d6t4+Z84cgoKCbvjIisaNG/Pcc88B52+A33//ff72t7/x3nvvMXz4cAAUReHxxx9n/vz5NGnShGeffZbQ0FDS09P59ttv6dy5M5s3b6ZNmzZl6l64cCHR0dGkpqby008/0aVLl0rHpdfr+eijjwDIy8vju+++Y/LkyRw7doyFCxdeo9Zff1u2bGHixIkMGjQIf3//MvsOHTqEWi3PlAghhBDiznKn5kXbt2/n3nvvpbCwkH79+pGYmAjAjh07+Pe//83GjRtZvXr19Qq5jItdu1vFTz/9xAMPPMDYsWPd2959911mz559wztqLsxbzGYzS5YsYezYsWzfvp0vvvjCXe6jjz5i+PDhhISE0L9/f2rWrElBQQHr1q1jyJAhpKen89JLL5Wp++uvv0alUhEaGsrChQuZMmXKFcV24Xft119/Zf78+WzatIk//vgDg8Hw1xt/A6SlpTFx4kSio6Np3LhxmX0ffvjhFT0MKISoGtJJI4S4qXTv3p1mzZoBMHToUAIDA5k1axbfffcdjzzyyF+uX6PRoNFo/nI94vrKysoCKPeD/fXgcDhwuVzodLqLlomIiKBfv37uvwcMGEB8fDxvvPGGu5Nm5syZzJ8/nzFjxjBr1ixUKpW7/D
//+U8+++wztNqy/9stKiriu+++Y9q0aXzyyScsXLjwijpptFptmbhGjBhBmzZtWLRoEbNmzSIkJKTSdd2srucPB0IIIYQQN6s7MS8ym8089NBDaDQafv/9d2rXrl1m/9SpU/nwww+rKLprw2KxoNPpbshDSFlZWTckn1IUBYvFgtFovGiZivKWli1b8uWXXzJr1izCw8P59ddfGT58OK1bt2bFihX4+Pi4y48ZM4YdO3bwxx9/lKv7888/59577yUqKor/+7//u+JOmj9/14KCgpg+fTrff/89Dz/88BXVdTPy8PCo6hCEEJUgj6YKIW5qd999NwApKSkAzJgxgzZt2hAYGIjRaCQxMbHcNGEqlYqioiIWLFjgHrpcOvLiYnPB/vjjj7Rv3x4vLy98fHy477772LdvX5kygwYNwtvbmzNnzvDggw/i7e1NcHAwY8eOdQ+7Tk1NJTg4GICJEye6z38tnlSqTNsvtHDhQmrVqoXBYCAxMZGNGzeWK3PmzBkef/xxQkJC0Ov11KtXj48//viysWRkZDB48GCqV6+OXq8nLCyMBx544LJz7O7Zs4dBgwYRGxuLwWAgNDSUxx9/nJycHHeZQYMG0bFjRwD69OmDSqXirrvuIjo6mn379vHzzz+7r+uFT+SZzWbGjBnjHhYfHx/P9OnTyzw1VLq+zowZM3jzzTeJi4tDr9ezf//+y7b5QqGhodSpU8f9uSwpKWHatGnUrl2bGTNmlOmgKdW/f39atGhRZtu3335LSUkJffr0oW/fvnzzzTd/aaoylUpFu3btUBSF48ePl9lXmc84wNKlS6lfvz4Gg4H69etXOK1b6RR9f5664mLrFx08eJCHH36Y4OBgjEYjtWrV4p///CcAr776Ks8//zwAMTEx7ve29LNU0Zo0x48fp0+fPphMJjw9PWnVqhXLly+vMMavvvqKqVOnUr16dQwGA507d+bo0aOXu5RCCCGEEDeVOyEvev/99zlz5gyzZs0q10EDEBISwr/+9a+LHn+xNlV073rkyBF69epFaGgoBoOB6tWr07dvX/Ly8i577aByeVTpeb/44gv+9a9/ERERgaenJ/n5+djtdiZOnEjNmjUxGAwEBgbSrl071qxZc9H2AeTm5jJ27FgaNGiAt7c3vr6+dO/end27d5e7DoqiMHv27DLxz549292+0lcpl8vFm2++Sb169TAYDISEhPDkk09y7ty5MjFER0fTo0cPVq1aRbNmzTAajbz//vuXjPvP1Gq1O5crfb9KPycLFy4s00FTqlmzZuXygpMnT/LLL7/Qt29f+vbtS0pKClu2bLmiWP6sffv2wPnZHS508OBBevfujclkwmAw0KxZM77//vtyx+/bt4+7774bo9FI9erVmTJlSoUjWS72fago/zGbzTzzzDNER0ej1+upXr06AwYM4OzZs2zYsIHmzZsDMHjwYPf7WpqTVbQmTVFREc8995w7d65VqxYzZsxAUZRyMT799NPuHLH0s75y5cpLXUIhxFWQkTRCiJta6Y1RYGAgAG+99Rb3338/jz32GDabjS+++II+ffqwbNky7rvvPgA+++wzhg4dSosWLXjiiScAiIuLu+g5PvvsMwYOHEhSUhLTp0+nuLiY9957j3bt2vH777+XuaFxOp0kJSXRsmVLZsyYwdq1a5k5cyZxcXE89dRTBAcH89577/HUU0/x0EMP8be//Q2Ahg0b/uVrUZm2l/r555/58ssvGT16NHq9njlz5tCtWze2bdtG/fr1gfNzPbdq1cp94xUcHMyPP/7IkCFDyM/PZ8yYMReNpVevXuzbt49Ro0YRHR1NVlYWa9as4eTJk5dclHDNmjUcP36cwYMHExoayr59+/jggw/Yt28fv/76KyqViieffJKIiAhee+01Ro8eTfPmzQkJCaGoqIhRo0bh7e3t/oG/dKRIcXExHTt25MyZMzz55JPUqFGDLVu2MH78eNLT08stkvnJJ59gsVh44okn3PMcXw
m73c6pU6fcn8tNmzaRm5vLmDFjruiJxIULF9KpUydCQ0Pp27cv48aN44cffqBPnz5XFM+FSpOcgIAA97bKfsZXr15
"text/plain": [
"<Figure size 2000x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Project the scaled features onto their first two principal components.\n",
"# random_state pins the projection in case scikit-learn selects its\n",
"# randomized SVD solver for data of this size.\n",
"pca = PCA(n_components=2, random_state=42)\n",
"reduced_data = pca.fit_transform(data_x)\n",
"\n",
"# Boolean masks selecting the rows of every diagnosis label / KMeans cluster.\n",
"label_ids = data_y.to_numpy()\n",
"label_groups = [(f'Label {label_names[i]}', label_ids == i) for i in range(data_y.nunique())]\n",
"cluster_groups = [(f'Cluster {i}', clusters == i) for i in range(kmeans.n_clusters)]\n",
"\n",
"# Left panel: points grouped by diagnosis label; right panel: by KMeans cluster.\n",
"fig, axs = plt.subplots(1, 2, figsize=(20, 7))\n",
"panels = [\n",
"    (axs[0], 'Patient Labels after PCA Reduction', label_groups),\n",
"    (axs[1], 'Patient Clusters after PCA Reduction', cluster_groups),\n",
"]\n",
"for ax, title, groups in panels:\n",
"    for group_name, mask in groups:\n",
"        cluster_data = reduced_data[mask]\n",
"        ax.scatter(cluster_data[:, 0], cluster_data[:, 1], label=group_name, alpha=0.7, edgecolors='w')\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel('PCA 1')\n",
"    ax.set_ylabel('PCA 2')\n",
"    ax.legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABkUAAAKyCAYAAABrMNniAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxTVfo/8M9Nk6ZplqalLS2llJa2bAJKoSxVkREsLjjOAA7jAiigLKI44oLfcQFBhlFwXFB0RFHH0RlF0XFjcXAE1EFUStkKLbTQhbVJ9+z39we/eydp0jZtk66f9+vVl5Lc3Htys53nPuc8RxBFUQQREREREREREREREVEXp2jvBhAREREREREREREREbUFJkWIiIiIiIiIiIiIiKhbYFKEiIiIiIiIiIiIiIi6BSZFiIiIiIiIiIiIiIioW2BShIiIiIiIiIiIiIiIugUmRYiIiIiIiIiIiIiIqFtgUoSIiIiIiIiIiIiIiLoFJkWIiIiIiIiIiIiIiKhbYFKEiIiIiIiIiIiIiIi6BSZFqFPZuHEjBEFAYWFhezelxa666ipcddVV7d2MDuvMmTOYOnUqevToAUEQ8Je//KW9m9RttefnbdasWejbt2+bH5eIiIiIuj7GlV0f48qOg3Fl6/Xt2xc33HBDQPcpCAKefPLJgO6TqDNhUoQCQvqRk/7CwsKQnp6Oe+65B2fOnGn2/p5++mls3rw58A31Q21tLZ588kl88803zXrcmTNnsGTJEgwYMADh4eHQarXIyMjAihUrYDabg9JWX9rz3AXC/fffjy1btmDp0qV45513MGnSJHzxxRdt/mM9a9Ysj/e0wWDAsGHDsGbNGlitVq/t9+3bh9tuuw2JiYlQq9WIiorChAkT8Oabb8LpdHptbzabERYWBkEQcPjwYb/bVf+zplQqkZCQgFmzZqGkpKRVz7mtlZaW4sknn8S+ffvauymywsJCCIKAZ5991uN2URRx9913s+NIREREFESMKxlXBgrjysYxrgyuhuJKIuo4lO3dAOpali9fjuTkZFgsFuzatQuvvPIKvvjiCxw4cADh4eF+7+fpp5/G1KlTcdNNN3ncfvvtt2P69OlQq9UBbvn/1NbWYtmyZQDg98ibH3/8Eddddx2qq6tx2223ISMjAwCwd+9e/OlPf8K3336LrVu3BqvJHho6d53Fv//9b/z617/GkiVL5NteeuklrFu3rs07sGq1Gq+//jqAi53NTZs2YcmSJfjxxx/x/vvvy9u9/vrrmDdvHnr27Inbb78daWlpqKqqwtdff43Zs2ejrKwMjz76qMe+P/jgAwiCgLi4OLz77rtYsWJFs9rm/ln74YcfsHHjRuzatQsHDhxAWFhY6598GygtLcWyZcvQt29fXHrppR73/fWvf4XL5WqfhtUjiiIWLFiA1157DY899hiTIkRERERBxriScWVrMa70D+NKIuqumBShgLr22msxYsQIAMCcOXPQo0cPrF27Fp988gl+//vft3r/ISEhCAkJafV+AslsNuM3v/kNQkJC8Msvv2DAgAEe969cuRJ//etf26l1gWGxWBAaGgqFIviTy86ePQuj0Rj044iiCIvFAo1G0+A2SqUSt912m/zvBQsWYNSoUfjHP/6BtWvXolevXvjhhx8wb948jBkzBl988QX0er28/eLFi7F3714cOHDAa99/+9vfcN111yEpKQl///vfm915rf9Zi46OxurVq/Hpp5/i5ptvbta+OiKVStXeTZAtWrQI69evx//93/9h+fLl7d0cIiIioi6PcSXjytZiXOkfxpVE1F2xfBYF1a9+9SsAwIkTJwAAzz77LMaOHYsePXpAo9EgIyMDH374ocdjBEFATU0N3nrrLXkq56xZswA0XIvyyy+/xBVXXAGtVgu9Xo/rr78eBw8e9Nhm1qxZ0Ol0KCkpwU033QSdToeYmBgsWbJEnoZaWFiImJgYAMCyZcvk4zc2kuTVV1
9FSUkJ1q5d69VxBYCePXvij3/8Y4OPb+g5ffPNNxAEwWO69bFjxzBlyhTExcUhLCwMvXv3xvTp01FRUdHkuQOAkpIS3HnnnejZsyfUajUGDx6MN954w+dx33//ffzxj39EQkICwsPDUVlZCbvdjmXLliEtLQ1hYWHo0aMHLr/8cmzbtq3B5wcA5eXlWLJkCYYMGQKdTgeDwYBrr70WOTk5XudBFEWsW7fOo/3r1q2Tn5/0J3G5XPjLX/6CwYMHIywsDD179sTdd98Nk8nk0QapBueWLVswYsQIaDQavPrqq422uz6FQiGP8pJeL+l98u6773p0XCUjRozweA0A4OTJk9i5cyemT5+O6dOn48SJE/juu++a1Zb6rrjiCgBAQUGBx+1HjhzB1KlTERUVhbCwMIwYMQKffvqp1+MPHjyIX/3qV9BoNOjduzdWrFjhc0RNQ5+Hvn37ej1Ps9mM+++/H3379oVarUbv3r0xY8YMnD9/Ht988w1GjhwJALjjjjvk13Xjxo0AfNd+rampwQMPPCBPJe/fvz+effZZiKLo1cZ77rkHmzdvxiWXXCK/17/66qvGTqFP9913H9atW4elS5c2GWBIn51//vOfWLZsGRISEqDX6zF16lRUVFTAarVi8eLFiI2NhU6nwx133OFzyvzf/vY3ZGRkQKPRICoqCtOnT8epU6c8ttm5cyemTZuGPn36QK1WIzExEffffz/q6uo8tvPne0/y/vvvIyMjA3q9HgaDAUOGDMHzzz/f7HNGREREFGiMKxlXAowrGVd23riyIW+++SZ+9atfITY2Fmq1GoMGDcIrr7zS4PZbt27FpZdeirCwMAwaNAgfffSR1zZmsxmLFy+Wn19qaipWr17d5IyZqqoqLF68WD7PsbGxmDhxIn7++edWP0+ijogzRSiopB/SHj16AACef/553Hjjjbj11lths9nw/vvvY9q0afjss89w/fXXAwDeeecdzJkzB5mZmbjrrrsAAP369WvwGO+88w5mzpyJ7OxsrF69GrW1tXjllVdw+eWX45dffvH4AXQ6ncjOzsaoUaPw7LPPYvv27VizZg369euH+fPnIyYmBq+88grmz5+P3/zmN/jtb38LABg6dGiDx//000+h0WgwderUVp2rpthsNmRnZ8NqtWLRokWIi4tDSUkJPvvsM5jNZkRERDR67s6cOYPRo0fLP+wxMTH48ssvMXv2bFRWVmLx4sUex3vqqacQGhqKJUuWwGq1IjQ0FE8++SRWrVolH6OyshJ79+7Fzz//jIkTJzbY9uPHj2Pz5s2YNm0akpOTcebMGbz66qsYN24cDh06hF69euHKK6/EO++8g9tvvx0TJ07EjBkz5PaXlpZi27ZteOedd7z2fffdd2Pjxo244447cO+99+LEiRN46aWX8Msvv2D37t0eI0Py8vLw+9//HnfffTfmzp2L/v37N/t1cH9P19bW4uuvv8aVV16JPn36+L2P9957D1qtFjfccAM0Gg369euHd999F2PHjm12eyRSZzoyMlK+7eDBg8jKykJCQgIeeeQRaLVa/POf/8RNN92ETZs24Te/+Q0A4PTp0xg/fjwcDoe83WuvvdboaKemVFdX44orrsDhw4dx5513Yvjw4Th//jw+/fRTFBcXY+DAgVi+fDkef/xx3HXXXXLnu6FzIIoibrzxRuzYsQOzZ8/GpZdeii1btuDBBx9ESUkJnnvuOY/td+3ahY8++ggLFiyAXq/HCy+8gClTpuDkyZPy91FT7r//frzwwgt4+OGH8fTTT/v93FetWgWNRoNHHnkE+fn5ePHFF6FSqaBQKGAymfDkk0/KU9OTk5Px+OOPy49duXIlHnvsMdx8882YM2cOzp07hxdffBFXXnklfvnlF3m02wcffIDa2lrMnz8fPXr0wJ49e/Diiy+iuLgYH3zwgUd7mvreA4Bt27bh97//Pa6++mqsXr0aAHD48GHs3r0b9913n9/PnYiIiCgYGFcGDu
NKxpWNYVwZ+LiyMa+88goGDx6MG2+8EUqlEv/617+wYMECuFwuLFy40GPbY8eO4Xe/+x3mzZuHmTNn4s0338S0adP
"text/plain": [
"<Figure size 2000x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same comparison as the 2-D plot, using the first three principal components.\n",
"# NOTE: this overwrites `pca` and `reduced_data` from the 2-D cell.\n",
"# random_state pins the projection in case scikit-learn selects its\n",
"# randomized SVD solver for data of this size.\n",
"pca = PCA(n_components=3, random_state=42)\n",
"reduced_data = pca.fit_transform(data_x)\n",
"\n",
"# Boolean masks selecting the rows of every KMeans cluster / diagnosis label.\n",
"label_ids = data_y.to_numpy()\n",
"cluster_groups = [(f'Cluster {i}', clusters == i) for i in range(kmeans.n_clusters)]\n",
"label_groups = [(f'Label {label_names[i]}', label_ids == i) for i in range(data_y.nunique())]\n",
"\n",
"fig = plt.figure(figsize=(20, 7))\n",
"ax1 = fig.add_subplot(121, projection='3d')\n",
"ax2 = fig.add_subplot(122, projection='3d')\n",
"\n",
"# Left panel: points grouped by KMeans cluster; right panel: by diagnosis label.\n",
"panels = [\n",
"    (ax1, 'Patient Clusters after PCA Reduction K-means', cluster_groups),\n",
"    (ax2, 'Patient Clusters after PCA Reduction Labels', label_groups),\n",
"]\n",
"for ax, title, groups in panels:\n",
"    for group_name, mask in groups:\n",
"        cluster_data = reduced_data[mask]\n",
"        ax.scatter(cluster_data[:, 0], cluster_data[:, 1], cluster_data[:, 2], label=group_name, alpha=0.7, edgecolors='w')\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel('PCA 1')\n",
"    ax.set_ylabel('PCA 2')\n",
"    ax.set_zlabel('PCA 3')\n",
"    ax.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Adjusted Rand Index (ARI): 0.15412707550646423\n",
"Normalized Mutual Information (NMI): 0.24282003848756695\n",
"Silhouette Score: 0.46722973644820026\n"
]
}
],
"source": [
"# External validation: agreement between the clustering and the diagnosis labels.\n",
"ari = adjusted_rand_score(data_y, clusters)\n",
"nmi = normalized_mutual_info_score(data_y, clusters)\n",
"# Internal validation: silhouette of the clusters in the scaled feature space.\n",
"silhouette_avg = silhouette_score(data_x, clusters)\n",
"\n",
"print(f\"Adjusted Rand Index (ARI): {ari}\")\n",
"print(f\"Normalized Mutual Information (NMI): {nmi}\")\n",
"print(f\"Silhouette Score: {silhouette_avg}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- The ARI (about 0.15) and NMI (about 0.24) scores are low: the K-means clusters agree with the true diagnosis labels only weakly. There is some alignment with the true labels, but the clustering does not capture the underlying diagnosis groups.\n",
"\n",
"- The Silhouette Score indicates that the clustering has identified groups that are somewhat cohesive internally and separated from each other. This suggests that the clustering algorithm has been somewhat successful in identifying meaningful structures within the data, even if those structures don't perfectly align with the true labels."
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAyUAAALSCAYAAADDdH6KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd5ilRZX/P2+8OXbOafIMMMMAAwPDIDkIAgsIGABBWHRXf6uisipBFFYxrAFFF0WMIBnJWYac88SO09M53b75TfX7o7ovNDMkFxfD/c5zn2f6vXXfqjp16lQ651uKEEJQRhlllFFGGWWUUUYZZZTxPkF9vwtQRhlllFFGGWWUUUYZZfxzo7woKaOMMsooo4wyyiijjDLeV5QXJWWUUUYZZZRRRhlllFHG+4ryoqSMMsooo4wyyiijjDLKeF9RXpSUUUYZZZRRRhlllFFGGe8ryouSMsooo4wyyiijjDLKKON9RXlRUkYZZZRRRhlllFFGGWW8rygvSsooo4wyyiijjDLKKKOM9xXlRUkZZZRRRhlllFFGGWWU8b6ivCgpo4x/Qlx66aW0t7ejaRrLly9/v4uzHU499VRaW1vnPFMUhQsuuOBtf3vBBRegKMpfp2DvI1pbWzn11FPf72KUUcY/JP5R7UYZZfw9obwoKeN9x69+9SsUReHpp5/e4ff77bcfy5Yt+6uW4fbbb39HE95/BNx999188YtfZO+99+bKK6/k4osvftO0p556Koqi7PDj9/v/D0v9j4X99tuvJEdVVYlGoyxcuJCPfexj3HPPPe9ZPv9Mej2LN+ppKBRiyZIlfOMb3yCXy/1F73z00Ue54IILmJqaem8L+zeE18tM13WSySQrV67ks5/9LK+++upf/N5cLscFF1zAgw8++N4VFujp6ZlTZk3TaG5u5phjjuH5559/T/N6O/wz6EcZZfxfQH+/C1BGGX8LuP3227nsssv+KSZw999/P6qq8otf/ALTNN82vc/n44orrtjuuaZpf43ivSny+Ty6/o9jshobG7nkkksAyGazbNmyhRtuuIHf/va3nHDCCfz2t7/FMIxS+o0bN6Kq724f6Z9Jr1+Pgw46iI9//OMAZDIZ1q1bx9e+9jVeeOEFrr322nf9vkcffZQLL7yQU089lXg8/h6X9m8Hs3ITQpBKpXjhhRe46qqr+MlPfsK3vvUtPve5z73rd+ZyOS688EJALsbfa5x00kkcfvjhuK7L+vXr+elPf8odd9zB448//q5Ogb/61a/y5S9/+S8qwz+LfpRRxl8b/zgjfBlllPGOMDIyQiAQeEcLEgBd1/noRz/6Vy7V2+Mf7WQmFottJ9f/+q//4jOf+Qw/+clPaG1t5Vvf+lbpO5/P939dxL9bLFiwYI5s//Vf/xXLsrjhhhsoFAr/cLr0XuGNcgOpk0ceeSSf//znWbRoEYcffvj7VLodY9ddd51T5r333pujjjqKn/70p/zsZz97x+/Rdf0fatOjjDL+HlF23yrj7xa//e1vWblyJYFAgGQyyYknnsjWrVvnpFm3bh3HH388zc3N+Hw+mpqa+I//+A/y+Xwpzamnnspll10GzHVhgNdcBL7zne9w2WWX0d7eTjAY5OCDD2br1q0IIbjoootobGwkEAjwoQ99iImJiTlluPnmmzniiCOor6/H5/PR0dHBRRddhOu6c9LNuqk988wzrF69mkAgQFtbG5dffvk7kofjOFx00UV0dHTg8/lobW3lP//zPykWi6U0iqJw5ZVXks1mS/X81a9+9Y5l/mZ4M3/sWde8np6eOc/vuOMO1q5dSyQSIRqNsvvuu/P73//+LfPYUUzJww8/zO67747f76ejo+MtJyHvlb6A1JlwOMy2bds4+uijCYfDVFVV8YUvfGG7dn030DSNH/7whyxZsoQf//jHpFKp0ndvjCmxbZsLL7yQ+fPn4/f7qaioYJ999im5f72VXgN85zvfYfXq1VRUVBAIBF
i5ciXXXXfddmVSFIV/+7d/46abbmLZsmX4fD6WLl3KnXfeuV3abdu2cfrpp5d0va2tjbPPPhvLskpppqam+H//7//R1NSEz+dj3rx5fOtb38LzvDnvuvrqq1m5cmVJR3baaSd+8IMf/GWCBWpra0uuSa/HE088waGHHkosFiMYDLJ27VoeeeSR0vcXXHAB55xzDgBtbW0lOfb09HDsscey6667znnfkUceiaIo3HLLLXPyUBSFO+64413LwfM8/vu//5ulS5fi9/upqanhrLPOYnJyck661tZWPvjBD/Lwww+zxx574Pf7aW9v59e//vVfLDOAiooKrr76anRd55vf/GbpuWVZnHfeeaxcuZJYLEYoFGLNmjU88MADpTQ9PT1UVVUBcOGFF5ZkN9uPX3zxRU499VTa29vx+/3U1tbyiU98gvHx8b+4vPvvvz8A3d3dpWfXXnttqe9XVlby0Y9+lG3bts353Y5s2DvR/bfSD4B77rmHffbZh3g8TjgcZuHChfznf/7nX1y/Msr4R0Z5W6CMvxmkUinGxsa2e27b9nbPvvnNb/K1r32NE044gTPOOIPR0VF+9KMfse+++/Lcc8+VjtCvvfZacrkcZ599NhUVFTz55JP86Ec/or+/v+TGcdZZZzEwMMA999zDb37zmx2W7Xe/+x2WZfHv//7vTExM8O1vf5sTTjiB/fffnwcffJAvfelLbNmyhR/96Ed84Qtf4Je//GXpt7/61a8Ih8N87nOfIxwOc//993PeeecxPT3NpZdeOiefyclJDj/8cE444QROOukk/vjHP3L22Wdjmiaf+MQn3lJ+Z5xxBldddRXHHXccn//853niiSe45JJLWL9+PTfeeCMAv/nNb/j5z3/Ok08+WXLJWr169Vu+F9hhu5imSTQafdvfvhG/+tWv+MQnPsHSpUs599xzicfjPPfcc9x5552cfPLJ7/g9L730EgcffDBVVVVccMEFOI7D+eefT01NzXZp30t9mYXruhxyyCGsWrWK73znO9x7771897vfpaOjg7PPPvtdy2UWmqZx0kkn8bWvfY2HH36YI444YofpLrjgAi655BLOOOMM9thjD6anp3n66ad59tlnOeigg95Wr3/wgx9w1FFH8ZGPfATLsrj66qs5/vjjufXWW7fL8+GHH+aGG27gU5/6FJFIhB/+8If8y7/8C319fVRUVAAwMDDAHnvswdTUFGeeeSaLFi1i27ZtXHfddeRyOUzTJJfLsXbtWrZt28ZZZ51Fc3Mzjz76KOeeey6Dg4P893//NyAncieddBIHHHBA6bRo/fr1PPLII3z2s599WxkWCoWSzmazWR555BGuuuoqTj755DmLkvvvv5/DDjuMlStXcv7556OqKldeeSX7778/69atY4899uDYY49l06ZN/OEPf+D73/8+lZWVAFRVVbFmzRpuvvlmpqeniUajCCF45JFHUFWVdevWcdRRRwFysauqKnvvvTfAO5YDSPv0q1/9itNOO43PfOYzdHd38+Mf/5jnnnuORx55ZI6L35YtWzjuuOM4/fTTOeWUU/jlL3/JqaeeysqVK1m6dOnbyu3N0NzczNq1a3nggQdKdZ2enuaKK67gpJNO4pOf/CTpdJpf/OIXHHLIITz55JMsX76cqqoqfvrTn3L22WdzzDHHcOyxxwKw8847A7Kdu7q6OO2006itreWVV17h5z//Oa+88gqPP/74XxR43tnZCVDSy1nZ7b777lxyySUMDw/zgx/8gEceeWRO338zvJ3uv5V+vPLKK3zwgx9k55135utf/zo+n48tW7bMWfSWUUYZr4Moo4z3GVdeeaUA3vKzdOnSUvqenh6haZr45je/Oec9L730ktB1fc7zXC63XX6XXHKJUBRF9Pb2lp59+tOfFjvqDt3d3QIQVVVVYmpqqvT83HPPFYDYZZddhG3bpecnnXSSME1TFAqFtyzDWWedJYLB4Jx0a9euFYD47ne/W3pWLBbF8uXLRX
V1tbAsa3vhzeD5558XgDjjjDPmPP/CF74gAHH//feXnp1yyikiFAq96btej1NOOeVN2+SQQw4ppTv//PN3KL/Ztu3
"text/plain": [
"<Figure size 1000x800 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 8.95 s\n",
"Wall time: 11.5 s\n"
]
}
],
"source": [
"%%time\n",
"# Pairwise Euclidean distances between all data points (an n x n matrix).\n",
"distance_matrix = euclidean_distances(data_x, data_x)\n",
"\n",
"# Draw the full matrix as a heatmap; rows and columns follow the row order of data_x.\n",
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"sns.heatmap(distance_matrix, cmap='viridis', ax=ax)\n",
"ax.set_title('Heatmap of Euclidean Distances Between Data Points')\n",
"ax.set_xlabel('Data Point Index')\n",
"ax.set_ylabel('Data Point Index')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAggAAAK9CAYAAABB+5SlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACA+ElEQVR4nO3dd3wUdf4G8Ge27ybZTUJ6IYWS0DsISBUEKYoiIKKCYgcs2Cv2dp7iKWfhVDzPOz0VPX+Kep6C/TxUQFSKQBCEQALpm2TLzOf3x8JCCGASksxu9nn7ysvZyezsJ0OSffKdb1FEREBERER0GIPeBRAREVHoYUAgIiKiehgQiIiIqB4GBCIiIqqHAYGIiIjqYUAgIiKiehgQiIiIqB4GBCIiIqqHAYGIiIjqYUAgagXZ2dmYM2dO8PGqVaugKApWrVoV3Ddy5Eh079699YtrI+bMmYPs7Gy9yyBqMxgQiE7A+vXrcfbZZyMrKws2mw3p6ekYO3YsnnzySb1LaxE///wz7rrrLmzfvr3e5/785z9j2bJlzf6aI0eOhKIowY/4+HgMGDAAL7zwAjRNa5bXeOCBB/D22283y7mI2goGBKIm+uqrr9C/f3+sW7cOl1xyCZ566ilcfPHFMBgMeOKJJ+ocu2nTJixdulSnSpvPzz//jLvvvrtVAwIAZGRk4OWXX8bLL7+MO+64A36/H3PnzsWtt97aLOdnQCCqz6R3AUTh6v7774fL5cLq1asRGxtb53NFRUV1Hlut1lasrO1xuVw477zzgo8vu+wy5OXl4amnnsK9994Ls9msY3VEbRNbEIiaaOvWrejWrVu9cAAASUlJdR4f2QfheH7++WeMGjUKDocD6enpeOSRR+odU1RUhLlz5yI5ORk2mw29evXCSy+9VOeYo/VzAIDt27dDUZR6f+1v3LgRZ599NuLj42Gz2dC/f3+88847wc8vW7YM06ZNAwCMGjUq2OS/atUqZGdn46effsKnn34a3D9y5Mjgc8vKynDNNdcgMzMTVqsVHTt2xMMPP9zkWwQOhwMnnXQS3G43iouLj3mc2+3GddddF3zdvLw8PProozh8EVtFUeB2u/HSSy8Fa2/ovxVRW8YWBKImysrKwtdff40ff/yx2ToXlpaWYvz48TjrrLMwffp0vPHGG7jpppvQo0cPnHbaaQCAmpoajBw5Elu2bMH8+fORk5OD119/HXPmzEFZWRmuvvrqRr/uTz/9hKFDhyI9PR0333wzoqKi8M9//hNTpkzBm2++iTPPPBPDhw/HVVddhT/96U+49dZb0aVLFwBAly5dsHjxYixYsADR0dG47bbbAADJyckAgOrqaowYMQK7du3CZZddhvbt2+Orr77CLbfcgsLCQixevLhJ12rbtm0wGo1HDWgAICI4/fTTsXLlSsydOxe9e/fGhx9+iBtuuAG7du3C448/DgB4+eWXcfHFF2PgwIG49NJLAQAdOnRoUk1EbYoQUZP8+9//FqPRKEajUQYPHiw33nijfPjhh+L1eusdm5WVJbNnzw4+XrlypQCQlStXBveNGDFCAMhf//rX4D6PxyMpKSkyderU4L7FixcLAPnb3/4W3Of1emXw4MESHR0tFRUVx3wNEZGCggIBIC+++GJw3ymnnCI9evSQ2tra4D5N02TIkCHSqVOn4L7XX3/9qOcUEenWrZuMGDGi3v57771XoqKiZPPmzXX233zzzWI0GmXHjh31nnO4ESNGSH5+vhQXF0txcbFs2LBBrrrqKgEgkydPDh43e/ZsycrKCj5+++23BYDcd999dc539tlni6IosmXLluC+qKioOv8+RCTCWwxETTR27Fh8/fXXOP3007Fu3To88sgjGDduHNLT0+s0zTdGdHR0nXvtFosFAwcOxLZt24L7VqxYgZSUFMycOTO4z2w246qrrkJVVRU+/fTTRr1mSUkJPvnkE0yfPh2VlZXYt28f9u3bh/3792PcuHH45Z
dfsGvXriZ9PQDw+uuvY9iwYYiLiwuee9++fRgzZgxUVcVnn332u+fYuHEjEhMTkZiYiC5duuDJJ5/ExIkT8cILLxzzOStWrIDRaMRVV11VZ/91110HEcH777/f5K+JKBLwFgPRCRgwYACWL18Or9eLdevW4a233sLjjz+Os88+G2vXrkXXrl0bdb6MjAwoilJnX1xcHH744Yfg419//RWdOnWCwVA33x9s8v/1118b9ZpbtmyBiOCOO+7AHXfccdRjioqKkJ6e3qjzHvTLL7/ghx9+QGJi4jHP/Xuys7OxdOlSKIoCm82GTp061evncaRff/0VaWlpiImJqbO/qdeJKNIwIBA1A4vFggEDBmDAgAHo3LkzLrzwQrz++utYtGhRo85jNBqPul8O61TXUEcGjYNUVa3z+GBHweuvvx7jxo076nM6duzY6Nc//Pxjx47FjTfeeNTPd+7c+XfPERUVhTFjxjS5BiJqPAYEombWv39/AEBhYWGLnD8rKws//PADNE2r04qwcePG4OeBQMsDEBhBcLgj/3LOzc0FELhN8XtvwscKHcf7XIcOHVBVVdXqb/BZWVn4z3/+g8rKyjqtCEdeJ+D4XxdRpGIfBKImWrly5VH/sl+xYgUAIC8vr0Ved8KECdizZw9ee+214D6/348nn3wS0dHRGDFiBIDAG6DRaKx3j//Pf/5zncdJSUkYOXIknn322aOGmsOHEUZFRQGoHzoOfu5o+6dPn46vv/4aH374Yb3PlZWVwe/3H/uLPQETJkyAqqp46qmn6ux//PHHoShKcFQIcOzaiSIZWxCImmjBggWorq7GmWeeifz8fHi9Xnz11Vd47bXXkJ2djQsvvLBFXvfSSy/Fs88+izlz5uC7775DdnY23njjDXz55ZdYvHhx8K9ll8uFadOm4cknn4SiKOjQoQPefffdo97zX7JkCU4++WT06NEDl1xyCXJzc7F37158/fXX+O2337Bu3ToAQO/evWE0GvHwww+jvLwcVqsVo0ePRlJSEvr164enn34a9913Hzp27IikpCSMHj0aN9xwA9555x1MmjQJc+bMQb9+/eB2u7F+/Xq88cYb2L59OxISEpr9Ok2ePBmjRo3Cbbfdhu3bt6NXr17497//jX/961+45ppr6gxl7NevH/7zn//gscceQ1paGnJycjBo0KBmr4korOg7iIIofL3//vty0UUXSX5+vkRHR4vFYpGOHTvKggULZO/evXWObegwx27dutV7nSOH74mI7N27Vy688EJJSEgQi8UiPXr0qDNs8aDi4mKZOnWqOBwOiYuLk8suu0x+/PHHesMcRUS2bt0qF1xwgaSkpIjZbJb09HSZNGmSvPHGG3WOW7p0qeTm5orRaKzzNezZs0cmTpwoMTExAqDOkMfKykq55ZZbpGPHjmKxWCQhIUGGDBkijz766FGHhR7uWNelIdepsrJSrr32WklLSxOz2SydOnWSP/zhD6JpWp3jNm7cKMOHDxe73S4AOOSRSEQUkSb0fiIiIqI2jX0QiIiIqB4GBCIiIqqHAYGIiIjqYUAgIiKiehgQiIiIqB4GBCIiIqonJCZK0jQNu3fvRkxMDKc8JSIiagQRQWVlJdLS0uot4nYiQiIg7N69G5mZmXqXQUREFLZ27tyJjIyMZjtfSASEg1PD7ty5E06nU+dqiEgXbjeQlhbY3r0bOLDuAxEdX0VFBTIzM+stbX6iQiIgHLyt4HQ6GRCIItXhS107nQwIRI3U3Lfo2UmRiIiI6mFAICIionoYEIiIiKgeBgQiIiKqhwGBiIiI6gmJUQxERDAagQkTDm0Tka4YEIgoNNhswHvv6V0FER3AWwxERERUDwMCERER1cOAQEShwe0OzJ4YFRXYJiJdsQ8CEYWO6mq9KyCiA9iCQERERPUwIBAREVE9DAhERERUDwMCERER1cOAQERERPVwFAMRhQaDARgx4tA2EemKP4XNbOTIkbjmmmuCj7Ozs7F48eLgY0
VR8Pbbb7d6XUQhz24HVq0KfNjteldDFPHCOiB8/fXXMBqNmDhxYqu8nqqqeOihh5Cfnw+73Y74+HgMGjQIf/nLX4L
"text/plain": [
"<Figure size 600x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"n_clusters = kmeans.n_clusters\n",
"# Silhouette coefficient of every individual sample.\n",
"silhouette_vals = silhouette_samples(data_x, clusters)\n",
"# The overall silhouette score is the mean of the per-sample values, so it is\n",
"# taken from them instead of running a second pairwise-distance pass.\n",
"average_score = silhouette_vals.mean()\n",
"\n",
"band_gap = 10  # blank rows left between two consecutive cluster bands\n",
"\n",
"fig, ax = plt.subplots(figsize=(6, 8))\n",
"y_lower = 0\n",
"for i, cluster in enumerate(np.unique(clusters)):\n",
"    # Sorted values give each cluster band its characteristic knife shape.\n",
"    cluster_silhouette_vals = np.sort(silhouette_vals[clusters == cluster])\n",
"    y_upper = y_lower + len(cluster_silhouette_vals)\n",
"\n",
"    color = cm.nipy_spectral(float(i) / n_clusters)\n",
"    ax.fill_betweenx(np.arange(y_lower, y_upper),\n",
"                     0, cluster_silhouette_vals,\n",
"                     facecolor=color, edgecolor=color, alpha=0.7)\n",
"\n",
"    # Label the band with its cluster number at the band's vertical middle.\n",
"    ax.text(-0.05, y_lower + 0.5 * len(cluster_silhouette_vals), str(cluster))\n",
"\n",
"    # Start the next band above this one, always leaving the same gap.\n",
"    y_lower = y_upper + band_gap\n",
"\n",
"# Vertical line at the average silhouette score of all samples.\n",
"ax.axvline(x=average_score, color=\"red\", linestyle=\"--\")\n",
"\n",
"ax.set_yticks([])  # the bands are labelled by ax.text above, not by ticks\n",
"ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n",
"ax.set_title('Silhouette Plot')\n",
"ax.set_xlabel('Silhouette Coefficient Values')\n",
"ax.set_ylabel('Cluster Label')\n",
"\n",
"# Add the silhouette score to the plot\n",
"ax.text(0.02, 0.95, f'Avg Silhouette Score: {average_score:.2f}', transform=ax.transAxes)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAACVUAAAJOCAYAAAC9NnWuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wU1drA8d/M9k0lBUJoCYRelWaliIKIeu3XhmDv7Vruaxfrteu1ol5RsTdsqCBFUEQ6SK8JLb0n23fnvH9ssiQkgVBDeb757Cc7s2dmzuzMlnn2OedoSimFEEIIIYQQQgghhBBCCCGEEEIIIYQQQggA9KaugBBCCCGEEEIIIYQQQgghhBBCCCGEEEIcSiSpSgghhBBCCCGEEEIIIYQQQgghhBBCCCFqkKQqIYQQQgghhBBCCCGEEEIIIYQQQgghhKhBkqqEEEIIIYQQQgghhBBCCCGEEEIIIYQQogZJqhJCCCGEEEIIIYQQQgghhBBCCCGEEEKIGiSpSgghhBBCCCGEEEIIIYQQQgghhBBCCCFqkKQqIYQQQgghhBBCCCGEEEIIIYQQQgghhKhBkqqEEEIIIYQQQgghhBBCCCGEEEIIIYQQogZJqhJCCCGEEEIIIYQQQgghhBBCCCGEEEKIGiSpSgghhADS0tIYO3ZsZPq3335D0zR+++23PV7X+++/j6ZpZGVl7bf6HQrqe07Gjh1LWlpak9XpUPHoo4+iaVpTV6OWYDDIvffeS5s2bdB1nXPOOaepq3TI2pfX+4EwZMgQevTo0dTVqOVQPMeFEEIIIYQQ4kgmsardk1hVww7F63iJVTWexKr23YGo887vy0IIIY4OklQlhBDioKsO5FTf7HY7nTp14pZbbiEvL6+pq3fQLV26lMsvv5w2bdpgs9lISEjg1FNPZcKECYRCoaauXpP7888/efTRRyktLT3o2x47dmytczU2NpbevXvzwgsv4PP59ss23njjDd5///39sq6a3nvvPZ577jkuuOACPvjgA+68884Gyw4ZMiSyj7quExsbS+fOnRk9ejS//vrrPtXjQO1fWlparWPTvHlzTj75ZCZNmrTft7Ur2dnZPProoyxduvSgbnd/8Xq9vPTSSwwcOJC4uLha78fr1q07aPX45JNPePnllw/a9oQQQgghhBCiJolV1Saxql2TWNXekVjVwXE4x6o0TeOWW25p6moIIYQQtZibugJCCCGOXo899hjp6el4vV7++OMP3nzzTX766SdWrFiB0+ls6uodFO+++y433HADLVq0YPTo0XTs2JGKigqmT5/O1VdfTU5ODvfff39TV7NB77zzDoZhHNBt/Pnnn4wbN46xY8cSHx9/QLdVH5vNxrvvvgtAaWkpX3/9NXfffTcLFizgs88+2+f1v/HGGyQlJe33Vk4zZsygVatWvPTSS40q37p1a55++mkAXC4XGzZs4JtvvuGjjz7ioosu4qOPPsJisexxPQ7U/gH06dOHu+66CwgHjMaPH895553Hm2++yQ033NDo9QwaNAiPx4PVat3jOmRnZzNu3DjS0tLo06fPHi/flAoLCzn99NNZtGgRZ555JpdeeinR0dGsXbuWzz77jLfffhu/339Q6vLJJ5+wYsUK7rjjjoOyPSGEEEIIIYSoj8SqJFbVGBKr2jsSq5JYlRBCCHE4kqQqIYQQTWbkyJH069cPgGuuuYbExERefPFFvvvuOy655JK9Xq9hGPj9fux2+/6q6gHx119/ccMNN3D88cfz008/ERMTE3nsjjvuYOHChaxYseKg1UcphdfrxeFwNHqZvQlcHG7MZjOXX355ZPqmm25i4MCBfP7557z44oukpqY2Ye0alp+fv0eBvbi4uFr7CfCf//yH2267jTfeeIO0tDSeeeaZ/VzLfdOqVatadb7iiivIyMjgpZde2qNAla7rh/z7xYEwduxYlixZwldffcX5559f67HHH3+cBx54oIlqtn8cLp8FQg
ghhBBCiEOHxKokVnU4kFiVxKqEEEIIcfDI8H9CCCEOGaeccgoAmZmZADz//POccMIJJCYm4nA46Nu3L1999VWd5aq7Bf7444/p3r07NpuNX375ZY/W0Vjz5s3j9NNPJy4uDqfTyeDBg5kzZ85erWvcuHFomsbHH39cK0hVrV+/frVaTLlcLu66665I1+udO3fm+eefRylVa7lgMMjjjz9Ohw4dsNlspKWlcf/999fpAjwtLY0zzzyTKVOm0K9fPxwOB+PHjwdg27ZtnHPOOURFRdG8eXPuvPPOersQHzt2LGlpaZHprKwsNE3j+eef5+23347UoX///ixYsKDWsn///Tdjx46lffv22O12UlJSuOqqqygqKoqUefTRR7nnnnsASE9Pj3SfnZWVFSnz0Ucf0bdvXxwOBwkJCVx88cVs3bq11rbWr1/P+eefT0pKCna7ndatW3PxxRdTVlZWz5HZNV3XGTJkSGR/G9KY45CWlsbKlSuZNWtWZN+q192Q3Z0H1cdg5syZrFy5MrLe3377bY/31WQy8d///pdu3brx2muv1Xq+JkyYwCmnnELz5s2x2Wx069aNN998s9byu9q/4uJi7r77bnr27El0dDSxsbGMHDmSZcuW7XE9q6WkpNC1a9fIewjAkiVLGDlyJLGxsURHRzNs2DD++uuvWsv99ttvdZ6jIUOG0KNHD1atWsXQoUNxOp20atWKZ599ttZy/fv3B+DKK6+M7GN1F/L7et4tWrSIE044AYfDQXp6Om+99VbkscrKSqKiorj99tvrLLdt2zZMJlOkNWd95s2bx+TJk7n66qvrJFRBuNXr888/3+Dy1edZfd3la5rGo48+GpmuqKjgjjvuIC0tDZvNRvPmzTnttNNYvHgxEH6uJ0+ezObNmyPPYc33FZ/PxyOPPEJGRgY2m402bdpw77331nlP2tVngRBCCCGEEELsDYlV1SaxKolV1UdiVQ2TWFVYY2JVjfXdd98xatQoUlNTsdlsdOjQgccff7zBoUl3VedqjY097SwQCDBu3Dg6duyI3W4nMTGRk046aZ+HqBRCCHFokZ6qhBBCHDI2btwIQGJiIgCvvPIKZ599Npdddhl+v5/PPvuMCy+8kB9//JFRo0bVWnbGjBl88cUX3HLLLSQlJUWCJ3uyjt2ZMWMGI0eOpG/fvjzyyCPouh65WP/9998ZMGBAo9fldruZPn06gwYNom3btrstr5Ti7LPPZubMmVx99dX06dOHKVOmcM8997B9+/Za3WZfc801fPDBB1xwwQXcddddzJs3j6effprVq1czadKkWutdu3Ytl1xyCddffz3XXnstnTt3xuPxMGzYMLZs2cJtt91GamoqEydOZMaMGY3ev08++YSKigquv/56NE3j2Wef5bzzzmPTpk2RFoO//vormzZt4sorryQlJYWVK1fy9ttvs3LlSv766y80TeO8885j3bp1fPrpp7z00kskJSUBkJycDMCTTz7JQw89xEUXXcQ111xDQUEBr776KoMGDWLJkiXEx8fj9/sZMWIEPp+PW2+9lZSUFLZv386PP/5IaWkpcXFxjd6vajufq/VpzHF4+eWXufXWW4mOjo70CtSiRYsG19mY8yA5OZmJEyfy5JNPUllZGQlWdO3adY/3E8LBqksuuYSHHnqIP/74I/K6efPNN+nevTtnn302ZrOZH374gZtuugnDMLj55pt3u3+bNm3i22+/5cILLyQ9PZ28vDzGjx/P4MGDWbVq1V61qgwEAmzdujVyXFauXMnJJ59MbGws9957LxaLhfHjxzNkyBBmzZrFwIEDd7m+kpISTj/9dM477zwuuugivvrqK/7973/Ts2dPRo4cSdeuXXnsscd4+OGHue666zj55JMBOOGEE/b5vCspKeGMM87goosu4pJLLuGLL77gxhtvxGq1ctVVVxEdHc25554baYVqMpkiy3766acopbjssssaXP/3338PwOjRoxv13O6LG264ga+++o
pbbrmFbt26UVRUxB9//MHq1as59thjeeCBBygrK2Pbtm2R97Lo6Ggg3Jr77LPP5o8//uC6666ja9euLF++nJdeeol
"text/plain": [
"<Figure size 2400x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create a figure with two subplots side by side\n",
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 6))\n",
"\n",
"# First subplot for clusters\n",
"plot_data_clusters = pd.DataFrame(data_x.copy())\n",
"plot_data_clusters['Cluster'] = clusters\n",
"new_columns_clusters = plot_data_clusters.columns.tolist()\n",
"new_columns_clusters[0:10] = feature_names\n",
"plot_data_clusters.columns = new_columns_clusters\n",
"parallel_coordinates(plot_data_clusters, 'Cluster', colormap='viridis', ax=axes[0])\n",
"axes[0].set_title('Parallel Coordinates Plot of Data Points by Cluster')\n",
"axes[0].set_xlabel('Feature')\n",
"axes[0].set_ylabel('Feature Value')\n",
"axes[0].tick_params(axis='x', rotation=90)\n",
"\n",
"# Second subplot for labels\n",
"plot_data_labels = pd.DataFrame(data_x.copy())\n",
"label_data = data_y.reset_index(drop=True)\n",
"plot_data_labels['Cluster'] = label_data\n",
"\n",
"new_columns_labels = plot_data_labels.columns.tolist()\n",
"new_columns_labels[0:10] = feature_names\n",
"plot_data_labels.columns = new_columns_labels\n",
"parallel_coordinates(plot_data_labels, 'Cluster', colormap='viridis', ax=axes[1])\n",
"axes[1].set_title('Parallel Coordinates Plot of Data Points by Labels')\n",
"axes[1].set_xlabel('Feature')\n",
"axes[1].set_ylabel('Feature Value')\n",
"axes[1].tick_params(axis='x', rotation=90)\n",
"# set the legend\n",
"axes[1].legend(loc='upper right', labels=label_names)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}