DSA_SS24/notebooks/cluster_diagnosis.ipynb

219 lines
220 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"sys.path.append('../scripts')\n",
"import data_helper"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'JS00001': array([164889003, 59118001, 164934002]), 'JS00002': array([426177001, 164934002]), 'JS00004': array([426177001]), 'JS00005': array([164890007, 429622005, 428750005]), 'JS00006': array([426177001]), 'JS00007': array([164889003, 164934002]), 'JS00008': array([426783006]), 'JS00009': array([426177001]), 'JS00010': array([426177001]), 'JS00011': array([426177001, 55827005])}\n"
]
}
],
"source": [
"data = data_helper.load_data(only_diagnosis_ids=True)\n",
"# print first 10 items of dictionary\n",
"print({k: data[k] for k in list(data)[:10]})"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSKklEQVR4nO3dd1gU5/o+8HtBdkFgQZQigoBiQ0UiKmI3IkTRaMTvUWMUewk2MCqcJFhSMHqMJbbk5ERMsbccRUGC7WfEhsEWNaIoepSiCCuogPD+/shhjiuoswrsSu7Pde11uTPPzjwzINy8M/uuQgghQERERETPZaTvBoiIiIheBwxNRERERDIwNBERERHJwNBEREREJANDExEREZEMDE1EREREMjA0EREREcnA0EREREQkA0MTERERkQwMTVStzZkzBwqFokr21a1bN3Tr1k16fuDAASgUCmzZsqVK9j9ixAi4urpWyb5eVl5eHsaMGQMHBwcoFApMmzZN3y3B1dUVI0aM0HcbVIH4NaXKwtBEr43o6GgoFArpYWpqCkdHRwQEBGDZsmW4f/9+hezn1q1bmDNnDpKTkytkexXJkHuT4/PPP0d0dDQmTpyIH374AcOGDXtmraurq9bX287ODp07d8b27dt13u+RI0cwZ84c5OTkvEL3L2/37t2YM2dOpWw7JycHpqamUCgUuHDhQqXsw1BcuXIF48ePR4MGDWBqagq1Wo2OHTti6dKlePjwYZX08ODBA8yZMwcHDhyokv2RgRFEr4k1a9YIAGLevHnihx9+EN999534/PPPhb+/v1AoFMLFxUWcPn1a6zVFRUXi4cOHOu3nxIkTAoBYs2aNTq8rKCgQBQUF0vP9+/cLAGLz5s06bedleyssLBSPHj2qsH1VBh8fH9GxY0dZtS4uLsLLy0v88MMP4ocffhBffPGFaNCggQAgVq1apdN+Fy5cKACI1NTUMusePXokCgsLddqerkJCQkRl/bj95ptvhKmpqXBwcBAffvhhpezDEOzatUuYmZkJa2trMWXKFPHNN9+I5cuXi8GDBwsTExMxduxYqdbFxUUEBwdXSh9ZWVkCgJg9e3albJ8MWw39xTWil9OrVy+0adNGeh4REYF9+/ahT58+ePvtt3HhwgWYmZkBAGrUqIEaNSr32/zBgweoWbMmlEplpe7nRUxMTPS6fzkyMzPh4eEhu75evXp47733pOfDhw+Hu7s7Fi9ejAkTJlRITyqVqkK2oy8//vgjevfuDRcXF6xbtw6ffvpphWxXCIFHjx5J/5f0KTU1FYMHD4aLiwv27duHunXrSutCQkKQkpKCmJgYPXb46vLz82Fubq7vNuhF9J3aiOQqHWk6ceJEues///xzAUB888030rLZs2eX+Qt/7969omPHjsLKykqYm5uLxo0bi4iICCHE/0aHnn6Ujux07dpVNG/eXJw8eVJ07txZmJmZialTp0rrunbtKu2ndFsbNmwQERERwt7eXtSsWVP07dtXpKWlafX0rL+Mn9zmi3oLDg4WLi4uWq/Py8sTYWFhwsnJSSiVStG4cWOxcOFCUVJSolUHQISEhIjt27eL5s2bC6VSKTw8PMSePXvKPddPy8jIEKNGjRJ2dnZCpVIJT09PER0dXeZcPP0ob+TnyXMSGBhYZnmbNm2EiYmJEEKI06dPi+DgYOHm5iZUKpWwt7cXI0eOFHfu3JHqS78HnrXv8s79vXv3xNSpU6Xz1rBhQzF//nxRXFws1aSmpgoAYuHCheLrr78WDRo0EEqlUrRp00YcP35cqgsODi53/6XWr18vWrduLSwsLISlpaVo0aKFWLJkiazzfv36daFQKMSmTZvEsWPHBADx66+/llv7ww8/iLZt20qjNZ07dxZxcXFlzndsbKzw9vYWKpVKLF68WAghxJUrV8TAgQNFrVq1hJmZmfDx8RG7du0qs49ly5YJDw8PaR/e3t7ip59+ktZrNBoxdepU4eLiIpRKpbC1tRV+fn4iKSnpucc5YcKE5x7b057+mpb3c0CI//1MefL78MSJE8Lf31/Url1bmJqaCldXVzFy5EghxP++5k8/nhx1unDhgggKChK1atUSKpVKeHt7i59//rnc/R44cEBMnDhR2NraCmtr61c6R1Q1ONJE1cawYcPw97//HXv37sXYsWPLrTl//jz69OkDT09PzJs3DyqVCikpKfj1118BAM2aNcO8efMQGRmJcePGoXPnzgCADh06SNu4e/cuevXqhcGDB+O9996Dvb39c/v67LPPoFAoMGvWLGRmZmLJkiXw8/NDcnKyTn/Fy+ntSUIIvP3229i/fz9Gjx4NLy8vxMXFYcaMGfjPf/6DxYsXa9UfPnwY27Ztw/vvvw9LS0ssW7YMQUFBSEtLQ+3atZ/Z18OHD9GtWzekpKRg0qRJcHNzw+bNmzFixAjk5ORg6tSpaNasGX744QeEhobCyckJ06dPBwDY2trKPn4AKCoqwo0bN6R+4uPjcfXqVYwcORIODg44f/48vvnmG5w/fx5Hjx6FQqHAgAED8Mcff2D9+vVYvHgx6tSp89x9P3jwAF27dsV//vMfjB8/HvXr18eRI0cQERGB27dvY8mSJVr169atw/379zF+/HgoFAosWLAAAwYMwNWrV2FiYoLx48fj1q1biI+Pxw8//KD12vj4eAwZMgQ9evTAF198AQC4cOECfv31V0ydOvWF52P9+vUwNzdHnz59YGZmhoYNG+Knn34q8z0xd+5czJkzBx06dMC8efOgVCpx7Ngx7Nu3D/7+/lLdpUuXMGTIEIwfPx5jx45FkyZNkJGRgQ4dOuDBgweYMmUKateujbVr1+Ltt9/Gli1b8M477wAA/vnPf2LKlCkYOHAgpk6dikePHuHMmTM4duwY3n33XQDAhAkTsGXLFkyaNAkeHh64e/cuDh8+jAsXLqB169bPPM6dO3eiQYMGz/xeryiZmZnw9/eHra0twsPDYW1tjWvXrmHbtm0A/vyeWbVqFSZOnIh33nkHAwYMAAB4enoC+PPnS8eOHVGvXj2Eh4fD3NwcmzZtQv/+/bF161bpXJV6//33YWtri8jISOTn5wN4+XNEVUTfqY1IrheNNAkhhJWVlXjjjTek50//hbl48WIBQGRlZT1zG8+7b6hr164CgFi9enW568obaapXr57QaDTS8k2bNgkAYunSpdIyOSNNL+rt6ZGmHTt2CADi008/1aobOHCgUCgUIiUlRVoGQCiVSq1lp0+fFgDEV199VWZfT1qyZIkAIH788UdpWWFhofD19RUWFhZax/6s0aPyuLi4CH9/f5GVlSWysrLE6dOnxeDBgwUAMXnyZCGEEA8ePCjzuvXr1wsA4tChQ9Ky593T9PS5/+STT4S5ubn4448/tOrCw8OFsbGxNEpYOupQu3ZtkZ2dLdX9/PPPAoDYuXOntOxZ9zRNnTpVqNVq8fjxY1nn5GktW7YUQ4cOlZ7//e9/F3Xq1BFFRUXSssuXLwsjIyPxzjvvaI2UCSG0RhxdXFwEABEbG6tVM23aNAFA/L//9/+kZffv3xdubm7C1dVV2ma/fv1E8+bNn9uvlZWVCAkJ0ekYc3NzBQDRr18/2a952ZGm7du3v/BnzPPuaerRo4do2bKl1r2FJSUlokOHDqJRo0Zl9tupU6cyX/uXOUdUdfjuOapWLCwsnvsuOmtrawDAzz//jJKSkpfah0qlwsiRI2XXDx8+HJaWltLzgQMHom7duti9e/dL7V+u3bt3w9jYGFOmTNFaPn36dAghsGfPHq3lfn5+aNiwofTc09MTarUaV69efeF+HBwcMGTIEGmZiYkJpkyZgry8PBw8ePClj2Hv3r2wtbWFra0tWrVqhc2bN2PYsGHSqMyTI3WPHj3CnTt30L59ewDAqVOnXmqfmzdvRufOnVGrVi3cuXNHevj5+aG4uBiHDh3Sqh80aBBq1aolPS8dAXzReQP+/H7Mz89HfHy8zn2eOXMGZ8+e1TrvQ4YMwZ07dxAXFyct27FjB0pKShAZGQkjI+0f+U9Px+Hm5oaAgACtZbt370a7du3QqVMnaZmFhQXGjRu
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Assuming [`data`](command:_github.copilot.openSymbolFromReferences?%5B%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22c%3A%5C%5CUsers%5C%5Cfelix%5C%5COneDrive%5C%5CStudium%5C%5CMaster%20MDS%5C%5C1%20Semester%5C%5CDSA%5C%5Ccode%5C%5CDSA_SS24%5C%5Cnotebooks%5C%5Ccluster_diagnosis.ipynb%22%2C%22_sep%22%3A1%2C%22path%22%3A%22%2Fc%3A%2FUsers%2Ffelix%2FOneDrive%2FStudium%2FMaster%20MDS%2F1%20Semester%2FDSA%2Fcode%2FDSA_SS24%2Fnotebooks%2Fcluster_diagnosis.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22W2sZmlsZQ%3D%3D%22%7D%2C%7B%22line%22%3A0%2C%22character%22%3A0%7D%5D \"c:\\Users\\felix\\OneDrive\\Studium\\Master MDS\\1 Semester\\DSA\\code\\DSA_SS24\\notebooks\\cluster_diagnosis.ipynb\") is your dictionary\n",
"\n",
"# Step 1 & 2: Collect all unique diagnosis IDs\n",
"all_diagnosis_ids = set(diagnosis_id for patient_diagnoses in data.values() for diagnosis_id in patient_diagnoses)\n",
"\n",
"# Create a mapping of diagnosis IDs to column indices\n",
"diagnosis_id_to_index = {diagnosis_id: index for index, diagnosis_id in enumerate(all_diagnosis_ids)}\n",
"\n",
"# Step 4: Create the binary matrix\n",
"patient_diagnosis_matrix = np.zeros((len(data), len(all_diagnosis_ids)))\n",
"\n",
"for patient_index, (patient, diagnoses) in enumerate(data.items()):\n",
" for diagnosis in diagnoses:\n",
" diagnosis_index = diagnosis_id_to_index[diagnosis]\n",
" patient_diagnosis_matrix[patient_index, diagnosis_index] = 1\n",
"\n",
"# Step 3: Preprocess the data (optional, depending on the algorithm)\n",
"scaler = StandardScaler()\n",
"patient_diagnosis_matrix_scaled = scaler.fit_transform(patient_diagnosis_matrix)\n",
"\n",
"# Step 6: Cluster the data\n",
"# Adjust `n_clusters` based on domain knowledge or experimentation\n",
"kmeans = KMeans(n_clusters=4, random_state=42)\n",
"clusters = kmeans.fit_predict(patient_diagnosis_matrix_scaled)\n",
"\n",
"# Step 7: Analyze the clusters\n",
"# This step is domain-specific and might involve looking at the characteristics of each cluster,\n",
"# such as the most common diagnosis IDs in each cluster.\n",
"# plot the clusters\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.hist(clusters, bins=range(6))\n",
"plt.xlabel('Cluster')\n",
"plt.ylabel('Number of Patients')\n",
"plt.title('Distribution of Patients Across Clusters')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cluster 0: {164912004, 89792004, 425856008, 251205003, 164917005, 365413008, 17338001, 55930002, 251146004, 164890007, 81898007, 251199005, 426783006, 39732003, 427172004, 106068003, 164873001, 164889003, 54329005, 426761007, 713422000, 59118001, 251198002, 164942001, 61721007, 428750005, 445118002, 733534002, 164937009, 57054005, 426995002, 164931005, 55827005, 713427006, 13640000, 67751000119106, 426648003, 426664006, 29320008, 164909002, 27885002, 61277005, 233917008, 251170000, 195042002, 10370003, 713426002, 164930006, 111975006, 251223006, 427084000, 698252002, 270492004, 251164006, 426177001, 164865005, 47665007, 427393009, 164934002, 74390002, 284470004, 429622005, 6374002, 59931005}\n",
"Cluster 1: {89792004, 164912004, 425856008, 251205003, 164917005, 365413008, 55930002, 251146004, 164890007, 49578007, 426783006, 233892002, 39732003, 427172004, 106068003, 164873001, 54329005, 5609005, 61721007, 713422000, 426761007, 445118002, 733534002, 233897008, 57054005, 426627000, 426995002, 251187003, 55827005, 713427006, 13640000, 29320008, 446813000, 27885002, 164909002, 61277005, 251170000, 233917008, 195042002, 713426002, 10370003, 418818005, 164930006, 426183003, 111975006, 251223006, 427084000, 698252002, 75532003, 270492004, 426177001, 164865005, 428417006, 47665007, 427393009, 164934002, 6374002, 284470004, 429622005, 446358003, 445211001, 59931005, 77867006}\n",
"Cluster 2: {54016002, 89792004, 425856008, 81898007, 49578007, 251199005, 106068003, 5609005, 63593006, 233897008, 251198002, 445118002, 55827005, 713427006, 164947007, 426664006, 233917008, 713426002, 111975006, 6374002, 365413008, 17338001, 55930002, 233892002, 427172004, 54329005, 713422000, 428750005, 251170000, 195042002, 427084000, 698252002, 75532003, 284470004, 429622005, 164896001, 164917005, 426783006, 195060002, 39732003, 164873001, 426761007, 164937009, 733534002, 426995002, 251187003, 13640000, 67751000119106, 29320008, 446813000, 61277005, 65778007, 426183003, 270492004, 251164006, 47665007, 164934002, 445211001, 59931005, 251120003, 164912004, 251205003, 164890007, 50799005, 164889003, 61721007, 59118001, 426627000, 164931005, 426648003, 164909002, 27885002, 10370003, 251223006, 426177001, 164865005, 427393009, 74390002, 446358003}\n",
"Cluster 3: {54016002, 89792004, 425856008, 11157007, 81898007, 49578007, 251199005, 106068003, 5609005, 63593006, 233897008, 251198002, 445118002, 55827005, 713427006, 164947007, 426664006, 233917008, 713426002, 111975006, 6374002, 251173003, 365413008, 17338001, 55930002, 233892002, 427172004, 54329005, 713422000, 164942001, 428750005, 111288001, 251170000, 195042002, 427084000, 698252002, 75532003, 284470004, 429622005, 164896001, 164917005, 426783006, 195060002, 39732003, 164873001, 426761007, 164937009, 733534002, 251166008, 426995002, 251187003, 13640000, 67751000119106, 67741000119109, 29320008, 446813000, 195101003, 61277005, 65778007, 426183003, 270492004, 251164006, 47665007, 164934002, 59931005, 164912004, 251205003, 164890007, 164889003, 61721007, 59118001, 426627000, 164931005, 426648003, 164909002, 27885002, 10370003, 251223006, 251180001, 426177001, 164865005, 427393009, 74390002, 446358003, 17366009}\n"
]
}
],
"source": [
"# Create a dictionary to hold diagnosis IDs for each cluster\n",
"cluster_diagnosis = {i: [] for i in range(kmeans.n_clusters)}\n",
"\n",
"# Iterate over each patient and their assigned cluster\n",
"for patient_index, cluster in enumerate(clusters):\n",
" # Retrieve the original diagnosis IDs for this patient\n",
" patient_diagnoses = data[list(data.keys())[patient_index]]\n",
" # Append these diagnosis IDs to the corresponding cluster entry in the dictionary\n",
" cluster_diagnosis[cluster].extend(patient_diagnoses)\n",
"\n",
"# Deduplicate diagnosis IDs in each cluster and print them\n",
"for cluster, diagnoses in cluster_diagnosis.items():\n",
" unique_diagnoses = set(diagnoses)\n",
" print(f\"Cluster {cluster}: {unique_diagnoses}\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cluster 0: 1035 patients\n",
"Cluster 1: 880 patients\n",
"Cluster 2: 9012 patients\n",
"Cluster 3: 34223 patients\n"
]
}
],
"source": [
"# Initialize a dictionary to count patients in each cluster\n",
"cluster_patient_count = {i: 0 for i in range(kmeans.n_clusters)}\n",
"\n",
"# Iterate over the assigned clusters and increment the count for each cluster\n",
"for cluster in clusters:\n",
" cluster_patient_count[cluster] += 1\n",
"\n",
"# Print the number of patients in each cluster\n",
"for cluster, count in cluster_patient_count.items():\n",
" print(f\"Cluster {cluster}: {count} patients\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1QAAAJwCAYAAACOKyDJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxU1f3/8dcsmZlkskwmhIQlICAIIqCELYAgAgUXrC3a2ta1aqui1q3fr/XXRa1Kv1+11lbcW22tS78VtVYEBFyoICpE3EFBMGxJgGGyTyYzc39/XDMwJIFkMtnfz8cjD+CemXvPTCZw35xzPsdiGIaBiIiIiIiItJi1ozsgIiIiIiLSVSlQiYiIiIiIxEmBSkREREREJE4KVCIiIiIiInFSoBIREREREYmTApWIiIiIiEicFKhERERERETipEAlIiIiIiISJwUqERERERGROClQiUiP9eSTT2KxWNi+fXtHdyVup5xyCqecckpHd6PTKikp4ZxzziErKwuLxcIf/vCHju5Sj9WRP28XX3wxxxxzTLtfV0R6BgUqEel06m+86r9cLhfDhg3j6quvpqSkpMXnu+uuu3jppZcS39FmqK6u5tZbb+XNN99s0fNKSkq46aabGD58OCkpKbjdbvLz87njjjvw+/1t0tfGdOR7lwjXX389y5cv5xe/+AVPPfUUc+fO5dVXX+XWW29t135cfPHFMZ/p9PR0xowZw7333kttbW2Dx2/cuJHzzz+fvLw8nE4nXq+XWbNm8cQTTxAOhxs83u/343K5sFgsfP75583u1+E/a3a7nX79+nHxxReza9euVr3m9rZ7925uvfVWNm7c2NFdEZEext7RHRARacrtt9/OoEGDCAQCvP322zz00EO8+uqrfPLJJ6SkpDT7PHfddRfnnHMOZ599dszxCy64gPPOOw+n05ngnh9UXV3NbbfdBtDskaT333+f008/ncrKSs4//3zy8/MBWL9+Pb/73e9YvXo1r732Wlt1OUZT711X8frrr/Ptb3+bm266KXrsgQceYNGiRe0eqpxOJ48//jhgBqDFixdz00038f777/Pcc89FH/f4449zxRVXkJOTwwUXXMDQoUOpqKhg1apVXHrppezZs4dbbrkl5tz//Oc/sVgs5Obm8vTTT3PHHXe0qG+H/qytW7eOJ598krfffptPPvkEl8vV+hffDnbv3s1tt93GMcccw4knnhjT9thjjxGJRDqmYyLS7SlQiUinddpppzFu3DgALrvsMrKysvj973/Pv/71L37wgx+0+vw2mw2bzdbq8ySS3+/nO9/5DjabjQ8++IDhw4fHtN9555089thjHdS7xAgEAjgcDqzWtp8kUVpaisfjafPrGIZBIBAgOTm5ycfY7XbOP//86J+vuuoqJk6cyD/+8Q9+//vf07dvX9atW8cVV1xBQUEBr776KmlpadHHX3fddaxfv55PPvmkwbn//ve/c/rppzNw4ECeeeaZFgeqw3/WevXqxf/8z//w8ssv873vfa9F5+qMkpKSOroLItKNacqfiHQZp556KgDbtm0D4J577mHy5MlkZWWRnJxMfn4+zz//fMxzLBYLVVVV/PWvf41Oa7r44ouBptd0LF26lJNPPhm3201aWhpnnHEGn376acxjLr74YlJTU9m1axdnn302qampZGdnc9NNN0WnZG3fvp3s7GwAbrvttuj1jzQy8sgjj7Br1y5+//vfNwhTADk5Ofzyl79s8vlNvaY333wTi8USM/Xwyy+/ZP78+eTm5uJyuejfvz/nnXceZWVlR33vAHbt2sWPf/xjcnJycDqdjBw5kr/85S+NXve5557jl7/8Jf369SMlJYXy8nLq6uq47bbbGDp0KC6Xi6ysLKZOncqKFSuafH0APp+Pm266iVGjRpGamkp6ejqnnXYaH374YYP3wTAMFi1aFNP/RYsWRV9f/Ve9SCTCH/7wB0aOHInL5SInJ4ef/vSnHDhwIKYPxxxzDGeeeSbLly9n3LhxJCcn88gjjxyx34ezWq3RUcv671f95+Tpp5+OCVP1xo0bF/M9ACgqKuI///kP5513Hueddx7btm1j7dq1LerL4U4++WQAtm7dGnN806ZNnHPOOXi9XlwuF+PGjePll19u8PxPP/2UU089leTkZPr3788dd9zR6AhRUz8PxxxzTIPX6ff7uf766znmmGNwOp3079+fCy+8kH379vHmm28yfvx4AC655JLo9/XJJ58EGl9DVVVVxY033hidVnncccdxzz33YBhGgz5effXVvPTSS5xwwgnRz/qyZcuO9BaKSA+iESoR6TLqb+6ysrIAuP/++znrrLP40Y9+RDAY5LnnnuPcc8/llVde4YwzzgDgqaee4rLLLmPChAn85Cc/AWDIkCFNXuOpp57ioosuYs6cOfzP//wP1dXVPPTQQ0ydOpUPPvgg5qYsHA4zZ84cJk6cyD333MPKlSu59957GTJkCFdeeSXZ2dk89NBDXHnllXznO9/hu9/9LgCjR49u8vovv/wyycnJnHPOOa16r44mGAwyZ84camtrueaaa8jNzWXXrl288sor+P1+MjIyjvjelZSUMGnSpOjNZnZ2NkuXLuXSSy+lvLyc6667LuZ6v/3tb3E4HNx0003U1tbicDi49dZbWbhwYfQa5eXlrF+/nsLCQmbPnt1k37/66iteeuklzj33XAYNGkRJSQmPPPII06dP57PPPqNv375MmzaNp556igsuuIDZs2dz4YUXRvu/e/duVqxYwVNPPdXg3D/96U958sknueSSS7j22mvZtm0bDzzwAB988AFr1qyJGenYvHkzP/jBD/jpT3/K5ZdfznHHHdfi78Ohn+nq6mpWrVrFtGnTGDBgQLPP8eyzz+J2uznzzDNJTk5myJAhPP3000yePLnF/alXH/AyMzOjxz799FOmTJlCv379uPnmm3G73fzf//0fZ599NosXL+Y73/kOAMXFxcyYMYNQKBR93KOPPnrE0bujqays5OSTT+bzzz/nxz/+MWPHjmXfvn28/PLL7Ny5kxEjRnD77bfz61//mp/85CfRQNjUe2AYBmeddRZvvPEGl156KSeeeCLLly/n5z//Obt27eK+++6Lefzbb7/NCy+8wFVXXUVaWhp//OMfmT9/PkVFRdG/j0SkBzNERDqZJ554wgCMlStXGnv37jV27NhhPPfcc0ZWVpaRnJxs7Ny50zAMw6iuro55XjAYNE444QTj1FNPjTnudruNiy66qMnrbNu2zTAMw6ioqDA8Ho9x+eWXxzyuuLjYyMjIiDl+0UUXGYBx++23xzz2pJNOMvLz86N/3rt3rwEYv/nNb5r12jMzM40xY8Y067GGYRjTp083pk+f3uRrqvfGG28YgPHGG28YhmEYH3zwgQEY//znP494/qbeu0svvdTo06ePsW/fvpjj5513npGRkRH93tRfd/DgwQ2+X2PGjDHOOOOM5r3QQwQCASMcDscc27Ztm+F0Oht8PwBjwYIFMccWLFhgNPbP33/+8x8DMJ5++umY48uWLWtwfODAgQZgLFu2rFl9vuiiiwy3223s3bvX2Lt3r7FlyxbjrrvuMiwWizF69GjDMAzjww8/NADjZz/7WbPOWW/UqFHGj370o+ifb7nlFqNXr15GXV3dUZ/b2M/a888/b2RnZxtOp9PYsWNH9LEzZ840Ro0aZQQCgeixSCRiTJ482Rg6dGj02HXXXWcAxrvvvhs9VlpaamRkZDT4bDb1szFw4MCYz92vf/1rAzBeeOGFBo+NRCKGYRjG+++/bwDGE0880eAxF110kTFw4MDon1966SUDMO64446Yx51zzjmGxWIxtmzZEtNHh8MRc6z+e/WnP/2pwbVEpOfRlD8R6bRmzZpFdnY2eXl5nHfeeaSmpvLiiy/Sr18/gJj/8T5w4ABlZWWcfPLJFBYWxnW9FStW4Pf7+cEPfsC+ffuiXzabjYkTJ/LGG280eM4VV1wR8+eTTz6Zr776Kq7rA5SXlzc61SvRMjIyAFi+fDnV1dUteq5hGCxevJh58+ZhGEbMezVnzhzKysoafA8uuuiiBiM
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"from sklearn.decomposition import PCA\n",
"\n",
"# Step 1: Apply PCA to reduce to 2 dimensions\n",
"pca = PCA(n_components=2)\n",
"reduced_data = pca.fit_transform(patient_diagnosis_matrix_scaled)\n",
"\n",
"# Step 2: Plot the results\n",
"plt.figure(figsize=(10, 7))\n",
"# Scatter plot for each cluster\n",
"for i in range(kmeans.n_clusters):\n",
" # Select only data points that belong to the current cluster\n",
" cluster_data = reduced_data[clusters == i]\n",
" plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {i}', alpha=0.7, edgecolors='w')\n",
"\n",
"# Add legend and labels for clarity\n",
"plt.title('Patient Clusters after PCA Reduction')\n",
"plt.xlabel('PCA 1')\n",
"plt.ylabel('PCA 2')\n",
"plt.legend()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}