DSA_SS24/notebooks/statistics.ipynb

258 lines
73 KiB
Plaintext
Raw Normal View History

2024-06-05 09:53:25 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hypothesis\n",
"This notebook reads the data from the pickle files and tests the hypothesis that sinus bradycardia (SB) occurs significantly more often in the 60-70 age group than in the other age groups.\n",
"The chi-squared test is used for this purpose. The same test is also applied to atrial fibrillation (AFIB) in the 70-80 age group."
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 8,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pickle\n",
2024-06-05 16:47:14 +02:00
"import sys\n",
"\n",
"\n",
2024-06-05 09:53:25 +02:00
"from scipy.stats import chi2_contingency\n",
2024-06-05 16:47:14 +02:00
"sys.path.append('../scripts')\n",
"import data_helper\n"
2024-06-05 09:53:25 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 9,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading GSVT\n",
"Reading AFIB\n",
"Reading SR\n",
"Reading SB\n",
"Number of patients per category:\n",
"age: 37011\n",
"diag: 37011\n",
"gender: 37011\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-08 18:06:08 +02:00
"# Load the data through the project helper, requesting demographic fields only\n",
"data = data_helper.load_data(only_demographic=True)\n",
"\n",
"# Report how many entries each category holds\n",
"print(\"Number of patients per category:\")\n",
"for cat_name, cat_values in data.items():\n",
"    print(f\"{cat_name}: {len(cat_values)}\")\n",
"\n",
"# Turn the category -> values mapping into a DataFrame with one column per category\n",
"df_dgc = pd.DataFrame(data)"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 10,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of patients in a diagnosis category: SB 15826\n",
"SR 10426\n",
"AFIB 9756\n",
"GSVT 1003\n",
"Name: diag, dtype: int64\n",
"Min number of patients in a diagnosis category: 1003\n",
"unique values in the diagnosis category: ['GSVT' 'AFIB' 'SR' 'SB']\n",
"GSVT 1003\n",
"AFIB 1003\n",
"SR 1003\n",
"SB 1003\n",
"Name: diag, dtype: int64\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
"# Seed for the per-diagnosis subsampling so that Restart & Run All reproduces the same sample\n",
"RANDOM_SEED = 42\n",
"\n",
"# get number of patients in a diagnosis category\n",
"num_patients = df_dgc['diag'].value_counts()\n",
"print(f\"Number of patients in a diagnosis category: {num_patients}\")\n",
"# get min number of patients in a diagnosis category (size of the smallest class)\n",
"min_num_patients = num_patients.min()\n",
"print(f\"Min number of patients in a diagnosis category: {min_num_patients}\")\n",
"\n",
"# get the unique values of the diagnosis category\n",
"unique_vals = df_dgc['diag'].unique()\n",
"print(f\"unique values in the diagnosis category: {unique_vals}\")\n",
"\n",
"# Balance the classes: draw min_num_patients patients from every diagnosis category.\n",
"# The parts are collected in a list and concatenated once instead of growing a DataFrame in the loop.\n",
"sampled_parts = [\n",
"    df_dgc[df_dgc['diag'] == val].sample(n=min_num_patients, random_state=RANDOM_SEED)\n",
"    for val in unique_vals\n",
"]\n",
"sampled_data = pd.concat(sampled_parts)\n",
"\n",
"print(sampled_data['diag'].value_counts())\n",
"\n",
"# From here on df_dgc refers to the balanced sample (same object as sampled_data)\n",
"df_dgc = sampled_data\n",
"\n",
"# Bin the ages into 10-year groups; pd.cut builds right-closed intervals such as (60, 70]\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"\n",
"# Ages outside (0, 90] get no group (NaN) and would silently drop out of the crosstabs below\n",
"num_unbinned = df_dgc['age_group'].isna().sum()\n",
"if num_unbinned > 0:\n",
"    print(f\"Warning: {num_unbinned} patients have an age outside (0, 90] and are excluded from the age-group analysis\")"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 11,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
"data": {
2024-06-26 18:19:34 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoUAAAHJCAYAAADzW0NeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC/+0lEQVR4nOzdd1wT5x8H8E8SkrD3lik4UBERFVDrFlTcuBeOuorW9VNLa+uqYq17tLZ11tFq61YcuLHgLg5QXLhZMmWFkef3B+U0EpARSIDv+/W6l+a5u+e+uVyOb57n7jkeY4yBEEIIIYTUanxlB0AIIYQQQpSPkkJCCCGEEEJJISGEEEIIoaSQEEIIIYSAkkJCCCGEEAJKCgkhhBBCCCgpJIQQQgghoKSQEEIIIYSAkkJCCCGEEIIKJoXBwcEYM2YM6tevD11dXYjFYlhYWKBr165YvXo1EhISFBWn0ixYsAA8Hg8LFiyosm3a2dmBx+Ph2bNnVbbN2mL06NHg8XjYvn27skMp1uvXrzFy5EhYWlpCTU0NPB4Po0ePLnM9q1atAo/HA4/Hw/r16xUfaC1RnmNm+/bt3L4vnEQiEYyNjdGoUSMMGzYMv/76K9LS0oqt48KFC+DxeOjQoUPF30Qt9uzZM/B4PNjZ2VX6tiQSCdatW4d27drB0NAQQqEQxsbGcHJywqBBg7B27doa8XdRnsLjvCoxxnD48GGMHDkS9erVg66uLkQiEUxMTNC2bVvMnj0bV65cqdKYqrtyJYVv375F165d4eXlhe3btyM3NxcdO3aEr68vnJycEBoaipkzZ6Ju3bq4evWqomOu1qpDUqKKast+Y4yhf//+2LVrFwwMDDB48GD4+fmhbdu2Za5ry5Yt3P+3bt2qyDBJKWlpacHPzw9+fn4YMmQI2rRpA4FAgL1792LixImwtLTEunXrQE8bLT9V+REdFxeHli1bYtq0aQgNDUXDhg3Rv39/dOzYESKRCPv378f06dPpb6KCREdHo2XLlujbty927doFqVSKjh07YuDAgWjRogWePHmCFStWwNPTE/3791d2uNWGWllXSE1NRdu2bREVFYWGDRvi119/xWeffSazjEQiwY4dOzB//nzExMQoLNja4uzZs8jNzUWdOnWUHUqNExgYiK+++goWFhbKDkWu58+f49q1a7CxscHt27ehplbmrygA4MqVK4iMjIS+vj5yc3MRHh6OW7duoXnz5gqOmJTE2NhY7g+ZmJgYLF++HGvXrsW0adPw6tUrLF++XGaZVq1a4f79+9DU1KyiaGumOnXq4P79+xAKhZW6nSlTpuDu3bto3Lgxjh8/DltbW5n58fHx+OOPP2BmZlapcdQGz58/h4eHB+Lj4+Hp6YkNGzbIPbdduXIFP/74IyIjI5UQZfVU5r84U6dORVRUFOzs7PDPP//A0NCwyDJisRgTJkxAnz59kJKSoog4axUHBwdlh1BjWVhYqGxCCAAvXrwAANjb25c7IQTetxIOHToUWVlZ2L59O7Zs2UJJoYqwsLDA6tWrUa9ePfj7++PHH39Er169ZH5ga2pqomHDhkqMsmYQCoWVvh+zs7Nx+PBhAAWXbXycEAKAqakppk2bVqlx1BYjRozgEsLz589DLBbLXc7DwwP79+/HtWvXqjjCaoyVwZMnT5hAIGAA2IEDB8qyqow//viDderUiRkYGDCRSMRsbGzYmDFjWFRUlNzlbW1tGQAWHR3NDh06xDp27MgMDAwYAHb+/HnGCvpeWOHb2bp1K/Pw8GC6urrceoVev37NZsyYwRo2bMg0NDSYtrY2a9GiBVu/fj3Lzc0tsu358+czAGz+/Pky5Tk5OWznzp1s2LBhrEGDBkxHR4epq6uz+vXrs6lTp7LXr1/LLB8dHc3FKG/6sP4P3+/HMjIyWGBgIHN1dWXa2tpMQ0ODNWrUiH3zzTcsKSmpyPKF27W1tWVSqZT98ssvrHnz5kxTU5Pp6uqyrl
27stDQULn7/cN9unPnTtayZUumpaXFjI2N2ZAhQ9jz588ZY4xJpVK2fv165uLiwjQ1NZmRkRHz8/NjcXFxReqszP1WmmPAz8+PAWDbtm3j1nvy5AnT09NjPB6PBQUFFYn59evXzMTEhAFgf/75p9x9VZyrV6+ygQMHMgsLCyYUCpmJiQnr2bMnO336dJnep7xjoTjp6elMR0eHAWA3btxgISEhDADT19dnWVlZxa4nlUrZli1bmJubG9PQ0GCGhoasW7du7J9//mHnz59nAFj79u3lrlvW71VJ0tLS2K+//sr69evHHB0dmaamJtPU1GRNmjRhX3/9NUtOTpa73offm3PnzrGuXbsyfX19pq6uzlxdXdmOHTuK3WZiYiKbNm0as7GxYSKRiFlbWzN/f3+WmJgo95j5lG3btnHfu09p2bIlA8B69uwpU17SPg8ODmZTpkxhLi4uzMjIiIlEIlanTh02aNAgdu3atWK3lZuby1asWMEaN27MxGIxMzExYQMGDGARERFczH5+fnLfi5+fH0tPT2dfffUVc3BwYCKRiJmZmbFRo0axV69eFbvN0n4HCmVnZ7Ply5ez5s2bM21tbSYUCpmZmRlr0aIFmz17NktMTJSJq7ip8G/Dh+dAeTIyMtjq1atZmzZtmL6+Pvc3qWfPnmz37t3Fvq8PvX79mtvuv//+W6p1CinieA8KCmLt27dnurq6TF9fn/n4+LA7d+5wy+7evZt5eHgwbW1tpqenx/r168ceP35cpM4Pj7mMjAwWEBDAHBwcmFgsZhYWFmzs2LHFftYfnn8/lpuby3777TfWvn177u++nZ0dmzRpEnvx4kWZ9ldhjADY7du3y7Tuh0qTVzDG2P3799no0aO5c4OBgQHr1KkT27t3r9x6P3W+KM337O3bt+yLL75g1tbW3PE4ffp0uX/jGSs4H/Ts2ZOZmpoyNTU1pq+vzxwdHdnw4cPZxYsXy7RfypQUrl27lvvjkpeXV6YNMVbwR2fUqFEMAFNTU2OdOnViQ4YMYfXr12cAmKamJjtx4kSR9Qo/vClTpjAArEWLFmzo0KGsffv27NKlSwVv5L+DZMqUKYzP57O2bduyoUOHMnd3d/bs2TPGGGMXL17kPnQ7OzvWu3dv5u3tzZV5eXmxnJwcmW0XlxS+fPmSAWB6enrMw8ODDRw4kPXo0YNZWloyAMzExIQ9evSIWz4hIYH5+fkxBwcHBoC1adOG+fn5cdPBgweLvN+PE4HExETWrFkzBoDp6uqy3r17M19fX2ZsbMwAMHt7+yLrfHhC9PPzY0KhkHXq1IkNGjSI2+9isZhduXKlyH4v3KdfffUV93kNGDCA2djYMADM2tqaJSUlsUGDBjF1dXXWrVs31q9fP2ZqasoAsKZNmzKJRFJl+600x0BxX9j9+/czAMzY2Ji9fPmSK8/Ly2OfffYZA8C++OKLIvuoJL/++ivj8/kMAHN1dWVDhw5lrVu35uJcsGBBkffp7e3NADAzMzOZ95mQkFDq7W7ZsoXb/4UKP+uS/shNnjyZAWB8Pp+1b9+eDRkyhDVu3JgJBAI2a9asYhOU8nyvSlKYxJqYmLC2bduywYMHMy8vL2ZkZMQAMEdHR/b27dsi6xV+b7799lvG4/GYm5sbGzJkCPPw8OD2+erVq4usFxsby+rVq8cAMAMDA9a/f3/Wt29fpq+vzxwcHFjv3r0rNSksPK9qa2vLJNAlJYWFSZmrqyvr3bs369+/P2vUqBF3bv3777+LrJOfn8969uzJADCRSMS8vLzY4MGDWd26dZmmpiZ3fi3uj1Xfvn1Z06ZNmb6+PuvVqxfr06cP9123tbVlKSkpRbZZlu9AYYydO3fmznHdu3dnQ4cOZV26dOE+38KkKyQkhPn5+TEtLS0GgPn6+sp8Z+7fv88YKzkpfPHiBbffNDU1WdeuXdmQIUPYZ599xvT09Er1+THGmEQiYZqamgwAGzt2LMvPzy/VeoXvoyLH+1dffcV4PB
5r06aNzHldX1+fPX78mM2ePVvm/G1tbc0AMEtLyyJJRuEx5+npyTw8PJimpibr0aMHl9QDYObm5uzhw4dF4ikuKUx
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Contingency table: number of patients per (age group, diagnosis) combination\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Draw the counts as an annotated heatmap on an explicit Axes\n",
"# NOTE(review): the table holds absolute counts, not correlation coefficients; the title text is kept as-is\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(corr_matrix_age_diag, annot=True, fmt='d', cmap='coolwarm', ax=ax)\n",
"ax.set_title('Correlationmatrix of Age and Diagnostic Sample Groups', fontsize=16)\n",
"ax.set_xlabel('Diagnostic Group')\n",
"ax.set_ylabel('Age Group')\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-06-26 18:19:34 +02:00
"Chi-Square Statistic: 1043.5644539016944\n",
"P-value: 4.935370162055676e-205\n",
"Chi-Square Statistic for SB in 60-70 vs others: 32.94855579340837\n",
"P-value for SB in 60-70 vs others: 9.463001659861763e-09\n"
2024-06-05 09:53:25 +02:00
]
}
],
"source": [
"# Contingency table: number of patients per (age group, diagnosis) combination\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test of independence between age group and diagnosis over the whole table\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check if SB (sinus bradycardia) has a significantly higher frequency in the 60-70 age group\n",
"age_60_70 = pd.Interval(60, 70, closed='right')  # row label produced by pd.cut for ages in (60, 70]\n",
"counts_other = corr_matrix_age_diag.drop(age_60_70).sum()  # per-diagnosis totals of all remaining age groups\n",
"sb_60_70 = corr_matrix_age_diag.loc[age_60_70, 'SB']\n",
"sb_other = counts_other['SB']\n",
"total_60_70 = corr_matrix_age_diag.loc[age_60_70].sum()\n",
"total_other = counts_other.sum()\n",
"\n",
"# 2x2 frequency table: rows = (60-70, other ages), columns = (SB, not SB)\n",
"# (scipy applies Yates' continuity correction to 2x2 tables by default)\n",
"observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n",
"chi2_sb, p_sb = chi2_contingency(observed)[:2]\n",
"\n",
"print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n",
"print(f\"P-value for SB in 60-70 vs others: {p_sb}\")\n",
"\n",
"# The chi-square test is two-sided: a small p-value only says that the SB shares differ.\n",
"# Compare the shares explicitly to see whether SB is actually MORE frequent in 60-70.\n",
"sb_share_60_70 = sb_60_70 / total_60_70\n",
"sb_share_other = sb_other / total_other\n",
"print(f\"Share of SB in 60-70: {sb_share_60_70:.3f}, in other age groups: {sb_share_other:.3f}\")"
]
},
2024-06-26 13:17:02 +02:00
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 13,
2024-06-26 13:17:02 +02:00
"metadata": {},
2024-06-26 18:19:34 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chi-Square Statistic: 1043.5644539016944\n",
"P-value: 4.935370162055676e-205\n",
"Chi-Square Statistic for AFIB in 70-80 vs others: 120.60329273774582\n",
"P-value for AFIB in 70-80 vs others: 4.667227334873944e-28\n"
]
}
],
2024-06-26 13:17:02 +02:00
"source": [
"# Bin the ages into 10-year groups (right-closed intervals such as (70, 80]) and\n",
"# build the contingency table of patient counts per (age group, diagnosis)\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test of independence between age group and diagnosis over the whole table\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check if AFIB (atrial fibrillation / atrial flutter) has a significantly higher frequency in the 70-80 age group\n",
"age_70_80 = pd.Interval(70, 80, closed='right')  # row label produced by pd.cut for ages in (70, 80]\n",
"counts_other_70_80 = corr_matrix_age_diag.drop(age_70_80).sum()  # per-diagnosis totals of all remaining age groups\n",
"afib_70_80 = corr_matrix_age_diag.loc[age_70_80, 'AFIB']\n",
"afib_other = counts_other_70_80['AFIB']\n",
"total_70_80 = corr_matrix_age_diag.loc[age_70_80].sum()\n",
"total_other_70_80 = counts_other_70_80.sum()\n",
"\n",
"# 2x2 frequency table: rows = (70-80, other ages), columns = (AFIB, not AFIB)\n",
"# (scipy applies Yates' continuity correction to 2x2 tables by default)\n",
"observed = [[afib_70_80, total_70_80 - afib_70_80], [afib_other, total_other_70_80 - afib_other]]\n",
"chi2_afib, p_afib = chi2_contingency(observed)[:2]\n",
"\n",
"print(f\"Chi-Square Statistic for AFIB in 70-80 vs others: {chi2_afib}\")\n",
"print(f\"P-value for AFIB in 70-80 vs others: {p_afib}\")\n",
"\n",
"# The chi-square test is two-sided: a small p-value only says that the AFIB shares differ.\n",
"# Compare the shares explicitly to see whether AFIB is actually MORE frequent in 70-80.\n",
"afib_share_70_80 = afib_70_80 / total_70_80\n",
"afib_share_other = afib_other / total_other_70_80\n",
"print(f\"Share of AFIB in 70-80: {afib_share_70_80:.3f}, in other age groups: {afib_share_other:.3f}\")"
]
},
2024-06-05 09:53:25 +02:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results can be interpreted as follows:\n",
"\n",
"- The first value returned is the chi-square statistic, which measures how far the observed frequencies deviate from the frequencies expected if age group and diagnosis were independent. A larger value indicates a larger deviation. The p-value is the probability of observing a deviation at least this large if age group and diagnosis were in fact independent (the null hypothesis). If the p-value is below the significance level of 0.05, the deviation is considered statistically significant.\n",
"\n",
"- The chi-square statistic for sinus bradycardia in the 60-70 age group versus all other age groups indicates whether the frequency of sinus bradycardia in the 60-70 age group differs from its frequency in the remaining age groups. If the p-value is below the significance level of 0.05, the difference in frequency is statistically significant. Because the test is two-sided, the direction of the difference (higher or lower) has to be read from the contingency table. The same interpretation applies to the test for atrial fibrillation (AFIB) in the 70-80 age group."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
2024-06-05 09:53:25 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}