DSA_SS24/notebooks/statistics.ipynb

279 lines
76 KiB
Plaintext
Raw Permalink Normal View History

2024-06-05 09:53:25 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hypothesis\n",
"This notebook reads the data from the pickle files and tests the hypothesis that the frequency of sinus bradycardia is significantly higher in the 60-70 age group than in the other age groups.\n",
"For this purpose, the chi-squared test is used."
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 8,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pickle\n",
2024-06-05 16:47:14 +02:00
"import sys\n",
"\n",
"\n",
2024-06-05 09:53:25 +02:00
"from scipy.stats import chi2_contingency\n",
2024-06-05 16:47:14 +02:00
"sys.path.append('../scripts')\n",
"import data_helper\n"
2024-06-05 09:53:25 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 9,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading GSVT\n",
"Reading AFIB\n",
"Reading SR\n",
"Reading SB\n",
"Number of patients per category:\n",
"age: 37011\n",
"diag: 37011\n",
"gender: 37011\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-08 18:06:08 +02:00
"data = data_helper.load_data(only_demographic=True)\n",
2024-06-05 16:47:14 +02:00
"\n",
"# Report how many entries each loaded category holds\n",
"print(\"Number of patients per category:\")\n",
"for cat_name, values in data.items():\n",
"    print(f\"{cat_name}: {len(values)}\")\n",
"\n",
"# Tabular view of the loaded data: one column per key of `data`\n",
"df_dgc = pd.DataFrame(data)"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 10,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of patients in a diagnosis category: SB 15826\n",
"SR 10426\n",
"AFIB 9756\n",
"GSVT 1003\n",
"Name: diag, dtype: int64\n",
"Min number of patients in a diagnosis category: 1003\n",
"unique values in the diagnosis category: ['GSVT' 'AFIB' 'SR' 'SB']\n",
"GSVT 1003\n",
"AFIB 1003\n",
"SR 1003\n",
"SB 1003\n",
"Name: diag, dtype: int64\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
"# Class sizes per diagnosis category (printed as a pandas Series)\n",
"num_patients = df_dgc['diag'].value_counts()\n",
"print(f\"Number of patients in a diagnosis category: {num_patients}\")\n",
"# The smallest class sets the per-class sample size used for balancing\n",
"min_num_patients = num_patients.min()\n",
"print(f\"Min number of patients in a diagnosis category: {min_num_patients}\")\n",
"\n",
"# Diagnosis labels in order of first appearance\n",
"unique_vals = df_dgc['diag'].unique()\n",
"print(f\"unique values in the diagnosis category: {unique_vals}\")\n",
"\n",
"# Downsample every diagnosis category to the size of the smallest one.\n",
"# A fixed seed keeps the sample, and therefore the chi-square results below,\n",
"# identical across Restart & Run All.\n",
"RANDOM_SEED = 42\n",
"sampled_data = pd.concat(\n",
"    [\n",
"        df_dgc[df_dgc['diag'] == val].sample(min_num_patients, random_state=RANDOM_SEED)\n",
"        for val in unique_vals\n",
"    ]\n",
")\n",
"\n",
2024-06-05 16:47:14 +02:00
"\n",
"print(sampled_data['diag'].value_counts())\n",
"\n",
2024-06-26 18:19:34 +02:00
"# From here on the analysis runs on the balanced sample only\n",
"df_dgc = sampled_data\n",
"\n",
"# Bin ages into decades: right-closed intervals (0, 10], (10, 20], ..., (80, 90].\n",
"# NOTE(review): pd.cut maps values outside the bin edges (age 0 or above 90) to NaN,\n",
"# and such rows would then be missing from the crosstabs -- confirm this is intended.\n",
"age_categories = list(range(0, 91, 10))\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 11,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
"data": {
2024-06-26 18:19:34 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoUAAAHJCAYAAADzW0NeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC/+0lEQVR4nOzdd1wT5x8H8E8SkrD3lik4UBERFVDrFlTcuBeOuorW9VNLa+uqYq17tLZ11tFq61YcuLHgLg5QXLhZMmWFkef3B+U0EpARSIDv+/W6l+a5u+e+uVyOb57n7jkeY4yBEEIIIYTUanxlB0AIIYQQQpSPkkJCCCGEEEJJISGEEEIIoaSQEEIIIYSAkkJCCCGEEAJKCgkhhBBCCCgpJIQQQgghoKSQEEIIIYSAkkJCCCGEEIIKJoXBwcEYM2YM6tevD11dXYjFYlhYWKBr165YvXo1EhISFBWn0ixYsAA8Hg8LFiyosm3a2dmBx+Ph2bNnVbbN2mL06NHg8XjYvn27skMp1uvXrzFy5EhYWlpCTU0NPB4Po0ePLnM9q1atAo/HA4/Hw/r16xUfaC1RnmNm+/bt3L4vnEQiEYyNjdGoUSMMGzYMv/76K9LS0oqt48KFC+DxeOjQoUPF30Qt9uzZM/B4PNjZ2VX6tiQSCdatW4d27drB0NAQQqEQxsbGcHJywqBBg7B27doa8XdRnsLjvCoxxnD48GGMHDkS9erVg66uLkQiEUxMTNC2bVvMnj0bV65cqdKYqrtyJYVv375F165d4eXlhe3btyM3NxcdO3aEr68vnJycEBoaipkzZ6Ju3bq4evWqomOu1qpDUqKKast+Y4yhf//+2LVrFwwMDDB48GD4+fmhbdu2Za5ry5Yt3P+3bt2qyDBJKWlpacHPzw9+fn4YMmQI2rRpA4FAgL1792LixImwtLTEunXrQE8bLT9V+REdFxeHli1bYtq0aQgNDUXDhg3Rv39/dOzYESKRCPv378f06dPpb6KCREdHo2XLlujbty927doFqVSKjh07YuDAgWjRogWePHmCFStWwNPTE/3791d2uNWGWllXSE1NRdu2bREVFYWGDRvi119/xWeffSazjEQiwY4dOzB//nzExMQoLNja4uzZs8jNzUWdOnWUHUqNExgYiK+++goWFhbKDkWu58+f49q1a7CxscHt27ehplbmrygA4MqVK4iMjIS+vj5yc3MRHh6OW7duoXnz5gqOmJTE2NhY7g+ZmJgYLF++HGvXrsW0adPw6tUrLF++XGaZVq1a4f79+9DU1KyiaGumOnXq4P79+xAKhZW6nSlTpuDu3bto3Lgxjh8/DltbW5n58fHx+OOPP2BmZlapcdQGz58/h4eHB+Lj4+Hp6YkNGzbIPbdduXIFP/74IyIjI5UQZfVU5r84U6dORVRUFOzs7PDPP//A0NCwyDJisRgTJkxAnz59kJKSoog4axUHBwdlh1BjWVhYqGxCCAAvXrwAANjb25c7IQTetxIOHToUWVlZ2L59O7Zs2UJJoYqwsLDA6tWrUa9ePfj7++PHH39Er169ZH5ga2pqomHDhkqMsmYQCoWVvh+zs7Nx+PBhAAWXbXycEAKAqakppk2bVqlx1BYjRozgEsLz589DLBbLXc7DwwP79+/HtWvXqjjCaoyVwZMnT5hAIGAA2IEDB8qyqow//viDderUiRkYGDCRSMRsbGzYmDFjWFRUlNzlbW1tGQAWHR3NDh06xDp27MgMDAwYAHb+/HnGCvpeWOHb2bp1K/Pw8GC6urrceoVev37NZsyYwRo2bMg0NDSYtrY2a9GiBVu/fj3Lzc0tsu358+czAGz+/Pky5Tk5OWznzp1s2LBhrEGDBkxHR4epq6uz+vXrs6lTp7LXr1/LLB8dHc3FKG/6sP4P3+/HMjIyWGBgIHN1dWXa2tpMQ0ODNWrUiH3zzTcsKSmpyPKF27W1tWVSqZT98ssvrHnz5kxTU5Pp6uqyrl
27stDQULn7/cN9unPnTtayZUumpaXFjI2N2ZAhQ9jz588ZY4xJpVK2fv165uLiwjQ1NZmRkRHz8/NjcXFxReqszP1WmmPAz8+PAWDbtm3j1nvy5AnT09NjPB6PBQUFFYn59evXzMTEhAFgf/75p9x9VZyrV6+ygQMHMgsLCyYUCpmJiQnr2bMnO336dJnep7xjoTjp6elMR0eHAWA3btxgISEhDADT19dnWVlZxa4nlUrZli1bmJubG9PQ0GCGhoasW7du7J9//mHnz59nAFj79u3lrlvW71VJ0tLS2K+//sr69evHHB0dmaamJtPU1GRNmjRhX3/9NUtOTpa73offm3PnzrGuXbsyfX19pq6uzlxdXdmOHTuK3WZiYiKbNm0as7GxYSKRiFlbWzN/f3+WmJgo95j5lG3btnHfu09p2bIlA8B69uwpU17SPg8ODmZTpkxhLi4uzMjIiIlEIlanTh02aNAgdu3atWK3lZuby1asWMEaN27MxGIxMzExYQMGDGARERFczH5+fnLfi5+fH0tPT2dfffUVc3BwYCKRiJmZmbFRo0axV69eFbvN0n4HCmVnZ7Ply5ez5s2bM21tbSYUCpmZmRlr0aIFmz17NktMTJSJq7ip8G/Dh+dAeTIyMtjq1atZmzZtmL6+Pvc3qWfPnmz37t3Fvq8PvX79mtvuv//+W6p1CinieA8KCmLt27dnurq6TF9fn/n4+LA7d+5wy+7evZt5eHgwbW1tpqenx/r168ceP35cpM4Pj7mMjAwWEBDAHBwcmFgsZhYWFmzs2LHFftYfnn8/lpuby3777TfWvn177u++nZ0dmzRpEnvx4kWZ9ldhjADY7du3y7Tuh0qTVzDG2P3799no0aO5c4OBgQHr1KkT27t3r9x6P3W+KM337O3bt+yLL75g1tbW3PE4ffp0uX/jGSs4H/Ts2ZOZmpoyNTU1pq+vzxwdHdnw4cPZxYsXy7RfypQUrl27lvvjkpeXV6YNMVbwR2fUqFEMAFNTU2OdOnViQ4YMYfXr12cAmKamJjtx4kSR9Qo/vClTpjAArEWLFmzo0KGsffv27NKlSwVv5L+DZMqUKYzP57O2bduyoUOHMnd3d/bs2TPGGGMXL17kPnQ7OzvWu3dv5u3tzZV5eXmxnJwcmW0XlxS+fPmSAWB6enrMw8ODDRw4kPXo0YNZWloyAMzExIQ9evSIWz4hIYH5+fkxBwcHBoC1adOG+fn5cdPBgweLvN+PE4HExETWrFkzBoDp6uqy3r17M19fX2ZsbMwAMHt7+yLrfHhC9PPzY0KhkHXq1IkNGjSI2+9isZhduXKlyH4v3KdfffUV93kNGDCA2djYMADM2tqaJSUlsUGDBjF1dXXWrVs31q9fP2ZqasoAsKZNmzKJRFJl+600x0BxX9j9+/czAMzY2Ji9fPmSK8/Ly2OfffYZA8C++OKLIvuoJL/++ivj8/kMAHN1dWVDhw5lrVu35uJcsGBBkffp7e3NADAzMzOZ95mQkFDq7W7ZsoXb/4UKP+uS/shNnjyZAWB8Pp+1b9+eDRkyhDVu3JgJBAI2a9asYhOU8nyvSlKYxJqYmLC2bduywYMHMy8vL2ZkZMQAMEdHR/b27dsi6xV+b7799lvG4/GYm5sbGzJkCPPw8OD2+erVq4usFxsby+rVq8cAMAMDA9a/f3/Wt29fpq+vzxwcHFjv3r0rNSksPK9qa2vLJNAlJYWFSZmrqyvr3bs369+/P2vUqBF3bv3777+LrJOfn8969uzJADCRSMS8vLzY4MGDWd26dZmmpiZ3fi3uj1Xfvn1Z06ZNmb6+PuvVqxfr06cP9123tbVlKSkpRbZZlu9AYYydO3fmznHdu3dnQ4cOZV26dOE+38KkKyQkhPn5+TEtLS0GgPn6+sp8Z+7fv88YKzkpfPHiBbffNDU1WdeuXdmQIUPYZ599xvT09Er1+THGmEQiYZqamgwAGzt2LMvPzy/VeoXvoyLH+1dffcV4PB
5r06aNzHldX1+fPX78mM2ePVvm/G1tbc0AMEtLyyJJRuEx5+npyTw8PJimpibr0aMHl9QDYObm5uzhw4dF4ikuKUx
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Contingency table: number of sampled patients per (age group, diagnosis) pair.\n",
"# The variable keeps its historical name, but it holds counts, not correlations.\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Annotated heatmap of the counts (fmt='d' renders the integer cell values)\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(corr_matrix_age_diag, annot=True, cmap='coolwarm', fmt='d', ax=ax)\n",
"ax.set_title('Contingency Table of Age and Diagnostic Sample Groups', fontsize=16)\n",
"ax.set_xlabel('Diagnostic Group')\n",
"ax.set_ylabel('Age Group')\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-06-26 18:19:34 +02:00
"Chi-Square Statistic: 1043.5644539016944\n",
"P-value: 4.935370162055676e-205\n",
"Chi-Square Statistic for SB in 60-70 vs others: 32.94855579340837\n",
"P-value for SB in 60-70 vs others: 9.463001659861763e-09\n"
2024-06-05 09:53:25 +02:00
]
}
],
"source": [
"# Contingency table of patient counts per (age group, diagnosis) pair\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test of independence over the full age-group x diagnosis table\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check whether SB (sinus bradycardia) is significantly more frequent in the 60-70 age group.\n",
"# Split the table once into the (60, 70] row and all remaining age groups.\n",
"age_60_70 = pd.Interval(60, 70, closed='right')\n",
"counts_60_70 = corr_matrix_age_diag.loc[age_60_70]\n",
"counts_other = corr_matrix_age_diag.drop(age_60_70)\n",
"\n",
"sb_60_70 = counts_60_70['SB']\n",
"sb_other = counts_other['SB'].sum()\n",
"total_60_70 = counts_60_70.sum()\n",
"total_other = counts_other.sum().sum()\n",
"\n",
"# 2x2 table: rows = 60-70 vs. other ages, columns = SB vs. any other diagnosis.\n",
"# chi2_contingency applies Yates' continuity correction to 2x2 tables by default.\n",
"observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n",
"chi2_sb, p_sb = chi2_contingency(observed)[:2]\n",
"\n",
"print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n",
"print(f\"P-value for SB in 60-70 vs others: {p_sb}\")\n",
"\n",
"# The chi-square test is not directional, so also report the observed shares to show\n",
"# which group actually has the higher SB frequency.\n",
"print(f\"Share of SB in 60-70: {sb_60_70 / total_60_70:.3f}, in other age groups: {sb_other / total_other:.3f}\")"
]
},
2024-06-26 13:17:02 +02:00
{
"cell_type": "code",
2024-06-26 18:19:34 +02:00
"execution_count": 13,
2024-06-26 13:17:02 +02:00
"metadata": {},
2024-06-26 18:19:34 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chi-Square Statistic: 1043.5644539016944\n",
"P-value: 4.935370162055676e-205\n",
"Chi-Square Statistic for AFIB in 70-80 vs others: 120.60329273774582\n",
"P-value for AFIB in 70-80 vs others: 4.667227334873944e-28\n"
]
}
],
2024-06-26 13:17:02 +02:00
"source": [
"# Bin ages into right-closed decades (repeated here so the cell is self-contained)\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"# Contingency table of patient counts per (age group, diagnosis) pair\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test of independence over the full age-group x diagnosis table\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check whether AFIB (atrial fibrillation / atrial flutter) is significantly more frequent\n",
"# in the 70-80 age group. Split the table once into the (70, 80] row and the remaining groups.\n",
"age_70_80 = pd.Interval(70, 80, closed='right')\n",
"counts_70_80 = corr_matrix_age_diag.loc[age_70_80]\n",
"counts_other_70_80 = corr_matrix_age_diag.drop(age_70_80)\n",
"\n",
"afib_70_80 = counts_70_80['AFIB']\n",
"afib_other = counts_other_70_80['AFIB'].sum()\n",
"total_70_80 = counts_70_80.sum()\n",
"total_other_70_80 = counts_other_70_80.sum().sum()\n",
"\n",
"# 2x2 table: rows = 70-80 vs. other ages, columns = AFIB vs. any other diagnosis.\n",
"# chi2_contingency applies Yates' continuity correction to 2x2 tables by default.\n",
"observed = [[afib_70_80, total_70_80 - afib_70_80], [afib_other, total_other_70_80 - afib_other]]\n",
"chi2_afib, p_afib = chi2_contingency(observed)[:2]\n",
"\n",
"print(f\"Chi-Square Statistic for AFIB in 70-80 vs others: {chi2_afib}\")\n",
"print(f\"P-value for AFIB in 70-80 vs others: {p_afib}\")\n",
"\n",
"# The chi-square test is not directional, so also report the observed shares to show\n",
"# which group actually has the higher AFIB frequency.\n",
"print(f\"Share of AFIB in 70-80: {afib_70_80 / total_70_80:.3f}, in other age groups: {afib_other / total_other_70_80:.3f}\")"
]
},
2024-06-05 09:53:25 +02:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results can be interpreted as follows:\n",
"\n",
"- The first value returned is the Chi-Square Statistic, which measures the difference between the observed and the expected frequencies. Here, a bigger number indicates a bigger difference. The p-value is the probability of observing a difference at least this large if age group and diagnosis were independent of each other. If the p-value is below the significance level of 0.05, the difference is considered statistically significant.\n",
"\n",
"- The Chi-Square Statistic for sinus bradycardia in the age group 60-70 compared to the other age groups shows how strongly the frequency of sinus bradycardia in the age group 60-70 differs from its frequency in the other age groups. If the p-value is smaller than the significance level of 0.05, the difference in the frequency between the age group 60-70 and the other age groups is significant. The test for atrial fibrillation/atrial flutter in the age group 70-80 is read in the same way."
]
2024-06-28 13:09:46 +02:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The significant appearance of sinus bradycardia in the age group 60-70 could be caused by multiple factors. \n",
"In this case, the physiological age could be a major factor. The sinus node continuously generates electrical impulses, thus setting the normal rhythm and rate in a healthy heart. With increasing age, the sinus node becomes less responsive, which leads to a slower heart rate of 60 bpm or less.\n",
"Another reason could be increased medication use, which is more common at an older age. Sinus bradycardia can appear as a side effect of such medication.\n",
"<br>(source: https://doi.org/10.1253/jcj.57.760, last visit: 10.06.2024)\n",
"<br>(source: https://doi.org/10.7861%2Fclinmed.2022-0431, last visit: 10.06.2024)<br>\n",
"<br>\n",
"But why does sinus bradycardia appear more frequently in the age group 60-70 than in the older age groups?<br>\n",
"The lower number of sinus bradycardia cases in older age groups could be due to the increasing mortality with higher ages. People with sinus bradycardia might not reach older ages because of comorbidities and further complications.\n",
"Besides that, older people are more likely to receive medical support such as medication and pacemakers which can prevent sinus bradycardia or at least lower its effect.\n",
"The higher frequency of older people in the database may lead to a slight bias in the distribution. See also [Demographic Bias](#demographic-bias).<br>\n",
"The sample size in the study conducted may also play a role in the significance of the frequency.\n",
"\n",
"The significant appearance of atrial fibrillation/atrial flutter in the age group 70-80 could be caused by multiple factors.\n",
"Physiological age is the main reason. With increasing age, various age-related changes in the cardiovascular system occur. Older people are more likely to have hypertension. The increased pressure can lead to thickening of the heart walls and structural changes, potentially leading to AFIB. Chronic inflammation, which is more prevalent in older people, can damage heart tissue and lead to atrial issues. Changes in hormone levels with age can also influence heart function and contribute to the development of arrhythmias. Older adults are also more likely to have comorbidities like diabetes, obesity or chronic kidney disease.\n",
"<br>(source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5460064/, last visit: 28.06.2024)"
]
2024-06-05 09:53:25 +02:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-06-28 13:09:46 +02:00
"version": "3.11.9"
2024-06-05 09:53:25 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}