2024-06-05 09:53:25 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hypothesis\n",
"This notebook reads the data from the pickle files and tests the hypothesis that sinus bradycardia occurs significantly more frequently in the 60-70 age group than in the other age groups.\n",
"The chi-squared test is used for this purpose."
]
},
{
"cell_type": "code",
2024-06-24 17:57:42 +02:00
"execution_count": 1,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pickle\n",
2024-06-05 16:47:14 +02:00
"import sys\n",
"\n",
"\n",
2024-06-05 09:53:25 +02:00
"from scipy.stats import chi2_contingency\n",
2024-06-05 16:47:14 +02:00
"sys.path.append('../scripts')\n",
"import data_helper\n"
2024-06-05 09:53:25 +02:00
]
},
{
"cell_type": "code",
2024-06-24 17:57:42 +02:00
"execution_count": 2,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-06-24 17:57:42 +02:00
"Reading GSVT\n",
"Reading AFIB\n",
"Reading SR\n",
"Reading SB\n",
"Number of patients per category:\n",
"age: 37011\n",
"diag: 37011\n",
"gender: 37011\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-08 18:06:08 +02:00
"data = data_helper.load_data(only_demographic=True)\n",
2024-06-05 16:47:14 +02:00
"\n",
"print(\"Number of patients per category:\")\n",
"for cat_name in data.keys():\n",
2024-06-24 17:57:42 +02:00
" print(f\"{cat_name}: {len(data[cat_name])}\")\n",
"\n",
"df_dgc = pd.DataFrame(data)"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
2024-06-24 17:57:42 +02:00
"execution_count": 15,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
2024-06-24 17:57:42 +02:00
"name": "stdout",
"output_type": "stream",
"text": [
"Number of patients in a diagnosis category: SB 15826\n",
"SR 10426\n",
"AFIB 9756\n",
"GSVT 1003\n",
"Name: diag, dtype: int64\n",
"Min number of patients in a diagnosis category: 1003\n",
"unique values in the diagnosis category: ['GSVT' 'AFIB' 'SR' 'SB']\n",
"GSVT 1003\n",
"AFIB 1003\n",
"SR 1003\n",
"SB 1003\n",
"Name: diag, dtype: int64\n"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-24 17:57:42 +02:00
"# Fixed seed so the balanced subsample (and every statistic derived from it) is reproducible\n",
"RANDOM_SEED = 42\n",
"\n",
"# get number of patients in a diagnosis category\n",
"num_patients = df_dgc['diag'].value_counts()\n",
"print(f\"Number of patients in a diagnosis category: {num_patients}\")\n",
"# get min number of patients in a diagnosis category (reuses the counts computed above)\n",
"min_num_patients = num_patients.min()\n",
"print(f\"Min number of patients in a diagnosis category: {min_num_patients}\")\n",
"\n",
"# get the unique values of the diagnosis category\n",
"unique_vals = df_dgc['diag'].unique()\n",
"print(f\"unique values in the diagnosis category: {unique_vals}\")\n",
"\n",
"# Balance the classes: draw min_num_patients rows per diagnosis category and concatenate once,\n",
"# instead of growing a DataFrame inside the loop\n",
"sampled_data = pd.concat(\n",
"    [\n",
"        df_dgc[df_dgc['diag'] == val].sample(min_num_patients, random_state=RANDOM_SEED)\n",
"        for val in unique_vals\n",
"    ]\n",
")\n",
"\n",
2024-06-05 16:47:14 +02:00
"\n",
2024-06-24 17:57:42 +02:00
"print(sampled_data['diag'].value_counts())\n",
"\n",
"df_dgc = sampled_data"
2024-06-05 16:47:14 +02:00
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
2024-06-24 17:57:42 +02:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoUAAAHJCAYAAADzW0NeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAADEc0lEQVR4nOzdd1QUVxsH4N8usEtdll6UJlhAEREL2CuIWMFuFLsSNZZ8FozGFsWosbdoNBpLEhOjiYpRLFgiUaMiCooVG1V6Xcre7w/C6IYFARd2kfc5Z86BmTsz78zOzL5778wdHmOMgRBCCCGE1Gl8ZQdACCGEEEKUj5JCQgghhBBCSSEhhBBCCKGkkBBCCCGEgJJCQgghhBACSgoJIYQQQggoKSSEEEIIIaCkkBBCCCGEgJJCQgghhBCCD0wKQ0JCMHbsWDRq1AgikQhCoRAWFhbo2bMn1q9fj6SkJEXFqTRLliwBj8fDkiVLamydtra24PF4iImJqbF11hVjxowBj8fD3r17lR1KmV6/fo1Ro0bB0tIS6urq4PF4GDNmTKWXs27dOvB4PPB4PGzevFnxgdYRVTlm9u7dy+37kkEgEMDY2BhOTk4YMWIEdu7ciYyMjDKXERoaCh6Phy5dunz4RtRhMTEx4PF4sLW1rfZ1SSQSbNq0CZ06dYKhoSE0NDRgbGwMR0dHDBkyBBs3bvwovhflKTnOaxJjDL///jtGjRqFhg0bQiQSQSAQwMTEBB06dMCcOXPw999/12hMtV2VksI3b96gZ8+e8PT0xN69e1FQUICuXbvCz88Pjo6OuHr1KmbPno0GDRrg2rVrio65VqsNSYkqqiv7jTEGX19fHDhwAAYGBhg6dCj8/f3RoUOHSi9r9+7d3N979uxRZJikgnR0dODv7w9/f38MGzYM7du3h5qaGn7++WdMnjwZlpaW2LRpE+hto1WnKj+iExIS0Lp1a8yYMQNXr15FkyZN4Ovri65du0IgEODIkSOYOXMmfScqyLNnz9C6dWsMGDAABw4cgFQqRdeuXTF48GC0atUKT548wdq1a+Hh4QFfX19lh1trqFd2hvT0dHTo0AHR0dFo0qQJdu7ciY4dO8qUkUgk2LdvHxYvXoy4uDiFBVtXnDt3DgUFBahXr56yQ/noBAUFYf78+bCwsFB2KHI9f/4c169fh7W1Ne7cuQN19UqfogCAv//+G1FRURCLxSgoKEB4eDhu3bqFli1bKjhiUh5jY2O5P2Ti4uKwevVqbNy4ETNmzMCrV6+wevVqmTJt2rTB/fv3oa2tXUPRfpzq1auH+/fvQ0NDo1rXM23aNNy9exdNmzbFyZMnYWNjIzM9MTERP/74I8zMzKo1jrrg+fPncHd3R2JiIjw8PLBlyxa517a///4ba9asQVRUlBKirJ0q/Y0zffp0REdHw9bWFn/99RcMDQ1LlREKhZg0aRL69++PtLQ0RcRZp9jb2ys7hI+WhYWFyiaEAPDixQsAgJ2dXZUTQuBtLeHw4cORm5uLvXv3Yvfu3ZQUqggLCwusX78eDRs2xNSpU7FmzRr07dtX5ge2trY2mjRposQoPw4aGhrVvh/z8vLw+++/Ayi+beO/CSEAmJqaYsaMGdUaR13xySefcAnhhQsXIBQK5ZZzd3fHkSNHcP369RqOsBZjlfDkyROmpqbGALDffvutMrPK+PHHH1m3bt2YgYEBEwgEzNramo0dO5ZFR0fLLW9jY8MAsGfPnrFjx46xrl27MgMDAwaAXbhwgbHithdWsjl79uxh7u7uTCQScfOVeP36NZs1axZr0qQJ09LSYrq6uqxVq1Zs8+bNrKCgoNS6Fy9ezACwxYsXy4zPz89n+/fvZyNGjGCNGzdmenp6TFNTkzVq1IhNnz6dvX79Wqb8s2fPuBjlDe8u/93t/a/s7GwWFBTEXF1dma6uLtPS0mJOTk7siy++YCkpKaXKl6zXxsaGSaVS9u2337KWLVsybW1tJh
KJWM+ePdnVq1fl7vd39+n+/ftZ69atmY6ODjM2NmbDhg1jz58/Z4wxJpVK2ebNm5mLiwvT1tZmRkZGzN/fnyUkJJRaZnXut4ocA/7+/gwA+/7777n5njx5wvT19RmPx2PBwcGlYn79+jUzMTFhANhPP/0kd1+V5dq1a2zw4MHMwsKCaWhoMBMTE9anTx925syZSm2nvGOhLFlZWUxPT48BYP/88w+7fPkyA8DEYjHLzc0tcz6pVMp2797N3NzcmJaWFjM0NGS9evVif/31F7tw4QIDwDp37ix33sqeV+XJyMhgO3fuZAMHDmQODg5MW1ubaWtrs2bNmrEFCxaw1NRUufO9e96cP3+e9ezZk4nFYqapqclcXV3Zvn37ylxncnIymzFjBrO2tmYCgYBZWVmxqVOnsuTkZLnHzPt8//333Hn3Pq1bt2YAWJ8+fWTGl7fPQ0JC2LRp05iLiwszMjJiAoGA1atXjw0ZMoRdv369zHUVFBSwtWvXsqZNmzKhUMhMTEzYoEGDWGRkJBezv7+/3G3x9/dnWVlZbP78+cze3p4JBAJmZmbGRo8ezV69elXmOit6DpTIy8tjq1evZi1btmS6urpMQ0ODmZmZsVatWrE5c+aw5ORkmbjKGkq+G969BsqTnZ3N1q9fz9q3b8/EYjH3ndSnTx928ODBMrfrXa9fv+bWe/v27QrNU0IRx3twcDDr3LkzE4lETCwWMx8fHxYREcGVPXjwIHN3d2e6urpMX1+fDRw4kD1+/LjUMt895rKzs1lgYCCzt7dnQqGQWVhYsHHjxpX5Wb97/f2vgoICtmvXLta5c2fue9/W1pZNmTKFvXjxolL7qyRGAOzOnTuVmvddFckrGGPs/v37bMyYMdy1wcDAgHXr1o39/PPPcpf7vutFRc6zN2/esE8//ZRZWVlxx+PMmTPlfsczVnw96NOnDzM1NWXq6upMLBYzBwcHNnLkSHbx4sVK7ZdKJYUbN27kvlwKCwsrtSLGir90Ro8ezQAwdXV11q1bNzZs2DDWqFEjBoBpa2uzU6dOlZqv5MObNm0aA8BatWrFhg8fzjp37swuXbpUvCH/HiTTpk1jfD6fdejQgQ0fPpy1bduWxcTEMMYYu3jxIveh29rasn79+jEvLy9unKenJ8vPz5dZd1lJ4cuXLxkApq+vz9zd3dngwYNZ7969maWlJQPATExM2KNHj7jySUlJzN/fn9nb2zMArH379szf358bjh49Wmp7/5sIJCcnsxYtWjAATCQSsX79+jE/Pz9mbGzMADA7O7tS87x7QfT392caGhqsW7dubMiQIdx+FwqF7O+//y6130v26fz587nPa9CgQcza2poBYFZWViwlJYUNGTKEaWpqsl69erGBAwcyU1NTBoA1b96cSSSSGttvFTkGyjphjxw5wgAwY2Nj9vLlS258YWEh69ixIwPAPv3001L7qDw7d+5kfD6fAWCurq5s+PDhrF27dlycS5YsKbWdXl5eDAAzMzOT2c6kpKQKr3f37t3c/i9R8lmX9yUXEBDAADA+n886d+7Mhg0bxpo2bcrU1NTY559/XmaCUpXzqjwlSayJiQnr0KEDGzp0KPP09GRGRkYMAHNwcGBv3rwpNV/JebNo0SLG4/GYm5sbGzZsGHN3d+f2+fr160vNFx8fzxo2bMgAMAMDA+br68sGDBjAxGIxs7e3Z/369avWpLDkuqqrqyuTQJeXFJYkZa6urqxfv37M19eXOTk5cdfWX3/9tdQ8RUVFrE+fPgwAEwgEzNPTkw0dOpQ1aNCAaWtrc9fXsr6sBgwYwJo3b87EYjHr27cv69+/P3eu29jYsLS0tFLrrMw5UBJj9+7duWuct7c3Gz58OOvRowf3+ZYkXZcvX2b+/v5MR0eHAWB+fn4y58z9+/cZY+UnhS9evOD2m7a2NuvZsycbNmwY69ixI9PX16/Q58cYYxKJhGlrazMAbNy4cayoqKhC85Vsx4cc7/Pnz2c8Ho+1b9
9e5rouFovZ48eP2Zw5c2Su31ZWVgwAs7S0LJVklBxzHh4ezN3dnWlra7PevXtzST0AZm5uzh4+fFgqnrKSwoyMDNa
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bin the ages into 10-year groups here, so that this cell does not depend on a later cell\n",
"# having been executed first (the notebook must survive Restart Kernel -> Run All)\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"\n",
"# Contingency table: observed patient counts per age group and diagnosis\n",
"corr_matrix_age_diag= pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Plot the counts as an annotated heatmap\n",
"sns.heatmap(corr_matrix_age_diag, annot=True, cmap='coolwarm', fmt='d')\n",
"plt.title('Correlationmatrix of Age and Diagnostic Sample Groups', fontsize=16)\n",
"plt.xlabel('Diagnostic Group')\n",
"plt.ylabel('Age Group')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chi-Square Statistic: 1054.3796287658074\n",
"P-value: 2.4773868106437145e-207\n",
"Chi-Square Statistic for SB in 60-70 vs others: 49.305576225492736\n",
"P-value for SB in 60-70 vs others: 2.1903897342655923e-12\n"
2024-06-05 09:53:25 +02:00
]
}
],
"source": [
"# Bin ages into 10-year intervals and cross-tabulate them against the diagnosis\n",
"age_categories = list(range(0, 100, 10))\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"corr_matrix_age_diag = pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test over the full age-group x diagnosis table\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Split the table into the (60, 70] row and all remaining age groups\n",
"bin_60_70 = pd.Interval(60, 70, closed='right')\n",
"row_60_70 = corr_matrix_age_diag.loc[bin_60_70]\n",
"rows_other = corr_matrix_age_diag.drop(bin_60_70)\n",
"\n",
"# SB (sinus bradycardia) counts and overall patient totals for both partitions\n",
"sb_60_70 = row_60_70['SB']\n",
"total_60_70 = row_60_70.sum()\n",
"sb_other = rows_other['SB'].sum()\n",
"total_other = rows_other.sum().sum()\n",
"\n",
"# 2x2 frequency table: rows = (60-70, other ages), columns = (SB, not SB)\n",
"observed = [\n",
"    [sb_60_70, total_60_70 - sb_60_70],\n",
"    [sb_other, total_other - sb_other],\n",
"]\n",
"chi2_sb, p_sb = chi2_contingency(observed)[:2]\n",
"\n",
"print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n",
"print(f\"P-value for SB in 60-70 vs others: {p_sb}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results can be interpreted as follows:\n",
"\n",
"- The first value returned is the Chi-Square Statistic, which measures the difference between the observed and the expected frequencies; a larger value indicates a larger difference. The p-value is the probability of observing a difference at least this large if age group and diagnosis were independent of each other. If the p-value is below the significance level of 0.05, the difference is considered statistically significant.\n",
"\n",
"- The Chi-Square Statistic for sinus bradycardia in the age group 60-70 compared to the other age groups measures how strongly the frequency of sinus bradycardia in the age group 60-70 differs from its frequency in the other age groups. If the p-value is smaller than the significance level of 0.05, the difference in frequency between the age group 60-70 and the other age groups is significant. Note that the test only establishes that the frequencies differ; whether the frequency is higher in the age group 60-70 has to be read from the observed counts."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-06-24 17:57:42 +02:00
"version": "3.10.4"
2024-06-05 09:53:25 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}