2024-06-05 09:53:25 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hypothesis\n",
"This notebook is used to read the data from the pickle files and to test the hypothesis that in the age group of 60-70 the frequency of a sinus bradycardia is significantly higher than in the other age groups.\n",
"For that instance the chi-squared test is used."
]
},
{
"cell_type": "code",
2024-06-08 18:06:08 +02:00
"execution_count": 2,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pickle\n",
2024-06-05 16:47:14 +02:00
"import sys\n",
"\n",
"\n",
2024-06-05 09:53:25 +02:00
"from scipy.stats import chi2_contingency\n",
2024-06-05 16:47:14 +02:00
"sys.path.append('../scripts')\n",
"import data_helper\n"
2024-06-05 09:53:25 +02:00
]
},
{
"cell_type": "code",
2024-06-08 18:06:08 +02:00
"execution_count": 3,
2024-06-05 09:53:25 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-06-08 18:06:08 +02:00
"Reading GSVT\n"
]
},
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'C:/Studium/dsa/data/GSVT.pkl'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata_helper\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43monly_demographic\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNumber of patients per category:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cat_name \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mkeys():\n",
"File \u001b[1;32mc:\\Users\\klara\\projects\\DSA\\DSA_SS24\\notebooks\\../scripts\\data_helper.py:37\u001b[0m, in \u001b[0;36mload_data\u001b[1;34m(only_demographic, path_settings)\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cat_name \u001b[38;5;129;01min\u001b[39;00m labels\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m 36\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReading \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcat_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 37\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mpath_data\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mcat_name\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.pkl\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m 38\u001b[0m records \u001b[38;5;241m=\u001b[39m pickle\u001b[38;5;241m.\u001b[39mload(f)\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m only_demographic:\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:/Studium/dsa/data/GSVT.pkl'"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-08 18:06:08 +02:00
"data = data_helper.load_data(only_demographic=True)\n",
2024-06-05 16:47:14 +02:00
"\n",
"print(\"Number of patients per category:\")\n",
"for cat_name in data.keys():\n",
" print(f\"{cat_name}: {len(data[cat_name])}\")"
]
},
{
"cell_type": "code",
2024-06-08 18:06:08 +02:00
"execution_count": 1,
2024-06-05 16:47:14 +02:00
"metadata": {},
"outputs": [
{
2024-06-08 18:06:08 +02:00
"ename": "NameError",
"evalue": "name 'data_helper' is not defined",
2024-06-05 16:47:14 +02:00
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
2024-06-08 18:06:08 +02:00
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data_org \u001b[38;5;241m=\u001b[39m \u001b[43mdata_helper\u001b[49m\u001b[38;5;241m.\u001b[39mload_data(only_demographic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 3\u001b[0m df_dgc \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data_org)\n",
"\u001b[1;31mNameError\u001b[0m: name 'data_helper' is not defined"
2024-06-05 16:47:14 +02:00
]
}
],
"source": [
2024-06-08 18:06:08 +02:00
"data_org = data_helper.load_data(only_demographic=True)\n",
2024-06-05 16:47:14 +02:00
"\n",
"df_dgc = pd.DataFrame(data_org)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df_dgc' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[21], line 36\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# #path = \"C:/Studium/dsa/data\"\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# #path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# path = \"C:/Users/klara/projects/DSA/data\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 33\u001b[0m \n\u001b[0;32m 34\u001b[0m \u001b[38;5;66;03m# Change from group to category\u001b[39;00m\n\u001b[0;32m 35\u001b[0m age_categories \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m10\u001b[39m, \u001b[38;5;241m20\u001b[39m, \u001b[38;5;241m30\u001b[39m, \u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m50\u001b[39m, \u001b[38;5;241m60\u001b[39m, \u001b[38;5;241m70\u001b[39m, \u001b[38;5;241m80\u001b[39m, \u001b[38;5;241m90\u001b[39m]\n\u001b[1;32m---> 36\u001b[0m df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage_group\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mcut(\u001b[43mdf_dgc\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m'\u001b[39m], bins\u001b[38;5;241m=\u001b[39mage_categories)\n\u001b[0;32m 37\u001b[0m corr_matrix_age_diag\u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mcrosstab(df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage_group\u001b[39m\u001b[38;5;124m'\u001b[39m], df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdiag\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 39\u001b[0m \u001b[38;5;66;03m# Chi-square test\u001b[39;00m\n",
"\u001b[1;31mNameError\u001b[0m: name 'df_dgc' is not defined"
2024-06-05 09:53:25 +02:00
]
}
],
"source": [
2024-06-05 16:47:14 +02:00
"# #path = \"C:/Studium/dsa/data\"\n",
"# #path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n",
"# path = \"C:/Users/klara/projects/DSA/data\"\n",
2024-06-05 09:53:25 +02:00
"\n",
2024-06-05 16:47:14 +02:00
"# categories_dict = {\n",
"# 'SB': [426177001],\n",
"# 'AFIB': [164889003, 164890007],\n",
"# 'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],\n",
"# 'SR': [426783006, 427393009]\n",
"# }\n",
2024-06-05 09:53:25 +02:00
"\n",
2024-06-05 16:47:14 +02:00
"# data = {}\n",
"# for cat_name in categories_dict.keys():\n",
"# print(f\"Reading {cat_name}\")\n",
"# with open(f'{path}/{cat_name}.pkl', 'rb') as f:\n",
"# records = pickle.load(f)\n",
"# data[cat_name] = records\n",
"# print(f\"Length of {cat_name}: {len(records)}\")\n",
2024-06-05 09:53:25 +02:00
"\n",
2024-06-05 16:47:14 +02:00
"# data_demographic = {'age':[], 'diag':[], 'gender':[]}\n",
"# for cat_name, records in data.items():\n",
"# for record in records:\n",
"# age = record.comments[0].split(' ')[1]\n",
"# sex = record.comments[1].split(' ')[1]\n",
"# if age == 'NaN' or sex == 'NaN':\n",
"# continue\n",
"# # cut Age: from alter string \n",
"# data_demographic['age'].append(int(age))\n",
"# data_demographic['diag'].append(cat_name)\n",
"# data_demographic['gender'].append(sex)\n",
2024-06-05 09:53:25 +02:00
"\n",
2024-06-05 16:47:14 +02:00
"# df_dgc = pd.DataFrame(data_demographic)\n",
2024-06-05 09:53:25 +02:00
"\n",
"# Change from group to category\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"corr_matrix_age_diag= pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"# Difference between observed and expected frequencies\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check if SB (Sinusbradykardie) has a significantly higher frequency in the 60-70 age group\n",
"sb_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right'), 'SB']\n",
"sb_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum()['SB']\n",
"total_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right')].sum()\n",
"total_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum().sum()\n",
"\n",
"# Frequency table for the specific Chi-Square test\n",
"observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n",
"chi2_sb, p_sb = chi2_contingency(observed)[:2]\n",
"\n",
"\n",
"print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n",
"print(f\"P-value for SB in 60-70 vs others: {p_sb}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results can be interpreted as followed:\n",
"\n",
"- The first value returned is the Chi-Square Statistic that shows the difference between the observed and the expected frequencies. Here, a bigger number indicates a bigger difference. The p-value shows the probability of this difference being statistically significant. If the p-value is below the significance level of 0.05, the difference is significant.\n",
"\n",
"- The Chi-Square Statistic for sinus bradycardia in the age group 60-70 compared to the other age groups, is a value that shows whether there is a significant difference in the frequency of sinus bradycardia in the age group 60-70 in comparison to the other age groups. If the p-value is smaller than the significance level of 0.05, the difference in the frequency between the age group 60-70 and the other age groups is significant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}