{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Hypothesis\n", "This notebook is used to read the data from the pickle files and to test the hypothesis that in the age group of 60-70 the frequency of a sinus bradycardia is significantly higher than in the other age groups.\n", "For that instance the chi-squared test is used." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import pickle\n", "import sys\n", "\n", "\n", "from scipy.stats import chi2_contingency\n", "sys.path.append('../scripts')\n", "import data_helper\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading GSVT\n", "Reading AFIB\n", "Reading SR\n", "Reading SB\n", "Number of patients per category:\n", "GSVT: 0\n", "AFIB: 27\n", "SR: 13\n", "SB: 50\n" ] } ], "source": [ "data = data_helper.load_data(only_demographic=False)\n", "\n", "print(\"Number of patients per category:\")\n", "for cat_name in data.keys():\n", " print(f\"{cat_name}: {len(data[cat_name])}\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading GSVT\n", "Reading AFIB\n", "Reading SR\n", "Reading SB\n" ] }, { "ename": "ValueError", "evalue": "All arrays must be of the same length", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[27], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m data_org \u001b[38;5;241m=\u001b[39m data_helper\u001b[38;5;241m.\u001b[39mload_data(only_demographic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m----> 3\u001b[0m df_dgc \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_org\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\pandas\\core\\frame.py:767\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 761\u001b[0m mgr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_mgr(\n\u001b[0;32m 762\u001b[0m data, axes\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex\u001b[39m\u001b[38;5;124m\"\u001b[39m: index, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumns\u001b[39m\u001b[38;5;124m\"\u001b[39m: columns}, dtype\u001b[38;5;241m=\u001b[39mdtype, copy\u001b[38;5;241m=\u001b[39mcopy\n\u001b[0;32m 763\u001b[0m )\n\u001b[0;32m 765\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m 766\u001b[0m \u001b[38;5;66;03m# GH#38939 de facto copy defaults to False only in non-dict cases\u001b[39;00m\n\u001b[1;32m--> 767\u001b[0m mgr \u001b[38;5;241m=\u001b[39m \u001b[43mdict_to_mgr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmanager\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 768\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, ma\u001b[38;5;241m.\u001b[39mMaskedArray):\n\u001b[0;32m 769\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mma\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mrecords\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\pandas\\core\\internals\\construction.py:503\u001b[0m, in \u001b[0;36mdict_to_mgr\u001b[1;34m(data, index, columns, dtype, typ, copy)\u001b[0m\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 500\u001b[0m \u001b[38;5;66;03m# dtype check to exclude e.g. range objects, scalars\u001b[39;00m\n\u001b[0;32m 501\u001b[0m arrays \u001b[38;5;241m=\u001b[39m [x\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(x, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m x \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m arrays]\n\u001b[1;32m--> 503\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43marrays_to_mgr\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrays\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconsolidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\pandas\\core\\internals\\construction.py:114\u001b[0m, in \u001b[0;36marrays_to_mgr\u001b[1;34m(arrays, columns, index, dtype, verify_integrity, typ, consolidate)\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verify_integrity:\n\u001b[0;32m 112\u001b[0m \u001b[38;5;66;03m# figure out the index, if necessary\u001b[39;00m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m index \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 114\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[43m_extract_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrays\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 116\u001b[0m index \u001b[38;5;241m=\u001b[39m ensure_index(index)\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\pandas\\core\\internals\\construction.py:677\u001b[0m, in \u001b[0;36m_extract_index\u001b[1;34m(data)\u001b[0m\n\u001b[0;32m 675\u001b[0m lengths \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(raw_lengths))\n\u001b[0;32m 676\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(lengths) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 677\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAll arrays must be of the same length\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m have_dicts:\n\u001b[0;32m 680\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 681\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMixing dicts with non-Series may lead to ambiguous ordering.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 682\u001b[0m )\n", "\u001b[1;31mValueError\u001b[0m: All arrays must be of the same length" ] } ], "source": [ "data_org = data_helper.load_data(only_demographic=False)\n", "\n", "df_dgc = pd.DataFrame(data_org)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'df_dgc' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[21], line 36\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# #path = \"C:/Studium/dsa/data\"\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# #path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# path = \"C:/Users/klara/projects/DSA/data\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 33\u001b[0m \n\u001b[0;32m 34\u001b[0m \u001b[38;5;66;03m# Change from group to category\u001b[39;00m\n\u001b[0;32m 35\u001b[0m age_categories \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m10\u001b[39m, \u001b[38;5;241m20\u001b[39m, \u001b[38;5;241m30\u001b[39m, \u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m50\u001b[39m, \u001b[38;5;241m60\u001b[39m, \u001b[38;5;241m70\u001b[39m, \u001b[38;5;241m80\u001b[39m, \u001b[38;5;241m90\u001b[39m]\n\u001b[1;32m---> 36\u001b[0m df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage_group\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mcut(\u001b[43mdf_dgc\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m'\u001b[39m], bins\u001b[38;5;241m=\u001b[39mage_categories)\n\u001b[0;32m 37\u001b[0m corr_matrix_age_diag\u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mcrosstab(df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mage_group\u001b[39m\u001b[38;5;124m'\u001b[39m], df_dgc[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdiag\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 39\u001b[0m \u001b[38;5;66;03m# Chi-square test\u001b[39;00m\n", "\u001b[1;31mNameError\u001b[0m: name 'df_dgc' is not defined" ] } ], "source": [ "# #path = \"C:/Studium/dsa/data\"\n", "# #path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n", "# path = \"C:/Users/klara/projects/DSA/data\"\n", "\n", "# categories_dict = {\n", "# 'SB': [426177001],\n", "# 'AFIB': [164889003, 164890007],\n", "# 'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],\n", "# 'SR': [426783006, 427393009]\n", "# }\n", "\n", "# data = {}\n", "# for cat_name in categories_dict.keys():\n", "# print(f\"Reading {cat_name}\")\n", "# with open(f'{path}/{cat_name}.pkl', 'rb') as f:\n", "# records = pickle.load(f)\n", "# data[cat_name] = records\n", "# print(f\"Length of {cat_name}: {len(records)}\")\n", "\n", "# data_demographic = {'age':[], 'diag':[], 'gender':[]}\n", "# for cat_name, records in data.items():\n", "# for record in records:\n", "# age = record.comments[0].split(' ')[1]\n", "# sex = record.comments[1].split(' ')[1]\n", "# if age == 'NaN' or sex == 'NaN':\n", "# continue\n", "# # cut Age: from alter string \n", "# data_demographic['age'].append(int(age))\n", "# data_demographic['diag'].append(cat_name)\n", "# data_demographic['gender'].append(sex)\n", "\n", "# df_dgc = pd.DataFrame(data_demographic)\n", "\n", "# Change from group to category\n", "age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n", "df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n", "corr_matrix_age_diag= pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n", "\n", "# Chi-square test\n", "chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n", "\n", "# Difference between observed and expected frequencies\n", "print(f\"Chi-Square Statistic: {chi2}\")\n", "print(f\"P-value: {p}\")\n", "\n", "# Check if SB (Sinusbradykardie) has a significantly higher frequency in the 60-70 age group\n", "sb_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right'), 'SB']\n", "sb_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum()['SB']\n", "total_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right')].sum()\n", "total_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum().sum()\n", "\n", "# Frequency table for the specific Chi-Square test\n", "observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n", "chi2_sb, p_sb = chi2_contingency(observed)[:2]\n", "\n", "\n", "print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n", "print(f\"P-value for SB in 60-70 vs others: {p_sb}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results can be interpreted as followed:\n", "\n", "- The first value returned is the Chi-Square Statistic that shows the difference between the observed and the expected frequencies. Here, a bigger number indicates a bigger difference. The p-value shows the probability of this difference being statistically significant. If the p-value is below the significance level of 0.05, the difference is significant.\n", "\n", "- The Chi-Square Statistic for sinus bradycardia in the age group 60-70 compared to the other age groups, is a value that shows whether there is a significant difference in the frequency of sinus bradycardia in the age group 60-70 in comparison to the other age groups. If the p-value is smaller than the significance level of 0.05, the difference in the frequency between the age group 60-70 and the other age groups is significant." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }