From fedf321bc70ba4ec59757fb45b5be44c2c96b8bf Mon Sep 17 00:00:00 2001 From: klara Date: Wed, 5 Jun 2024 09:53:25 +0200 Subject: [PATCH] Erweiterungen --- notebooks/demographic_plots.ipynb | 5 +- notebooks/statistics.ipynb | 151 ++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 notebooks/statistics.ipynb diff --git a/notebooks/demographic_plots.ipynb b/notebooks/demographic_plots.ipynb index 1c90e68..4ac3068 100644 --- a/notebooks/demographic_plots.ipynb +++ b/notebooks/demographic_plots.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Demographic Plots\n", - "This Notebook is used to read the data from the pickle files and to create a dataframe with the demographic data.\n", + "This notebook is used to read the data from the pickle files and to create a dataframe with the demographic data.\n", "With this data we can create a plots to show the distribution of the demographic data." ] }, @@ -30,7 +30,8 @@ "source": [ "# Set path to data\n", "path = \"C:/Studium/dsa/data\"\n", - "#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"" + "#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n", + "#C:\\Users\\klara\\projects\\DSA\\a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0" ] }, { diff --git a/notebooks/statistics.ipynb b/notebooks/statistics.ipynb new file mode 100644 index 0000000..ec564cb --- /dev/null +++ b/notebooks/statistics.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hypothesis\n", + "This notebook is used to read the data from the pickle files and to test the hypothesis that in the age group of 60-70 the frequency of sinus bradycardia is significantly higher than in the other age groups.\n", + "For this purpose, the chi-squared test is used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import pickle\n", + "from scipy.stats import chi2_contingency\n", + "from data_helper import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading SB\n", + "Length of SB: 50\n", + "Reading AFIB\n", + "Length of AFIB: 27\n", + "Reading GSVT\n", + "Length of GSVT: 0\n", + "Reading SR\n", + "Length of SR: 13\n", + "Chi-Square Statistic: 38.266574797751275\n", + "P-value: 0.0004730210823940083\n", + "Chi-Square Statistic for SB in 60-70 vs others: 1.4858035714285718\n", + "P-value for SB in 60-70 vs others: 0.22286870264719977\n" + ] + } + ], + "source": [ + "#path = \"C:/Studium/dsa/data\"\n", + "#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n", + "path = \"C:/Users/klara/projects/DSA/data\"\n", + "\n", + "categories_dict = {\n", + "'SB': [426177001],\n", + "'AFIB': [164889003, 164890007],\n", + "'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],\n", + "'SR': [426783006, 427393009]\n", + "}\n", + "\n", + "data = {}\n", + "for cat_name in categories_dict.keys():\n", + " print(f\"Reading {cat_name}\")\n", + " with open(f'{path}/{cat_name}.pkl', 'rb') as f:\n", + " records = pickle.load(f)\n", + " data[cat_name] = records\n", + " print(f\"Length of {cat_name}: {len(records)}\")\n", + "\n", + "data_demographic = {'age':[], 'diag':[], 'gender':[]}\n", + "for cat_name, records in data.items():\n", + " for record in records:\n", + " age = record.comments[0].split(' ')[1]\n", + " sex = record.comments[1].split(' ')[1]\n", + " if age == 'NaN' or sex == 'NaN':\n", + " continue\n", + " # cut Age: from alter string \n", + " data_demographic['age'].append(int(age))\n", + " 
 data_demographic['diag'].append(cat_name)\n", + " data_demographic['gender'].append(sex)\n", + "\n", + "df_dgc = pd.DataFrame(data_demographic)\n", + "\n", + "# Change from group to category\n", + "age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n", + "df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n", + "corr_matrix_age_diag= pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n", + "\n", + "# Chi-square test\n", + "chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n", + "\n", + "# Difference between observed and expected frequencies\n", + "print(f\"Chi-Square Statistic: {chi2}\")\n", + "print(f\"P-value: {p}\")\n", + "\n", + "# Check if SB (Sinusbradykardie) has a significantly higher frequency in the 60-70 age group\n", + "sb_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right'), 'SB']\n", + "sb_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum()['SB']\n", + "total_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right')].sum()\n", + "total_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum().sum()\n", + "\n", + "# Frequency table for the specific Chi-Square test\n", + "observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n", + "chi2_sb, p_sb = chi2_contingency(observed)[:2]\n", + "\n", + "\n", + "print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n", + "print(f\"P-value for SB in 60-70 vs others: {p_sb}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results can be interpreted as follows:\n", + "\n", + "- The first value returned is the Chi-Square Statistic, which measures the overall difference between the observed and the expected frequencies. Here, a bigger number indicates a bigger difference. The p-value is the probability of observing a difference at least this large if age group and diagnosis were actually independent. 
If the p-value is below the significance level of 0.05, the difference is significant.\n", + "\n", + "- The second pair of values comes from a separate chi-squared test on a 2x2 table that compares the frequency of sinus bradycardia in the age group 60-70 with its frequency in all other age groups. If the p-value is smaller than the significance level of 0.05, the frequency in the age group 60-70 differs significantly from that in the other age groups; the test does not indicate the direction of the difference, so the observed proportions still have to be compared to decide whether it is higher. In the output shown above, the p-value for this comparison is about 0.22, so the difference is not significant at the 0.05 level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}