ulusoy 2024-06-05 12:06:20 +02:00
commit a788038217
16 changed files with 2982 additions and 262 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
/data/ /data/
/settings.json

View File

@ -2,7 +2,12 @@
This project was developed through the Data Science and Analytics course at the Mannheim University of Applied Sciences. A data science cycle was taught theoretically on the basis of lectures and implemented practically in the project. This project was developed through the Data Science and Analytics course at the Mannheim University of Applied Sciences. A data science cycle was taught theoretically on the basis of lectures and implemented practically in the project.
# Analysis of cardiovascular diseases using ECG data
## Table of Contents ## Table of Contents
@ -15,6 +20,7 @@ This project was developed through the Data Science and Analytics course at the
- [Acknowledgements](#acknowledgements) - [Acknowledgements](#acknowledgements)
- [Contact](#contact) - [Contact](#contact)
## About ## About
Cardiovascular diseases refer to a group of diseases that affect the heart and blood vessels and represent a significant global health burden. They are a leading cause of morbidity and mortality worldwide, making effective prevention and management of these diseases critical. Physical examinations, blood tests, ECGs, stress or exercise tests, echocardiograms and CT or MRI scans are used to diagnose cardiovascular disease. Cardiovascular diseases refer to a group of diseases that affect the heart and blood vessels and represent a significant global health burden. They are a leading cause of morbidity and mortality worldwide, making effective prevention and management of these diseases critical. Physical examinations, blood tests, ECGs, stress or exercise tests, echocardiograms and CT or MRI scans are used to diagnose cardiovascular disease.
@ -36,7 +42,7 @@ The data set used in this project was divided into four main groups: SB, AFIB, G
The data provision provides for the following points, which can be taken from the diagram. The data provision provides for the following points, which can be taken from the diagram.
![Alt-Text](readme_data/flow_diag.png) ![Alt-Text](readme_data/Projektablauf.drawio.png)
## Getting Started ## Getting Started
@ -69,7 +75,7 @@ This project is licensed under the [MIT License](https://opensource.org/licenses
We would like to especially thank our instructor, Ms. Jacqueline Franßen, for her enthusiastic support in helping us realize this project. We would like to especially thank our instructor, Ms. Jacqueline Franßen, for her enthusiastic support in helping us realize this project.
## Contact ## Contact
- Klara Tabea Bracke - Klara Tabea Bracke (3015256@hs-mannheim.de)
- Arman Ulusoy (3016148@stud.hs-mannheim.de) - Arman Ulusoy (3016148@stud.hs-mannheim.de)
- Nils Rekus - Nils Rekus (1826514@stud.hs-mannheim.de)
- Felix Jan Michael Mucha (felixjanmichael.mucha@stud.hs-mannheim.de) - Felix Jan Michael Mucha (felixjanmichael.mucha@stud.hs-mannheim.de)

View File

@ -0,0 +1 @@
{"names": ["butterlowpass", "lowess", "non_local_means"], "order": 1, "fs": 500.0, "cutoff": 25, "filter_strength": 50, "template_window_size": 7, "search_window_size": 21, "frac": 0.003, "it": 1}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hypothesis\n",
"This notebook is used to read the data from the pickle files and to test the hypothesis that in the age group of 60-70 the frequency of a sinus bradycardia is significantly higher than in the other age groups.\n",
"For this purpose, the chi-squared test is used."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pickle\n",
"from scipy.stats import chi2_contingency\n",
"from data_helper import *\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading SB\n",
"Length of SB: 50\n",
"Reading AFIB\n",
"Length of AFIB: 27\n",
"Reading GSVT\n",
"Length of GSVT: 0\n",
"Reading SR\n",
"Length of SR: 13\n",
"Chi-Square Statistic: 38.266574797751275\n",
"P-value: 0.0004730210823940083\n",
"Chi-Square Statistic for SB in 60-70 vs others: 1.4858035714285718\n",
"P-value for SB in 60-70 vs others: 0.22286870264719977\n"
]
}
],
"source": [
"#path = \"C:/Studium/dsa/data\"\n",
"#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n",
"path = \"C:/Users/klara/projects/DSA/data\"\n",
"\n",
"categories_dict = {\n",
"'SB': [426177001],\n",
"'AFIB': [164889003, 164890007],\n",
"'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],\n",
"'SR': [426783006, 427393009]\n",
"}\n",
"\n",
"data = {}\n",
"for cat_name in categories_dict.keys():\n",
" print(f\"Reading {cat_name}\")\n",
" with open(f'{path}/{cat_name}.pkl', 'rb') as f:\n",
" records = pickle.load(f)\n",
" data[cat_name] = records\n",
" print(f\"Length of {cat_name}: {len(records)}\")\n",
"\n",
"data_demographic = {'age':[], 'diag':[], 'gender':[]}\n",
"for cat_name, records in data.items():\n",
" for record in records:\n",
" age = record.comments[0].split(' ')[1]\n",
" sex = record.comments[1].split(' ')[1]\n",
" if age == 'NaN' or sex == 'NaN':\n",
" continue\n",
" # cut Age: from alter string \n",
" data_demographic['age'].append(int(age))\n",
" data_demographic['diag'].append(cat_name)\n",
" data_demographic['gender'].append(sex)\n",
"\n",
"df_dgc = pd.DataFrame(data_demographic)\n",
"\n",
"# Change from group to category\n",
"age_categories = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)\n",
"corr_matrix_age_diag= pd.crosstab(df_dgc['age_group'], df_dgc['diag'])\n",
"\n",
"# Chi-square test\n",
"chi2, p, _, _ = chi2_contingency(corr_matrix_age_diag)\n",
"\n",
"# Difference between observed and expected frequencies\n",
"print(f\"Chi-Square Statistic: {chi2}\")\n",
"print(f\"P-value: {p}\")\n",
"\n",
"# Check if SB (Sinusbradykardie) has a significantly higher frequency in the 60-70 age group\n",
"sb_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right'), 'SB']\n",
"sb_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum()['SB']\n",
"total_60_70 = corr_matrix_age_diag.loc[pd.Interval(60, 70, closed='right')].sum()\n",
"total_other = corr_matrix_age_diag.drop(pd.Interval(60, 70, closed='right')).sum().sum()\n",
"\n",
"# Frequency table for the specific Chi-Square test\n",
"observed = [[sb_60_70, total_60_70 - sb_60_70], [sb_other, total_other - sb_other]]\n",
"chi2_sb, p_sb = chi2_contingency(observed)[:2]\n",
"\n",
"\n",
"print(f\"Chi-Square Statistic for SB in 60-70 vs others: {chi2_sb}\")\n",
"print(f\"P-value for SB in 60-70 vs others: {p_sb}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results can be interpreted as follows:\n",
"\n",
"- The first value returned is the Chi-Square Statistic, which measures the difference between the observed and the expected frequencies. Here, a bigger number indicates a bigger difference. The p-value is the probability of observing a difference at least this large if age group and diagnosis were in fact independent. If the p-value is below the significance level of 0.05, the difference is considered statistically significant.\n",
"\n",
"- The Chi-Square Statistic for sinus bradycardia in the age group 60-70 compared to the other age groups, is a value that shows whether there is a significant difference in the frequency of sinus bradycardia in the age group 60-70 in comparison to the other age groups. If the p-value is smaller than the significance level of 0.05, the difference in the frequency between the age group 60-70 and the other age groups is significant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

View File

@ -0,0 +1,234 @@
import pickle
import json
import copy
from matplotlib import pyplot as plt
import numpy as np
import wfdb.processing
import scipy.signal
from scipy.signal import butter, lfilter
from statsmodels.nonparametric.smoothers_lowess import lowess
import cv2 as cv
""""
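Helper functions for the ECG project: loading the pickled records listed in
the settings file and smoothing their signals with a Butterworth low-pass,
a LOWESS or a non-local-means filter.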
"""
def load_data(only_demographic:bool=False, path_settings:str="../settings.json"):
    """
    Loads data from pickle files based on the specified settings.

    The settings file is a JSON document. Its "data_path" entry names the
    directory that holds the pickle files and the keys of its "labels" entry
    are the category names; one file ``<data_path>/<category>.pkl`` is read
    per category.

    Args:
        only_demographic (bool, optional): If True, only loads demographic data (age, diagnosis, gender). Defaults to False.
        path_settings (str, optional): Path to the settings file. Defaults to "../settings.json".

    Returns:
        dict: With ``only_demographic=False`` a mapping from category name to
        the unpickled records of that category. With ``only_demographic=True``
        a mapping with the keys 'age', 'diag' and 'gender', each holding a
        list with one entry per record that has both an age and a gender.
    """
    # Use a context manager so the settings file handle is closed again.
    with open(path_settings, encoding="utf-8") as settings_file:
        settings = json.load(settings_file)
    path_data = settings["data_path"]
    labels = settings["labels"]

    data = {'age': [], 'diag': [], 'gender': []} if only_demographic else {}
    for cat_name in labels:
        print(f"Reading {cat_name}")
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # "data_path" at files that were produced by this project.
        with open(f'{path_data}/{cat_name}.pkl', 'rb') as f:
            records = pickle.load(f)

        if not only_demographic:
            data[cat_name] = records
            continue

        for record in records:
            # The value is the second space-separated token of the first
            # (age) and second (gender) comment line of the record.
            age = record.comments[0].split(' ')[1]
            gender = record.comments[1].split(' ')[1]
            # Records with a missing age or gender are left out entirely.
            if age == 'NaN' or gender == 'NaN':
                continue
            data['age'].append(int(age))
            data['diag'].append(cat_name)
            data['gender'].append(gender)
    return data
def format_data_input(data):
    """
    Wraps the input step by step until it has the dictionary layout the
    filter functions iterate over.

    Parameters:
        data (np.ndarray or wfdb.Record or list or dict): The input data to be formatted.

    Returns:
        dict: For ndarray, Record and list input a dictionary with the single
        key 'temp_key' that maps to a list of records. Any other input
        (including a dict) is returned as it was passed in.
    """
    wrapped = data
    # A bare array becomes the signal of a fresh record (the array is copied).
    if isinstance(wrapped, np.ndarray):
        wrapped = wfdb.Record(p_signal=wrapped.copy())
    # A single record becomes a one-element list.
    if isinstance(wrapped, wfdb.Record):
        wrapped = [wrapped]
    # A list is stored under a placeholder key.
    if isinstance(wrapped, list):
        wrapped = {'temp_key': wrapped}
    return wrapped
def format_data_output(data):
    """
    Formats the output data into a less redundant format.

    Each unwrapping step is applied to the result of the previous one.

    Args:
        data (dict, list, wfdb.Record, or ndarray): The input data to be formatted.

    Returns:
        The formatted data: a dictionary whose only key is 'temp_key' is
        replaced by the value of that key, a one-element list is replaced by
        its element, and everything else is returned unchanged.
    """
    # Only dictionaries can carry the placeholder key; without this guard a
    # list, record or array argument fails on the missing `.keys()` method.
    if isinstance(data, dict) and len(data) == 1 and 'temp_key' in data:
        data = data['temp_key']
    if isinstance(data, list) and len(data) == 1:
        data = data[0]
    # NOTE(review): for a record with a one-dimensional p_signal this returns
    # only the first element of the signal - confirm whether the whole array
    # was intended.
    if isinstance(data, wfdb.Record) and len(data.p_signal.shape) == 1:
        data = data.p_signal[0]
    return data
def butterlowpass_filter(data, cutoff:float, fs:float, order:int=5):
    """
    Apply a Butterworth lowpass filter to the input data.

    The input is deep-copied first, so the caller's data is not modified.
    Every column of every record's `p_signal` is filtered separately.

    Parameters:
        - data: (dict, list, wfdb.Record, or ndarray)
            The input data to be filtered.
        - cutoff: float
            The cutoff frequency of the filter.
        - fs: float
            The sampling frequency of the input data.
        - order: int, optional
            The order of the filter (default is 5).

    Returns:
        - data: (dict, list, wfdb.Record, or ndarray)
            The filtered output data.
    """
    data = copy.deepcopy(data)
    data = format_data_input(data)

    # The coefficients depend only on the arguments, so they are computed once
    # and not again for every channel of every record.
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq  # butter() expects the cutoff relative to Nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)

    for wfdb_objs in data.values():
        for wfdb_obj in wfdb_objs:
            for idx in range(wfdb_obj.p_signal.shape[1]):
                wfdb_obj.p_signal[:, idx] = lfilter(b, a, wfdb_obj.p_signal[:, idx])
    return format_data_output(data)
def lowess_filter(data, frac:float=0.03, it:int=1):
    """
    Smooths every signal column with the lowess smoother.

    Works on a deep copy of the input, so the caller's data is not modified.

    Parameters:
        - data: (dict, list, wfdb.Record, or ndarray)
            The data to be filtered.
        - frac (float):
            The fraction of the data used to compute each fitted value. Default is 0.03.
        - it (int):
            The number of iterations for the smoothing process. Default is 1.

    Returns:
        (dict, list, wfdb.Record, or ndarray): The filtered data.
    """
    records_by_label = format_data_input(copy.deepcopy(data))
    for record_group in records_by_label.values():
        for record in record_group:
            n_channels = record.p_signal.shape[1]
            for channel in range(n_channels):
                samples = record.p_signal[:, channel]
                # The sample index serves as the (already sorted) x axis.
                positions = np.arange(len(samples))
                fitted = lowess(samples, positions, is_sorted=True, frac=frac, it=it)
                # Column 1 of the result holds the smoothed values.
                record.p_signal[:, channel] = fitted[:, 1]
    return format_data_output(records_by_label)
def non_local_means_filter(data, filter_strength:int = 50, template_window_size:int = 7, search_window_size:int = 21):
    """
    Applies the Non-Local Means filter to the given data.

    The input is deep-copied first, so the caller's data is not modified.
    Each signal column is min-max scaled to 0..255, cast to uint8, denoised as
    a one-pixel-wide image and scaled back to its original value range.
    Columns without any value spread are left untouched.

    Parameters:
        - data: (dict, list, wfdb.Record, or ndarray)
            The data to be filtered.
        - filter_strength (int):
            Parameter controlling the strength of the filtering process. Default is 50.
        - template_window_size (int):
            Size in pixels of the template patch that is used to compute weights. Default is 7.
        - search_window_size (int):
            Size in pixels of the window that is used to compute weighted average for given pixel. Default is 21.

    Returns:
        (dict, list, wfdb.Record, or ndarray): The filtered data.
    """
    data = copy.deepcopy(data)
    data = format_data_input(data)
    for wfdb_objs in data.values():
        for wfdb_obj in wfdb_objs:
            for idx in range(wfdb_obj.p_signal.shape[1]):
                signal = wfdb_obj.p_signal[:, idx]
                signal_min = np.min(signal)
                signal_range = np.max(signal) - signal_min
                # A constant column (range 0) would divide by zero in the
                # scaling below and a NaN range cannot be scaled either; such
                # a column is kept as it is.
                if not signal_range > 0:
                    continue
                # reshape data to 2d for image like processing
                d_2d = np.reshape(signal, (-1, 1))
                # max min scaling
                d_2d_scaled = np.uint8((d_2d - signal_min) / signal_range * 255)
                # apply non local means filter
                d_2d_filtered = cv.fastNlMeansDenoising(d_2d_scaled, None, filter_strength, template_window_size, search_window_size)
                # Rescale the denoised signal back to the original range
                wfdb_obj.p_signal[:, idx] = np.reshape(d_2d_filtered, -1) * signal_range / 255 + signal_min
    return format_data_output(data)
def filter_data(data, filter_params:dict):
    """
    Apply the filters selected in `filter_params` to the input data.

    The filters run in a fixed order - Butterworth low-pass, LOWESS,
    non-local means - and each one receives the output of the previous one.
    The input is deep-copied first, so the caller's data is not modified.

    Parameters:
        - data: (dict, list, wfdb.Record, or ndarray)
            The input data to be filtered.
        - filter_params: dict
            The parameters of the filter to be applied. 'names' lists the
            filters to run: 'butterlowpass' (reads 'cutoff', 'fs', 'order'),
            'lowess' (reads 'frac', 'it'; the spelling 'loess' is accepted as
            well) and 'non_local_means' (reads 'filter_strength',
            'template_window_size', 'search_window_size').

    Returns:
        - data: (dict, list, wfdb.Record, or ndarray)
            The filtered output data.
    """
    data = copy.deepcopy(data)
    names = filter_params['names']
    applied_any = False

    if 'butterlowpass' in names:
        data = butterlowpass_filter(data, filter_params['cutoff'], filter_params['fs'], filter_params['order'])
        applied_any = True
    # Only 'loess' used to be recognised here, so a 'lowess' entry (the name
    # of the filter function) was silently skipped. Both are accepted now.
    if 'lowess' in names or 'loess' in names:
        data = lowess_filter(data, filter_params['frac'], filter_params['it'])
        applied_any = True
    if 'non_local_means' in names:
        data = non_local_means_filter(data, filter_params['filter_strength'], filter_params['template_window_size'], filter_params['search_window_size'])
        applied_any = True

    if not applied_any:
        print("Warning: No valid filter names found in filter_params['names']. Data will be returned as is.")
    return data
if __name__ == "__main__":
    # Manual smoke test: load all records and run the low-pass filter on the
    # three supported input layouts.
    data = load_data(only_demographic=False, path_settings="./settings.json")

    # Report how many records were loaded for each category.
    for cat_name, records in data.items():
        print(f"{cat_name}: {len(records)}")

    order = 1
    fs = 500.0
    cutoff = 25

    # Whole dictionary, then one category (list), then a single record.
    data_test = butterlowpass_filter(data, cutoff=cutoff, fs=fs, order=order)
    data_test = butterlowpass_filter(data['SB'], cutoff=cutoff, fs=fs, order=order)
    data_test = butterlowpass_filter(data['SB'][0], cutoff=cutoff, fs=fs, order=order)

11
settings.json 100644
View File

@ -0,0 +1,11 @@
{
"data_path_comment": "Path to the data folder. This is the folder where the data is stored.",
"data_path": "C:/Studium/dsa/data",
"labels_comment": "Labels for the different classes. The labels are the SNOMED CT codes.",
"labels": {
"SB": [426177001],
"AFIB": [164889003, 164890007],
"GSVT": [426761007, 713422000, 233896004, 233897008, 713422000],
"SR": [426783006, 427393009]
}
}