commit 588e20b7b37c2d6700df43724fe47fa20b3a4027 Author: Felix Mucha <3016498@stud.hs-mannheim.de> Date: Wed May 1 09:56:36 2024 +0200 first structure diff --git a/README.md b/README.md new file mode 100644 index 0000000..c686844 --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# HSMA Data Science and Analytics SS2024 +## ECG Data classifier and segmentation + +[This project aims to classify ... and segment ... ECG data] + +## Table of Contents +- [About](#about) +- [Getting Started](#getting-started) +- [Usage](#usage) +- [Progress](#progress) +- [Contributing](#contributing) +- [License](#license) + +## About +[Provide a brief overview of the project, including its purpose and any relevant background information.] + +## Getting Started +[Instructions on how to get the project up and running on a local machine. Include prerequisites, installation steps, and any other necessary setup.] + +### Prerequisites +[List any software or tools that need to be installed before running the project.] + +### Installation +[Step-by-step guide on how to install the project.] + +## Usage +[Provide examples and instructions for using the project. Include any relevant code snippets or screenshots.] + +## Progress +- Data was searched and found at: https://doi.org/10.13026/wgex-er52 +- Data was cleaned: + - Docker container with MongoDB, because the data is 10 GB and contains many arrays + - Diagnosis converted from string to list + - Data filtered to contain only healthy and the needed diagnosis data + + + +## Contributing +[Explain how others can contribute to the project. This might include guidelines for reporting bugs, submitting enhancements, or proposing new features.] + +## License +[Specify the license under which the project is distributed. Include any additional terms or conditions.] + +## Acknowledgements +[Optional section to thank individuals or organizations that have contributed to the project.] + +## Contact +[Provide contact information for inquiries or support.] 
"""Count ECG records per diagnosis category (originally ``skripts/clean_data.py``).

Running this module walks the nested WFDB record directories below
``data_dir``, reads every record with ``wfdb.rdrecord`` and counts how many
records have all of their diagnosis codes inside a single category of
``categories``.  The per-category counts are printed at the end.
"""
import os

import wfdb
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


# Directories and file paths
# --------------------------------------------------------------------------------
# Specify the directory where the WFDB records are stored
project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
data_dir = project_dir + '/WFDBRecords'
# Derived from project_dir so the location is defined in exactly one place
# (the resulting string is identical to the previously duplicated literal).
path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'

# --------------------------------------------------------------------------------
# Warn early if the diagnosis lookup table is missing
if not os.path.exists(path_diag_lookup):
    print(f"Directory {path_diag_lookup} does not exist")


def get_diagnosis_ids(record):
    """Return the diagnosis codes of ``record`` as a list of ints.

    The codes are parsed from ``record.comments[2]`` after removing the
    ``'Dx: '`` prefix and splitting on commas.
    NOTE(review): assumes the diagnosis is always the third comment entry of
    the record header -- confirm against the dataset.

    Raises:
        IndexError: if ``record.comments`` has fewer than three entries.
        ValueError: if one of the comma-separated parts is not an integer.
    """
    diagnosis = record.comments[2].replace('Dx: ', '')
    return [int(code.strip()) for code in diagnosis.split(',')]


def get_diagnosis_name(diagnosis):
    """Return the ``'Full Name'`` lookup entry for every code in ``diagnosis``.

    Uses the module-level ``diagnosis_lookup`` table.  Each result is the
    string rendering (``Series.to_string(index=False)``) of the rows whose
    ``'Snomed_CT'`` column equals the code, so a code without a match does
    not raise but yields the rendering of an empty Series.
    """
    return [
        diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == code]['Full Name'].to_string(index=False)
        for code in diagnosis
    ]


def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
    """Keep only rows whose diagnoses consist solely of allowed codes.

    Args:
        df_dict: mapping of key -> DataFrame; each frame's index is matched
            against the keys of ``diagnosis_dict``.
        diagnosis_dict: mapping of index label -> iterable of diagnosis codes.
        filter_codes_df: DataFrame whose ``'Snomed_CT'`` column lists the
            allowed codes.

    Returns:
        A dict with the same keys as ``df_dict`` whose frames contain only
        rows for which every diagnosis code is allowed.
    """
    # A set gives O(1) membership tests; 0 is additionally allowed because it
    # is used as padding value.
    allowed_codes = set(filter_codes_df['Snomed_CT']) | {0}
    # Keep only the entries whose codes are all allowed
    filter_dict_diag = {
        key: codes
        for key, codes in diagnosis_dict.items()
        if all(code in allowed_codes for code in codes)
    }
    # Restrict every DataFrame to the remaining index labels
    return {
        key: df.loc[df.index.isin(filter_dict_diag.keys())]
        for key, df in df_dict.items()
    }


def _count_records_per_category(records_dir, category_codes, max_records):
    """Count the records below ``records_dir`` per diagnosis category.

    The directory layout is ``records_dir/<dir>/<subdir>/<record>.hea``.
    A record is counted for the first category (in dict order) whose code
    list contains all of its diagnosis codes; records matching no category
    only increase the total.  Records that cannot be read or parsed are
    reported and skipped.

    Returns:
        ``(counts, n_read)``: the per-category counts and the number of
        successfully read records (at most ``max_records``, but at least one
        record is read if any is available).
    """
    # Build the sets once instead of once per record and category
    category_sets = {name: set(codes) for name, codes in category_codes.items()}
    counts = {name: 0 for name in category_codes}
    n_read = 0

    for dir_th in os.listdir(records_dir):
        path_to_1000_records = records_dir + '/' + dir_th
        for dir_hd in os.listdir(path_to_1000_records):
            path_to_100_records = path_to_1000_records + '/' + dir_hd
            for file_name in os.listdir(path_to_100_records):
                # Only header files identify a record; strip just the extension
                record_name, extension = os.path.splitext(file_name)
                if extension != '.hea':
                    continue
                try:
                    record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
                    diagnosis = set(get_diagnosis_ids(record))
                except Exception as e:
                    # Best-effort scan: report the real cause and go on
                    print(f"Failed to read record {record_name}: {type(e).__name__}: {e}")
                    continue

                # Check if the diagnosis is a subset of one of the categories
                for category_name, codes in category_sets.items():
                    if diagnosis.issubset(codes):
                        counts[category_name] += 1
                        break

                n_read += 1
                if n_read % 100 == 0:
                    print(f"Read {n_read} records")
                # Returning leaves all three loops at once, so no flag has to
                # be carried (and possibly be undefined) in the outer loops.
                if n_read >= max_records:
                    return counts, n_read

    return counts, n_read


# --------------------------------------------------------------------------------
# Explore the data
# --------------------------------------------------------------------------------
# Read the diagnosis lookup table
diagnosis_lookup = pd.read_csv(path_diag_lookup)
#print(diagnosis_lookup.head())

# Filter data based on the diagnosis

# ----------------------------------------------
healthy_codes = [426177001, 426783006]


categories = {
    'Gesund': [426177001, 426783006],  # '426177001', '426783006
    'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001],
    'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002],
    'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009],
    'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],
    'Hypertrophien': [164873001, 89792004],
    'QT': [111975006],
    'Repolarisation': [428417006],
    'Myokardinfarkt': [164865005]
}

# Upper bound for the number of records to read
max_counter = 100_000  # 100_000

# Loop through the records and count them per category
diag_dict, counter = _count_records_per_category(data_dir, categories, max_counter)

# Counts kept for reference (presumably the output of an earlier run):
# ID: Herzrhythmusstörungen, Count: 22571
# ID: Leitungsstörungen, Count: 505
# ID: EKG-Welle, Count: 2067
# ID: Spannungsänderungen, Count: 613
# ID: Hypertrophien, Count: 5
# ID: QT, Count: 43
# ID: Repolarisation, Count: 73
# ID: Myokardinfarkt, Count: 1

# # get the data
# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict()

# # get unique diagnosis codes
# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())

# print(unique_health_codes)
# print(unique_afib_codes)
# print(unique_mi_codes)

# print(dict_healthy['JS00004'].__dict__)


#print(diag_dict)
for category_name, count in diag_dict.items():
    print(f"ID: {category_name}, Count: {count}")

# NOTE(review): this compares the number of categories with the number of
# rows of the lookup table -- confirm that this is the intended check.
print(f'Number of counter diagnoses: {len(diag_dict)}')
print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
+# flatten the counters and count the unique values +# healthy_counter = np.array(healthy_counter).flatten() +# afib_counter = np.array(afib_counter).flatten() +# mi_counter = np.array(mi_counter).flatten() + +# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True) +# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True) +# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True) + +# print(unique_health_codes) +# print(counts_health) +# print(unique_afib_codes) +# print(counts_afib) +# print(unique_mi_codes) +# print(counts_mi) + +# # get the names of the diagnosis +# names_health = get_diagnosis_name(unique_health_codes) +# names_afib = get_diagnosis_name(unique_afib_codes) +# names_mi = get_diagnosis_name(unique_mi_codes) + +