diff --git a/README.md b/README.md
index 72c46fe..f90eee9 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 This project was developed through the Data Science and Analytics course at the Mannheim University of Applied Sciences. A data science cycle was taught theoretically on the basis of lectures and implemented practically in the project.
 
-## Analysis of cardiovascular diseases using ECG data
+# Analysis of cardiovascular diseases using ECG data
 
 ## Table of Contents
 
diff --git a/skripts/generate_data.py b/skripts/generate_data.py
index a1551ea..df772d0 100644
--- a/skripts/generate_data.py
+++ b/skripts/generate_data.py
@@ -1,105 +1,78 @@
-"""
-This script reads the WFDB records and extracts the diagnosis information from the comments.
-The diagnosis information is then used to classify the records into categories.
-The categories are defined by the diagnosis codes in the comments.
-The records are then saved to pickle files based on the categories.
-"""
-
 import wfdb
 import os
-import numpy as np
 import pickle
+import bz2
+import numpy as np
+import pandas as pd
 
-# Directories and file paths
-# --------------------------------------------------------------------------------
-# NOTE: Specify the directory where the WFDB records are stored
-project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
-data_dir = project_dir + '/WFDBRecords'
-path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"
-
 # --------------------------------------------------------------------------------
-# Functions
+# Functions for processing the data
 
 def get_diagnosis_ids(record):
-    """
-    Extracts diagnosis IDs from a record and returns them as a list.
-    Args:
-        record (object): The record object containing the diagnosis information.
-    Returns:
-        list: A list of diagnosis IDs extracted from the record.
-    """
-    # Get the diagnosis
     diagnosis = record.comments[2]
-    # clean the diagnosis
     diagnosis = diagnosis.replace('Dx: ', '')
     list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]
     return list_diagnosis
 
-# --------------------------------------------------------------------------------
-# Generate the data
-# --------------------------------------------------------------------------------
-if __name__ == '__main__':
-    """
-    The following categories are used to classify the records:
-    SB, sinus bradycardia
-    AFIB, atrial fibrillation and atrial flutter (AFL)
-    GSVT, supraventricular tachycardia, atrial tachycardia, AV nodal reentrant tachycardia, AV reentrant tachycardia, atrial pacemaker
-    SR, sinus rhythm and sinus irregularities
-    """
-    categories = {
+# Look up the full diagnosis names for a list of SNOMED CT codes
+def get_diagnosis_name(diagnosis):
+    name = [diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == x]['Full Name'].to_string(index=False) for x in diagnosis]
+    return name
+
+# Keep only the signal DataFrames whose diagnosis codes all appear in filter_codes_df
+def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
+    filter_cod_li = list(filter_codes_df['Snomed_CT']) + [0]
+    filter_dict_diag = {k: v for k, v in diagnosis_dict.items() if all(i in filter_cod_li for i in v)}
+    filtered_df_dict = {i: df.loc[df.index.isin(filter_dict_diag.keys())] for i, df in df_dict.items()}
+    return filtered_df_dict
+
+# Directories and file paths
+project_dir = 'C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
+data_dir = project_dir + '/WFDBRecords'
+path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'
+
+# Load the SNOMED CT diagnosis lookup table
+diagnosis_lookup = pd.read_csv(path_diag_lookup)
+
+categories = {
     'SB': [426177001],
     'AFIB': [164889003, 164890007],
     'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],
     'SR': [426783006, 427393009]
-    }
+}
 
-    diag_dict = {k: [] for k in categories.keys()}
+diag_dict = {k: [] for k in categories.keys()}
+counter = 0
+max_counter = 100_000
+counter_bool = False
 
-    # Create a counter for the number of records
-    counter = 0
-    max_counter = 100_000
-    failed_records = []
-    # Loop through the records
-    for dir_th in os.listdir(data_dir):
-        path_to_1000_records = data_dir + '/' + dir_th
-        for dir_hd in os.listdir(path_to_1000_records):
-            path_to_100_records = path_to_1000_records + '/' + dir_hd
-            for record_name in os.listdir(path_to_100_records):
-                # check if .hea is in the record_name
-                if '.hea' not in record_name:
-                    continue
-                # Remove the .hea extension from record_name
-                record_name = record_name.replace('.hea', '')
-                try:
-                    # Read the record
-                    record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
-                    # Get the diagnosis
-                    diagnosis = np.array(get_diagnosis_ids(record))
-                    # check if diagnosis is a subset of one of the categories
-                    for category_name, category_codes in categories.items():
-                        # if any of the diagnosis codes is in the category_codes
-                        if any(i in category_codes for i in diagnosis):
-                            diag_dict[category_name].append(record)
-                            break
-                    # Increment the counter of how many records we have read
-                    counter += 1
-                    counter_bool = counter >= max_counter
-                    # Break the loop if we have read max_counter records
-                    if counter % 100 == 0:
-                        print(f"Read {counter} records")
-                    if counter_bool:
-                        break
-                except Exception as e:
-                    failed_records.append(record_name)
-                    print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")
-            if counter_bool:
-                break
-        if counter_bool:
-            break
+# Walk the three-level WFDB directory tree and sort records into categories
+for dir_th in os.listdir(data_dir):
+    path_to_1000_records = data_dir + '/' + dir_th
+    for dir_hd in os.listdir(path_to_1000_records):
+        path_to_100_records = path_to_1000_records + '/' + dir_hd
+        for record_name in os.listdir(path_to_100_records):
+            if '.hea' not in record_name:
+                continue
+            record_name = record_name.replace('.hea', '')
+            try:
+                record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
+                diagnosis = np.array(get_diagnosis_ids(record))
+                for category_name, category_codes in categories.items():
+                    if any(i in category_codes for i in diagnosis):
+                        diag_dict[category_name].append(record)
+                        break
+                counter += 1
+                counter_bool = counter >= max_counter
+                if counter % 100 == 0:
+                    print(f"Read {counter} records")
+                if counter_bool:
+                    break
+            except Exception as e:
+                print(f"Failed to read record {record_name}: {e}")
        if counter_bool:
            break
+    if counter_bool:
+        break
 
-    # write to pickle
-    for cat_name, records in diag_dict.items():
-        print(f"Writing {cat_name} to pickle with {len(records)} records")
-        # if path not exists create it
-        if not os.path.exists('./data'):
-            os.makedirs('./data')
-        with open(f'./data/{cat_name}.pkl', 'wb') as f:
-            pickle.dump(records, f)
\ No newline at end of file
+# Save each category to a bz2-compressed pickle file
+for cat_name, records in diag_dict.items():
+    print(f"Writing {cat_name} to a compressed file with {len(records)} records")
+    if not os.path.exists('./data'):
+        os.makedirs('./data')
+    compressed_filename = f'./data/{cat_name}.pkl.bz2'
+    with bz2.open(compressed_filename, 'wb') as f:
+        pickle.dump(records, f)
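
For reference, `get_diagnosis_ids` relies on the third comment line of a WFDB header carrying the SNOMED CT codes in the form `Dx: <code>,<code>`. A minimal sketch with a hypothetical record stub (the codes shown are taken from the `categories` dictionary above; `StubRecord` is not part of the project):

```python
# Hypothetical stub standing in for a wfdb record; only the comment
# layout matters here: comments[2] holds "Dx: " plus comma-separated codes.
class StubRecord:
    comments = ['Age: 63', 'Sex: Male', 'Dx: 426177001,164889003']

def get_diagnosis_ids(record):
    # Same parsing as in skripts/generate_data.py
    diagnosis = record.comments[2].replace('Dx: ', '')
    return [int(x.strip()) for x in diagnosis.split(',')]

print(get_diagnosis_ids(StubRecord()))  # [426177001, 164889003]
```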
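Since the script now writes bz2-compressed pickles instead of plain `.pkl` files, downstream code must open them through `bz2` as well. A minimal loading sketch, assuming the script has already produced files such as `./data/SB.pkl.bz2` (the category names come from the `categories` dictionary; `load_category` is an illustrative helper, not part of the repository):

```python
import bz2
import pickle

# Load the records of one category back from the compressed pickle;
# each entry is the wfdb record object stored by generate_data.py.
def load_category(cat_name, data_dir='./data'):
    with bz2.open(f'{data_dir}/{cat_name}.pkl.bz2', 'rb') as f:
        return pickle.load(f)

sb_records = load_category('SB')
print(f"Loaded {len(sb_records)} SB records")
```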