import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wfdb

# Directories and file paths
# --------------------------------------------------------------------------------
# Specify the directory where the WFDB records are stored
#project_dir = "C:/Users/Nils/Documents/0000MASTER/IM1/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0"
project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
data_dir = project_dir + '/WFDBRecords'
path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'
# --------------------------------------------------------------------------------


def get_diagnosis_ids(record):
    # Get the diagnosis comment from the record header (e.g. 'Dx: 426177001,164889003')
    diagnosis = record.comments[2]
    # Clean the string and split it into integer SNOMED CT codes
    diagnosis = diagnosis.replace('Dx: ', '')
    list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]
    return list_diagnosis


def get_diagnosis_name(diagnosis):
    # Look up the full diagnosis names for a list of SNOMED CT codes
    name = [diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == x]['Full Name'].to_string(index=False)
            for x in diagnosis]
    return name


def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
    # Create a list with filter codes and add 0 for padding
    filter_cod_li = list(filter_codes_df['Snomed_CT']) + [0]
    # Keep only the records whose diagnosis codes are all in the filter list
    filter_dict_diag = {k: v for k, v in diagnosis_dict.items() if all(i in filter_cod_li for i in v)}
    # Filter the df_dict based on the filtered diagnosis dictionary
    filtered_df_dict = {i: df.loc[df.index.isin(filter_dict_diag.keys())] for i, df in df_dict.items()}
    return filtered_df_dict


# --------------------------------------------------------------------------------
# Explore the data
# --------------------------------------------------------------------------------
# Read the diagnosis lookup table
diagnosis_lookup = pd.read_csv(path_diag_lookup)
#print(diagnosis_lookup.head())

# Filter data based on the diagnosis
# ----------------------------------------------
"""
SB:   sinus bradycardia
AFIB: atrial fibrillation and atrial flutter (AFL)
GSVT: supraventricular tachycardia, atrial tachycardia, AV nodal reentrant
      tachycardia, AV reentrant tachycardia, atrial pacemaker
SR:   sinus rhythm and sinus irregularity
(atrial pacemaker = 713422000)
"""
categories = {
    'SB': [426177001],
    'AFIB': [164889003, 164890007],
    'GSVT': [426761007, 713422000, 233896004, 233897008],
    'SR': [426783006, 427393009]
}

#diag_dict = {k: 0 for k in categories.keys()}
diag_dict = {k: [] for k in categories.keys()}

# Counters controlling how many records are read
counter = 0
max_counter = 100  # 100_000
counter_bool = False

# Loop through the records (directory layout: data_dir/<chunk of 1000>/<chunk of 100>/<record>)
for dir_th in os.listdir(data_dir):
    path_to_1000_records = data_dir + '/' + dir_th
    for dir_hd in os.listdir(path_to_1000_records):
        path_to_100_records = path_to_1000_records + '/' + dir_hd
        for record_name in os.listdir(path_to_100_records):
            # Only process header files
            if '.hea' not in record_name:
                continue
            # Remove the .hea extension from record_name
            record_name = record_name.replace('.hea', '')
            try:
                # Read the record
                record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
                # Get the diagnosis codes
                diagnosis = np.array(get_diagnosis_ids(record))
                # Check whether any diagnosis code belongs to one of the categories
                for category_name, category_codes in categories.items():
                    #if set(diagnosis).issubset(set(category_codes)):  # all codes must be in the category
                    if any(i in category_codes for i in diagnosis):
                        # Add the record to the category
                        #diag_dict[category_name] += 1
                        diag_dict[category_name].append(record)
                        break
                # Increment the counter and stop once max_counter records have been read
                counter += 1
                counter_bool = counter >= max_counter
                if counter % 100 == 0:
                    print(f"Read {counter} records")
                if counter_bool:
                    break
            except Exception as e:
                print(f"Failed to read record {record_name}: {e}")
        if counter_bool:
            break
    if counter_bool:
        break

"""
With `if any(i in category_codes for i in diagnosis): ... break`
(a record is counted if at least one of its diagnosis codes is in the category):
    ID: SB, Count: 16559
    ID: AFIB, Count: 9839
    ID: GSVT, Count: 948
    ID: SR, Count: 9720

With `if set(diagnosis).issubset(set(category_codes)): ... break`
(a record is counted only if all of its diagnosis codes are in the category):
    ID: SB, Count: 8909
    ID: AFIB, Count: 1905
    ID: GSVT, Count: 431
    ID: SR, Count: 7299
"""

# for id, count in diag_dict.items():
#     print(f"ID: {id}, Count: {count}")

# Write each category to a pickle file
for cat_name, records in diag_dict.items():
    print(f"Writing {cat_name} to pickle with {len(records)} records")
    # Create the output directory if it does not exist
    if not os.path.exists('./data'):
        os.makedirs('./data')
    with open(f'./data/{cat_name}.pkl', 'wb') as f:
        pickle.dump(records, f)
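# --------------------------------------------------------------------------------
# Sanity check (illustrative sketch, not part of the original pipeline): reload one
# of the pickle files written above and inspect the first stored wfdb Record.
# Assumes the 'SB' category received at least one record; record_name, fs and
# p_signal are standard wfdb.Record attributes.
with open('./data/SB.pkl', 'rb') as f:
    sb_records = pickle.load(f)

if sb_records:
    first = sb_records[0]
    # p_signal holds the physical signal as an (n_samples, n_leads) array
    print(f"First SB record: {first.record_name}, fs={first.fs} Hz, signal shape={first.p_signal.shape}")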