DSA_SS24/scripts/generate_data.py

"""
This script reads the WFDB records and extracts the diagnosis information from the comments.
The diagnosis information is then used to classify the records into categories.
The categories are defined by the diagnosis codes in the comments.
The records are then saved to pickle files based on the categories.
"""

import wfdb
import os
import numpy as np
import pickle

# Directories and file paths
# --------------------------------------------------------------------------------
# NOTE: Specify the directory where the WFDB records are stored
project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
data_dir = project_dir + '/WFDBRecords'
path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"
 # --------------------------------------------------------------------------------
# Functions
def get_diagnosis_ids(record):
    """
    Extracts diagnosis IDs from a record and returns them as a list.
    Args:
        record (object): The record object containing the diagnosis information.
    Returns:
        list: A list of diagnosis IDs extracted from the record.
    """
    # Get the diagnosis
    diagnosis = record.comments[2]
    # clean the diagnosis
    diagnosis = diagnosis.replace('Dx: ', '')
    list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]
    return list_diagnosis
# --------------------------------------------------------------------------------
# Generate the data
# --------------------------------------------------------------------------------
if __name__ == '__main__':
    """
    The following categories are used to classify the records:

    SB,    Sinusbradykardie
    AFIB,  Vorhofflimmern und Vorhofflattern (AFL)
    GSVT,  supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher
    SR     Sinusrhythmus und Sinusunregelmäßigkeiten
    """
    categories = {
    'SB':    [426177001],
    'AFIB':  [164889003, 164890007],
    'GSVT':  [426761007, 713422000, 233896004, 233897008, 713422000],
    'SR':    [426783006, 427393009]
    }

    diag_dict = {k: [] for k in categories.keys()}

    # Create a counter for the number of records
    counter = 0
    max_counter = 100_000
    failed_records = []
    # Loop through the records
    for dir_th in os.listdir(data_dir):
        path_to_1000_records = data_dir + '/' + dir_th
        for dir_hd in os.listdir(path_to_1000_records):
            path_to_100_records = path_to_1000_records + '/' + dir_hd
            for record_name in os.listdir(path_to_100_records):
                # check if .hea is in the record_name 
                if '.hea' not in record_name:
                    continue
                # Remove the .hea extension from record_name
                record_name = record_name.replace('.hea', '')
                try:
                    # Read the record
                    record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
                    # Get the diagnosis
                    diagnosis = np.array(get_diagnosis_ids(record))
                    # check if diagnosis is a subset of one of the categories
                    for category_name, category_codes in categories.items():
                        # if any of the diagnosis codes is in the category_codes
                        if any(i in category_codes for i in diagnosis):
                            diag_dict[category_name].append(record)
                            break
                    # Increment the counter of how many records we have read
                    counter += 1
                    counter_bool = counter >= max_counter
                    # Break the loop if we have read max_counter records
                    if counter % 100 == 0:
                        print(f"Read {counter} records")
                    if counter_bool:
                        break
                except Exception as e:
                    failed_records.append(record_name)
                    print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")
            if counter_bool:
                break
        if counter_bool:
            break

    # write to pickle
    for cat_name, records in diag_dict.items():
        print(f"Writing {cat_name} to pickle with {len(records)} records")
        # if path not exists create it
        if not os.path.exists('./data'):
            os.makedirs('./data')
        with open(f'./data/{cat_name}.pkl', 'wb') as f:
            pickle.dump(records, f)
completed repo setup 2024-05-15 20:20:01 +02:00			`"""`
			`This script reads the WFDB records and extracts the diagnosis information from the comments.`
			`The diagnosis information is then used to classify the records into categories.`
			`The categories are defined by the diagnosis codes in the comments.`
			`The records are then saved to pickle files based on the categories.`
			`"""`

first structure 2024-05-01 09:56:36 +02:00			`import wfdb`
			`import os`
			`import numpy as np`
generate data 2024-05-01 12:53:33 +02:00			`import pickle`
first structure 2024-05-01 09:56:36 +02:00
			`# Directories and file paths`
			`# --------------------------------------------------------------------------------`
completed repo setup 2024-05-15 20:20:01 +02:00			`# NOTE: Specify the directory where the WFDB records are stored`
first structure 2024-05-01 09:56:36 +02:00			`project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'`
			`data_dir = project_dir + '/WFDBRecords'`
completed repo setup 2024-05-15 20:20:01 +02:00			`path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"`
first structure 2024-05-01 09:56:36 +02:00			`# --------------------------------------------------------------------------------`
completed repo setup 2024-05-15 20:20:01 +02:00			`# Functions`
first structure 2024-05-01 09:56:36 +02:00			`def get_diagnosis_ids(record):`
completed repo setup 2024-05-15 20:20:01 +02:00			`"""`
			`Extracts diagnosis IDs from a record and returns them as a list.`
			`Args:`
			`record (object): The record object containing the diagnosis information.`
			`Returns:`
			`list: A list of diagnosis IDs extracted from the record.`
			`"""`
first structure 2024-05-01 09:56:36 +02:00			`# Get the diagnosis`
			`diagnosis = record.comments[2]`
			`# clean the diagnosis`
			`diagnosis = diagnosis.replace('Dx: ', '')`
			`list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]`
			`return list_diagnosis`
			`# --------------------------------------------------------------------------------`
completed repo setup 2024-05-15 20:20:01 +02:00			`# Generate the data`
first structure 2024-05-01 09:56:36 +02:00			`# --------------------------------------------------------------------------------`
completed repo setup 2024-05-15 20:20:01 +02:00			`if __name__ == '__main__':`
			`"""`
			`The following categories are used to classify the records:`

			`SB, Sinusbradykardie`
			`AFIB, Vorhofflimmern und Vorhofflattern (AFL)`
			`GSVT, supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher`
			`SR Sinusrhythmus und Sinusunregelmäßigkeiten`
			`"""`
			`categories = {`
			`'SB': [426177001],`
			`'AFIB': [164889003, 164890007],`
			`'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],`
			`'SR': [426783006, 427393009]`
			`}`

			`diag_dict = {k: [] for k in categories.keys()}`

			`# Create a counter for the number of records`
			`counter = 0`
			`max_counter = 100_000`
			`failed_records = []`
			`# Loop through the records`
			`for dir_th in os.listdir(data_dir):`
			`path_to_1000_records = data_dir + '/' + dir_th`
			`for dir_hd in os.listdir(path_to_1000_records):`
			`path_to_100_records = path_to_1000_records + '/' + dir_hd`
			`for record_name in os.listdir(path_to_100_records):`
			`# check if .hea is in the record_name`
			`if '.hea' not in record_name:`
			`continue`
			`# Remove the .hea extension from record_name`
			`record_name = record_name.replace('.hea', '')`
			`try:`
			`# Read the record`
			`record = wfdb.rdrecord(path_to_100_records + '/' + record_name)`
			`# Get the diagnosis`
			`diagnosis = np.array(get_diagnosis_ids(record))`
			`# check if diagnosis is a subset of one of the categories`
			`for category_name, category_codes in categories.items():`
			`# if any of the diagnosis codes is in the category_codes`
			`if any(i in category_codes for i in diagnosis):`
			`diag_dict[category_name].append(record)`
			`break`
			`# Increment the counter of how many records we have read`
			`counter += 1`
			`counter_bool = counter >= max_counter`
			`# Break the loop if we have read max_counter records`
			`if counter % 100 == 0:`
			`print(f"Read {counter} records")`
			`if counter_bool:`
first structure 2024-05-01 09:56:36 +02:00			`break`
completed repo setup 2024-05-15 20:20:01 +02:00			`except Exception as e:`
			`failed_records.append(record_name)`
			`print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")`
			`if counter_bool:`
			`break`
first structure 2024-05-01 09:56:36 +02:00			`if counter_bool:`
			`break`

completed repo setup 2024-05-15 20:20:01 +02:00			`# write to pickle`
			`for cat_name, records in diag_dict.items():`
			`print(f"Writing {cat_name} to pickle with {len(records)} records")`
			`# if path not exists create it`
			`if not os.path.exists('./data'):`
			`os.makedirs('./data')`
			`with open(f'./data/{cat_name}.pkl', 'wb') as f:`
			`pickle.dump(records, f)`