|
|
|
@ -1,105 +1,78 @@
|
|
|
|
|
"""
|
|
|
|
|
This script reads the WFDB records and extracts the diagnosis information from the comments.
|
|
|
|
|
The diagnosis information is then used to classify the records into categories.
|
|
|
|
|
The categories are defined by the diagnosis codes in the comments.
|
|
|
|
|
The records are then saved to pickle files based on the categories.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import wfdb
|
|
|
|
|
import os
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pickle
|
|
|
|
|
import bz2
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
# Directories and file paths
|
|
|
|
|
# --------------------------------------------------------------------------------
|
|
|
|
|
# NOTE: Specify the directory where the WFDB records are stored
|
|
|
|
|
# Root folder of the downloaded dataset; the two paths below are derived from it
project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
# Folder holding the WFDB record tree
data_dir = f'{project_dir}/WFDBRecords'
# CSV file with the SNOMED-CT condition names
path_diag_lookup = f'{project_dir}/ConditionNames_SNOMED-CT.csv'
|
|
|
|
|
# --------------------------------------------------------------------------------
|
|
|
|
|
# Functions
|
|
|
|
|
# Functions for processing the data
|
|
|
|
|
def get_diagnosis_ids(record):
    """
    Extracts diagnosis IDs from a record and returns them as a list.

    The diagnosis comment is located by its ``Dx:`` prefix. If no comment
    carries that prefix, the third comment (index 2) is parsed as a plain
    comma-separated list, which is the position this function previously
    relied on.

    Args:
        record (object): The record object containing the diagnosis
            information in its ``comments`` attribute (a sequence of strings).

    Returns:
        list: A list of diagnosis IDs (ints) extracted from the record.
            Empty entries, e.g. produced by a trailing comma, are skipped.

    Raises:
        IndexError: If no ``Dx:`` comment exists and the record has fewer
            than three comments.
        ValueError: If a non-empty entry cannot be converted to ``int``.
    """
    comments = record.comments
    # Find the diagnosis by its prefix instead of trusting a fixed position
    diagnosis = next((c for c in comments if c.lstrip().startswith('Dx:')), None)
    if diagnosis is None:
        # Fall back to the fixed position used before
        diagnosis = comments[2]
    else:
        # Keep only the text that follows the prefix
        diagnosis = diagnosis.partition('Dx:')[2]
    # Split into codes, ignoring empty entries
    return [int(code) for code in map(str.strip, diagnosis.split(',')) if code]
|
|
|
|
|
# --------------------------------------------------------------------------------
|
|
|
|
|
# Generate the data
|
|
|
|
|
# --------------------------------------------------------------------------------
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
"""
|
|
|
|
|
The following categories are used to classify the records:
|
|
|
|
|
|
|
|
|
|
SB: sinus bradycardia
|
|
|
|
|
AFIB: atrial fibrillation and atrial flutter (AFL)
|
|
|
|
|
GSVT: supraventricular tachycardia, atrial tachycardia, AV nodal reentrant tachycardia, AV reentrant tachycardia, atrial pacemaker
|
|
|
|
|
SR: sinus rhythm and sinus irregularity
|
|
|
|
|
"""
|
|
|
|
|
categories = {
|
|
|
|
|
def get_diagnosis_name(diagnosis):
    """
    Looks up the full condition name for each diagnosis code.

    Uses the module-level ``diagnosis_lookup`` DataFrame, reading its
    'Snomed_CT' and 'Full Name' columns.

    Args:
        diagnosis (iterable): Diagnosis codes to look up.

    Returns:
        list: One string per code, namely the 'Full Name' values of the rows
            whose 'Snomed_CT' equals the code, rendered with
            ``to_string(index=False)``.
    """
    names = []
    for code in diagnosis:
        # Select the lookup rows that belong to this code
        matching_rows = diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == code]
        names.append(matching_rows['Full Name'].to_string(index=False))
    return names
|
|
|
|
|
|
|
|
|
|
def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
    """
    Keeps only the rows whose record carries exclusively allowed diagnosis codes.

    Args:
        df_dict (dict): Maps a key to a pandas DataFrame whose index values
            are matched against the keys of ``diagnosis_dict``.
        diagnosis_dict (dict): Maps a record identifier to an iterable of its
            diagnosis codes (codes must be hashable).
        filter_codes_df (pandas.DataFrame): Table whose 'Snomed_CT' column
            lists the allowed codes.

    Returns:
        dict: Same keys as ``df_dict``; each DataFrame is reduced to the rows
            whose index value is a record with only allowed codes. Records
            with no codes at all are kept; index values missing from
            ``diagnosis_dict`` are dropped.
    """
    # Build the allowed-code set once so each membership test is O(1).
    # Code 0 is always accepted in addition to the listed codes.
    # NOTE(review): presumably 0 stands for "no diagnosis" - confirm.
    allowed_codes = set(filter_codes_df['Snomed_CT'])
    allowed_codes.add(0)

    # Records for which every diagnosis code is allowed
    kept_records = [
        record_id
        for record_id, codes in diagnosis_dict.items()
        if all(code in allowed_codes for code in codes)
    ]

    return {key: df.loc[df.index.isin(kept_records)] for key, df in df_dict.items()}
|
|
|
|
|
|
|
|
|
|
# Directories and file paths
project_dir = 'C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
data_dir = project_dir + '/WFDBRecords'
# Derived from project_dir so the dataset location is spelled out only once
path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'
|
|
|
|
|
|
|
|
|
|
# Explore the data
|
|
|
|
|
diagnosis_lookup = pd.read_csv(path_diag_lookup)
|
|
|
|
|
|
|
|
|
|
# SNOMED-CT codes that assign a record to each rhythm category.
# Only membership of a code in these lists is tested, so each code appears once.
categories = {
    'SB': [426177001],
    'AFIB': [164889003, 164890007],
    # NOTE(review): the category description names five conditions for GSVT,
    # but only four distinct codes are listed (713422000 used to appear
    # twice) - presumably a fifth code was intended; confirm against
    # ConditionNames_SNOMED-CT.csv.
    'GSVT': [426761007, 713422000, 233896004, 233897008],
    'SR': [426783006, 427393009],
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
diag_dict = {k: [] for k in categories.keys()}
|
|
|
|
|
diag_dict = {k: [] for k in categories.keys()}
|
|
|
|
|
counter = 0
|
|
|
|
|
max_counter = 100_000
|
|
|
|
|
|
|
|
|
|
# Create a counter for the number of records
|
|
|
|
|
counter = 0
|
|
|
|
|
max_counter = 100_000
|
|
|
|
|
failed_records = []
|
|
|
|
|
# Loop through the records
|
|
|
|
|
for dir_th in os.listdir(data_dir):
|
|
|
|
|
path_to_1000_records = data_dir + '/' + dir_th
|
|
|
|
|
for dir_hd in os.listdir(path_to_1000_records):
|
|
|
|
|
path_to_100_records = path_to_1000_records + '/' + dir_hd
|
|
|
|
|
for record_name in os.listdir(path_to_100_records):
|
|
|
|
|
# check if .hea is in the record_name
|
|
|
|
|
if '.hea' not in record_name:
|
|
|
|
|
continue
|
|
|
|
|
# Remove the .hea extension from record_name
|
|
|
|
|
record_name = record_name.replace('.hea', '')
|
|
|
|
|
try:
|
|
|
|
|
# Read the record
|
|
|
|
|
record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
|
|
|
|
|
# Get the diagnosis
|
|
|
|
|
diagnosis = np.array(get_diagnosis_ids(record))
|
|
|
|
|
# check if diagnosis is a subset of one of the categories
|
|
|
|
|
for category_name, category_codes in categories.items():
|
|
|
|
|
# if any of the diagnosis codes is in the category_codes
|
|
|
|
|
if any(i in category_codes for i in diagnosis):
|
|
|
|
|
diag_dict[category_name].append(record)
|
|
|
|
|
break
|
|
|
|
|
# Increment the counter of how many records we have read
|
|
|
|
|
counter += 1
|
|
|
|
|
counter_bool = counter >= max_counter
|
|
|
|
|
# Break the loop if we have read max_counter records
|
|
|
|
|
if counter % 100 == 0:
|
|
|
|
|
print(f"Read {counter} records")
|
|
|
|
|
if counter_bool:
|
|
|
|
|
for dir_th in os.listdir(data_dir):
|
|
|
|
|
path_to_1000_records = data_dir + '/' + dir_th
|
|
|
|
|
for dir_hd in os.listdir(path_to_1000_records):
|
|
|
|
|
path_to_100_records = path_to_1000_records + '/' + dir_hd
|
|
|
|
|
for record_name in os.listdir(path_to_100_records):
|
|
|
|
|
if '.hea' not in record_name:
|
|
|
|
|
continue
|
|
|
|
|
record_name = record_name.replace('.hea', '')
|
|
|
|
|
try:
|
|
|
|
|
record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
|
|
|
|
|
diagnosis = np.array(get_diagnosis_ids(record))
|
|
|
|
|
for category_name, category_codes in categories.items():
|
|
|
|
|
if any(i in category_codes for i in diagnosis):
|
|
|
|
|
diag_dict[category_name].append(record)
|
|
|
|
|
break
|
|
|
|
|
except Exception as e:
|
|
|
|
|
failed_records.append(record_name)
|
|
|
|
|
print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")
|
|
|
|
|
if counter_bool:
|
|
|
|
|
break
|
|
|
|
|
counter += 1
|
|
|
|
|
counter_bool = counter >= max_counter
|
|
|
|
|
if counter % 100 == 0:
|
|
|
|
|
print(f"Gelesen {counter} Datensätze")
|
|
|
|
|
if counter_bool:
|
|
|
|
|
break
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Fehler beim Lesen des Datensatzes {record_name}: {e}")
|
|
|
|
|
if counter_bool:
|
|
|
|
|
break
|
|
|
|
|
if counter_bool:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# write to pickle
|
|
|
|
|
# Create the output directory once, before the loop; exist_ok avoids the
# separate existence check (and the race between checking and creating)
os.makedirs('./data', exist_ok=True)

# Write the records of every category to its own pickle file
for cat_name, records in diag_dict.items():
    print(f"Writing {cat_name} to pickle with {len(records)} records")
    with open(f'./data/{cat_name}.pkl', 'wb') as f:
        pickle.dump(records, f)
|
|
|
|
|
# Create the output directory once, before the loop; exist_ok avoids the
# separate existence check (and the race between checking and creating)
os.makedirs('./data', exist_ok=True)

# Write the records of every category to its own bz2-compressed pickle file
for cat_name, records in diag_dict.items():
    print(f"Schreibe {cat_name} in eine komprimierte Datei mit {len(records)} Datensätzen")
    compressed_filename = f'./data/{cat_name}.pkl.bz2'
    with bz2.open(compressed_filename, 'wb') as f:
        pickle.dump(records, f)
|
|
|
|
|