Compare commits

..

No commits in common. "a788038217c543a8ec710ea5ec11f6dff88339b0" and "bc4897dacb83ed2c3722b7b5d66b2fff23fdcb6a" have entirely different histories.

2 changed files with 85 additions and 62 deletions

View File

@ -2,12 +2,8 @@
This project was developed through the Data Science and Analytics course at the Mannheim University of Applied Sciences. A data science cycle was taught theoretically on the basis of lectures and implemented practically in the project.
## Analysis of cardiovascular diseases using ECG data
## Table of Contents

View File

@ -1,78 +1,105 @@
"""
This script reads the WFDB records and extracts the diagnosis information from the comments.
The diagnosis information is then used to classify the records into categories.
The categories are defined by the diagnosis codes in the comments.
The records are then saved to pickle files based on the categories.
"""
import wfdb import wfdb
import os import os
import pickle
import bz2
import numpy as np import numpy as np
import pandas as pd import pickle
# Directories and file paths
# --------------------------------------------------------------------------------
# NOTE: Specify the directory where the WFDB records are stored, either by
# editing the default below or by setting the ECG_PROJECT_DIR environment
# variable (the environment variable takes precedence when it is set).
project_dir = os.environ.get(
    'ECG_PROJECT_DIR',
    'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0',
)
# Folder that holds the nested WFDB record directories.
data_dir = project_dir + '/WFDBRecords'
# CSV file shipped next to the records; its name indicates a SNOMED-CT
# condition-name lookup table.
path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"
# --------------------------------------------------------------------------------
# Functions
def get_diagnosis_ids(record):
    """
    Extracts diagnosis IDs from a record and returns them as a list.

    The diagnosis line is the header comment that starts with ``Dx:``. If no
    comment carries that prefix, the third comment (index 2) is used, which is
    the position the diagnosis line was previously read from.

    Args:
        record (object): The record object containing the diagnosis
            information. It must expose a ``comments`` sequence of strings.

    Returns:
        list: A list of diagnosis IDs (as ``int``) extracted from the record.
            Empty tokens (e.g. caused by a trailing comma) are skipped.

    Raises:
        IndexError: If no ``Dx:`` comment exists and the record has fewer
            than three comments.
        ValueError: If a diagnosis token is not an integer.
    """
    dx_prefix = 'Dx:'
    # Prefer locating the diagnosis line by its prefix instead of trusting a
    # fixed position in the header comments.
    diagnosis = next(
        (comment for comment in record.comments
         if comment.strip().startswith(dx_prefix)),
        None,
    )
    if diagnosis is None:
        diagnosis = record.comments[2]
    # Clean the diagnosis: drop the prefix, keep only the comma-separated codes.
    diagnosis = diagnosis.strip()
    if diagnosis.startswith(dx_prefix):
        diagnosis = diagnosis[len(dx_prefix):]
    # int() tolerates surrounding whitespace, so only empty tokens are filtered.
    list_diagnosis = [int(code) for code in diagnosis.split(',') if code.strip()]
    return list_diagnosis
# --------------------------------------------------------------------------------
# Generate the data
# --------------------------------------------------------------------------------
def _iter_record_paths(root_dir):
    """
    Yields ``(record_name, record_path)`` for every ``.hea`` header file that
    sits two directory levels below ``root_dir``.

    Args:
        root_dir (str): Directory whose sub-sub-directories contain the records.

    Yields:
        tuple: The record name (file name without ``.hea``) and the path to
            the record without extension, as passed to ``wfdb.rdrecord``.
    """
    for dir_th in os.listdir(root_dir):
        path_to_1000_records = root_dir + '/' + dir_th
        # Skip plain files that live next to the record folders.
        if not os.path.isdir(path_to_1000_records):
            continue
        for dir_hd in os.listdir(path_to_1000_records):
            path_to_100_records = path_to_1000_records + '/' + dir_hd
            if not os.path.isdir(path_to_100_records):
                continue
            for file_name in os.listdir(path_to_100_records):
                # Only header files identify a record; match the suffix, not
                # an arbitrary substring.
                if not file_name.endswith('.hea'):
                    continue
                record_name = file_name[:-len('.hea')]
                yield record_name, path_to_100_records + '/' + record_name


if __name__ == '__main__':
    # The following categories are used to classify the records:
    #   SB,   sinus bradycardia
    #   AFIB, atrial fibrillation and atrial flutter (AFL)
    #   GSVT, supraventricular tachycardia, atrial tachycardia, AV-node
    #         re-entrant tachycardia, AV re-entrant tachycardia, atrial pacemaker
    #   SR,   sinus rhythm and sinus irregularity
    categories = {
        'SB': [426177001],
        'AFIB': [164889003, 164890007],
        # NOTE(review): 713422000 is listed twice; possibly a different code
        # was intended for the second entry - confirm against the lookup table.
        'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],
        'SR': [426783006, 427393009]
    }

    diag_dict = {k: [] for k in categories}

    # Counter for the number of successfully read records and its upper limit
    counter = 0
    max_counter = 100_000
    failed_records = []

    # Loop through the records
    for record_name, record_path in _iter_record_paths(data_dir):
        try:
            # Read the record and get its diagnosis codes
            record = wfdb.rdrecord(record_path)
            diagnosis = get_diagnosis_ids(record)
        except Exception as e:
            # Best effort: remember the failure and carry on with the next record.
            failed_records.append(record_name)
            print(f"Failed to read record {record_name} due to {type(e).__name__}: {e}. Sum of failed records: {len(failed_records)}")
            continue

        # Assign the record to the first category sharing a diagnosis code
        for category_name, category_codes in categories.items():
            if any(code in category_codes for code in diagnosis):
                diag_dict[category_name].append(record)
                break

        # Increment the counter of how many records we have read
        counter += 1
        if counter % 100 == 0:
            print(f"Read {counter} records")
        # Stop once max_counter records have been read
        if counter >= max_counter:
            break

    # Write one pickle file per category; create the output folder once.
    os.makedirs('./data', exist_ok=True)
    for cat_name, records in diag_dict.items():
        print(f"Writing {cat_name} to pickle with {len(records)} records")
        with open(f'./data/{cat_name}.pkl', 'wb') as f:
            pickle.dump(records, f)