DSA_SS24/skripts/generate_data.py

165 lines
5.7 KiB
Python
Raw Normal View History

2024-05-01 09:56:36 +02:00
import wfdb
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
2024-05-01 12:53:33 +02:00
import pickle
2024-05-01 09:56:36 +02:00
# Directories and file paths
# --------------------------------------------------------------------------------
# Root of the locally downloaded PhysioNet 12-lead ECG database (machine-specific path)
project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
# Directory where the WFDB records are stored
data_dir = project_dir + '/WFDBRecords'
# SNOMED-CT lookup table; derived from project_dir so the root only has to be changed in one place
path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'
# --------------------------------------------------------------------------------
def get_diagnosis_ids(record):
    """Return the diagnosis codes listed in a record's header comments.

    The codes are read from the first comment that starts with ``Dx:``. If no
    comment carries that prefix, the third comment (index 2) is used instead.

    Args:
        record: Object exposing a ``comments`` sequence of strings, such as
            the record returned by ``wfdb.rdrecord``.

    Returns:
        The diagnosis codes as a list of ``int`` in the order they appear.
        The list is empty when the diagnosis line contains no codes.

    Raises:
        IndexError: No ``Dx:`` comment exists and there are fewer than three
            comments.
        ValueError: A non-empty entry cannot be parsed as an integer.
    """
    dx_prefix = 'Dx:'
    # Look the diagnosis line up by its prefix instead of trusting its position
    dx_line = next(
        (comment for comment in record.comments
         if comment.strip().startswith(dx_prefix)),
        None,
    )
    if dx_line is None:
        # Positional fallback for headers whose diagnosis line has no prefix
        dx_line = record.comments[2]
    codes_text = dx_line.strip()
    if codes_text.startswith(dx_prefix):
        codes_text = codes_text[len(dx_prefix):]
    # Skip empty entries (e.g. a trailing comma); int() tolerates surrounding spaces
    return [int(token) for token in codes_text.split(',') if token.strip()]
def get_diagnosis_name(diagnosis):
    """Look up the full name for each diagnosis code.

    Reads the module-level ``diagnosis_lookup`` DataFrame, matching each code
    against its ``Snomed_CT`` column and taking the ``Full Name`` column of
    the matching rows.

    Args:
        diagnosis: Iterable of diagnosis codes.

    Returns:
        A list with one string per code, in input order. Each string is the
        ``to_string(index=False)`` rendering of the matching ``Full Name``
        values, so a code with no match or several matches yields whatever
        pandas renders for that selection.
    """
    names = []
    for code in diagnosis:
        is_match = diagnosis_lookup['Snomed_CT'] == code
        matched_rows = diagnosis_lookup[is_match]
        names.append(matched_rows['Full Name'].to_string(index=False))
    return names
def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
    """Keep only the rows of records whose diagnoses are all allowed codes.

    Args:
        df_dict: Mapping of name -> DataFrame whose index holds record ids.
        diagnosis_dict: Mapping of record id -> iterable of diagnosis codes.
        filter_codes_df: DataFrame whose ``Snomed_CT`` column lists the
            allowed codes.

    Returns:
        A dict with the same keys as ``df_dict``. Each DataFrame is reduced
        to the rows whose index is a record id for which every code is
        allowed. A record with no codes at all is kept.
    """
    # Build the allowed codes once as a set for O(1) membership tests;
    # 0 is always allowed because it is used as padding
    allowed_codes = set(filter_codes_df['Snomed_CT'])
    allowed_codes.add(0)
    # Record ids whose codes are all contained in the allowed set
    kept_ids = [
        record_id
        for record_id, codes in diagnosis_dict.items()
        if allowed_codes.issuperset(codes)
    ]
    return {
        name: df.loc[df.index.isin(kept_ids)]
        for name, df in df_dict.items()
    }
# --------------------------------------------------------------------------------
# Explore the data
# --------------------------------------------------------------------------------
# Load the diagnosis lookup table from the CSV at path_diag_lookup.
# get_diagnosis_name reads its 'Snomed_CT' and 'Full Name' columns.
diagnosis_lookup = pd.read_csv(path_diag_lookup)
#print(diagnosis_lookup.head())
# Filter data based on the diagnosis
# ----------------------------------------------
# Rhythm categories and what each one groups together:
#   SB:   sinus bradycardia
#   AFIB: atrial fibrillation and atrial flutter (AFL)
#   GSVT: supraventricular tachycardia, atrial tachycardia, AV node reentrant
#         tachycardia, AV reentrant tachycardia, atrial pacemaker
#   SR:   sinus rhythm and sinus irregularity
# (atrial pacemaker = 713422000)
#
# Maps category name -> list of SNOMED-CT codes; each code is listed once.
categories = {
    'SB': [426177001],
    'AFIB': [164889003, 164890007],
    'GSVT': [426761007, 713422000, 233896004, 233897008],
    'SR': [426783006, 427393009],
}
2024-05-01 12:53:33 +02:00
def _iter_record_paths(root_dir):
    """Yield ``(record_name, record_path)`` for every ``.hea`` file below root_dir.

    Walks the two directory levels below ``root_dir`` in ``os.listdir`` order.
    Entries on those levels that are not directories are skipped. Both yielded
    values have the ``.hea`` extension removed.
    """
    header_ext = '.hea'
    for dir_th in os.listdir(root_dir):
        path_to_1000_records = root_dir + '/' + dir_th
        if not os.path.isdir(path_to_1000_records):
            continue
        for dir_hd in os.listdir(path_to_1000_records):
            path_to_100_records = path_to_1000_records + '/' + dir_hd
            if not os.path.isdir(path_to_100_records):
                continue
            for file_name in os.listdir(path_to_100_records):
                # Only header files identify a record
                if not file_name.endswith(header_ext):
                    continue
                record_name = file_name[:-len(header_ext)]
                yield record_name, path_to_100_records + '/' + record_name


# One list of records per category
diag_dict = {k: [] for k in categories}
# Number of records read successfully so far
counter = 0
# Stop after this many successfully read records
max_counter = 100  # 100_000

# Loop through the records
for record_name, record_path in _iter_record_paths(data_dir):
    try:
        # Read the record and extract its diagnosis codes
        record = wfdb.rdrecord(record_path)
        diagnosis = get_diagnosis_ids(record)
    except Exception as e:
        # Best effort: report the actual error and carry on with the next record
        print(f"Failed to read record {record_name}: {type(e).__name__}: {e}")
        continue
    # Add the record to the first category that shares at least one code with it
    for category_name, category_codes in categories.items():
        if any(code in category_codes for code in diagnosis):
            diag_dict[category_name].append(record)
            break
    counter += 1
    # Progress output every 100 records
    if counter % 100 == 0:
        print(f"Read {counter} records")
    # Stop once max_counter records have been read
    if counter >= max_counter:
        break
"""
if any(i in category_codes for i in diagnosis):
ID: SB, Count: 16559
ID: AFIB, Count: 9839
ID: GSVT, Count: 948
ID: SR, Count: 9720
break
The counter indicates whether at least one diagnosis is in a category
---------------------------------------------------------------------------------------------------------------------
set(diagnosis).issubset(set(category_codes)):
ID: SB, Count: 8909
ID: AFIB, Count: 1905
ID: GSVT, Count: 431
ID: SR, Count: 7299
break
The counter indicates whether all diagnoses are in a category
"""
2024-05-01 12:53:33 +02:00
# for id, count in diag_dict.items():
# print(f"ID: {id}, Count: {count}")
2024-05-01 09:56:36 +02:00
2024-05-01 12:53:33 +02:00
# write to pickle
2024-05-01 09:56:36 +02:00
2024-05-01 12:53:33 +02:00
# Create the output directory once up front; exist_ok makes this a no-op when
# it already exists and avoids a separate check-then-create step
output_dir = './data'
os.makedirs(output_dir, exist_ok=True)
# Write one pickle file per category containing its list of records
for cat_name, records in diag_dict.items():
    print(f"Writing {cat_name} to pickle with {len(records)} records")
    with open(f'{output_dir}/{cat_name}.pkl', 'wb') as f:
        pickle.dump(records, f)
2024-05-01 09:56:36 +02:00