readme

2024-06-05 12:04:33 +02:00 · 2024-06-05 12:04:33 +02:00 · 28a34f3ff3
parent 8709e67500
commit 28a34f3ff3
2 changed files with 59 additions and 86 deletions
--- a/README.md
+++ b/README.md
@ -2,7 +2,7 @@
 This project was developed as part of the Data Science and Analytics course at the Mannheim University of Applied Sciences. The data science cycle was taught theoretically in lectures and implemented practically in this project.
-## Analysis of cardiovascular diseases using ECG data
+# Analysis of cardiovascular diseases using ECG data
 ## Table of Contents
--- a/skripts/generate_data.py
+++ b/skripts/generate_data.py
@ -1,105 +1,78 @@
 """
 This script reads the WFDB records and extracts the diagnosis information from the comments.
 The diagnosis information is then used to classify the records into categories.
 The categories are defined by the diagnosis codes in the comments.
 The records are then saved to pickle files based on the categories.
 """
 import wfdb
 import os
 import numpy as np
 import pickle
 import bz2
 import numpy as np
 import pandas as pd
-# Directories and file paths
+# Funktionen zum Bearbeiten der Daten
 # --------------------------------------------------------------------------------
 # NOTE: Specify the directory where the WFDB records are stored
 project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
 data_dir = project_dir + '/WFDBRecords'
 path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"
 # --------------------------------------------------------------------------------
 # Functions
 def get_diagnosis_ids(record):
     """
     Extracts diagnosis IDs from a record and returns them as a list.

     The diagnosis is taken from the header comment that starts with ``Dx:``
     (for example ``'Dx: 164889003,59118001'``). The comment is located by its
     prefix rather than by a fixed position, so records whose comments are
     ordered differently are still parsed.

     Args:
         record (object): The record object containing the diagnosis
             information. It must expose a ``comments`` sequence of strings.

     Returns:
         list: A list of diagnosis IDs (integers) in the order they appear in
             the comment. Empty if the ``Dx:`` comment lists no IDs.

     Raises:
         ValueError: If no comment starts with ``Dx:`` or if one of the
             listed IDs is not an integer.
     """
     prefix = 'Dx:'
     # Find the diagnosis comment by prefix instead of assuming it is comments[2]
     dx_comment = next(
         (c.strip() for c in record.comments if c.strip().startswith(prefix)),
         None,
     )
     if dx_comment is None:
         raise ValueError("Record has no 'Dx:' comment")
     # Drop the prefix, then split the comma-separated IDs
     raw_ids = dx_comment[len(prefix):].split(',')
     # Blank entries (empty list, trailing comma) are skipped instead of
     # crashing in int('')
     return [int(token) for token in (part.strip() for part in raw_ids) if token]
 # --------------------------------------------------------------------------------
 # Generate the data
 # --------------------------------------------------------------------------------
 if __name__ == '__main__':
    """
    The following categories are used to classify the records:
-    SB,    Sinusbradykardie
+def get_diagnosis_name(diagnosis):
-    AFIB,  Vorhofflimmern und Vorhofflattern (AFL)
+    name = [diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == x]['Full Name'].to_string(index=False) for x in diagnosis]
-    GSVT,  supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher
+    return name
-    SR     Sinusrhythmus und Sinusunregelmäßigkeiten
+
-    """
+def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
-    categories = {
+    filter_cod_li = list(filter_codes_df['Snomed_CT']) + [0]
    filter_dict_diag = {k: v for k, v in diagnosis_dict.items() if all(i in filter_cod_li for i in v)}
    filtered_df_dict = {i: df.loc[df.index.isin(filter_dict_diag.keys())] for i, df in df_dict.items()}
    return filtered_df_dict
 # Verzeichnisse und Dateipfade
 project_dir = 'C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
 data_dir = project_dir + '/WFDBRecords'
 path_diag_lookup = "C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"
 # Daten erkunden
 diagnosis_lookup = pd.read_csv(path_diag_lookup)
 categories = {
    'SB':    [426177001],
    'AFIB':  [164889003, 164890007],
    'GSVT':  [426761007, 713422000, 233896004, 233897008, 713422000],
    'SR':    [426783006, 427393009]
-    }
+}
-    diag_dict = {k: [] for k in categories.keys()}
+diag_dict = {k: [] for k in categories.keys()}
 counter = 0
 max_counter = 100_000
-    # Create a counter for the number of records
+for dir_th in os.listdir(data_dir):
-    counter = 0
+    path_to_1000_records = data_dir + '/' + dir_th
-    max_counter = 100_000
+    for dir_hd in os.listdir(path_to_1000_records):
-    failed_records = []
+        path_to_100_records = path_to_1000_records + '/' + dir_hd
-    # Loop through the records
+        for record_name in os.listdir(path_to_100_records):
-    for dir_th in os.listdir(data_dir):
+            if '.hea' not in record_name:
-        path_to_1000_records = data_dir + '/' + dir_th
+                continue
-        for dir_hd in os.listdir(path_to_1000_records):
+            record_name = record_name.replace('.hea', '')
-            path_to_100_records = path_to_1000_records + '/' + dir_hd
+            try:
-            for record_name in os.listdir(path_to_100_records):
+                record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
-                # check if .hea is in the record_name 
+                diagnosis = np.array(get_diagnosis_ids(record))
-                if '.hea' not in record_name:
+                for category_name, category_codes in categories.items():
-                    continue
+                    if any(i in category_codes for i in diagnosis):
-                # Remove the .hea extension from record_name
+                        diag_dict[category_name].append(record)
                record_name = record_name.replace('.hea', '')
                try:
                    # Read the record
                    record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
                    # Get the diagnosis
                    diagnosis = np.array(get_diagnosis_ids(record))
                    # check if diagnosis is a subset of one of the categories
                    for category_name, category_codes in categories.items():
                        # if any of the diagnosis codes is in the category_codes
                        if any(i in category_codes for i in diagnosis):
                            diag_dict[category_name].append(record)
                            break
                    # Increment the counter of how many records we have read
                    counter += 1
                    counter_bool = counter >= max_counter
                    # Break the loop if we have read max_counter records
                    if counter % 100 == 0:
                        print(f"Read {counter} records")
                    if counter_bool:
                        break
-                except Exception as e:
+                counter += 1
-                    failed_records.append(record_name)
+                counter_bool = counter >= max_counter
-                    print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")
+                if counter % 100 == 0:
-            if counter_bool:
+                    print(f"Gelesen {counter} Datensätze")
-                break
+                if counter_bool:
                    break
            except Exception as e:
                print(f"Fehler beim Lesen des Datensatzes {record_name}: {e}")
        if counter_bool:
            break
    if counter_bool:
        break
-    # write to pickle
+for cat_name, records in diag_dict.items():
-    for cat_name, records in diag_dict.items():
+    print(f"Schreibe {cat_name} in eine komprimierte Datei mit {len(records)} Datensätzen")
-        print(f"Writing {cat_name} to pickle with {len(records)} records")
+    if not os.path.exists('./data'):
-        # if path not exists create it
+        os.makedirs('./data')
-        if not os.path.exists('./data'):
+    compressed_filename = f'./data/{cat_name}.pkl.bz2'
-            os.makedirs('./data')
+    with bz2.open(compressed_filename, 'wb') as f:
-        with open(f'./data/{cat_name}.pkl', 'wb') as f:
+        pickle.dump(records, f)
            pickle.dump(records, f)