readme

2024-06-05 12:04:33 +02:00 · 2024-06-05 12:04:33 +02:00 · 28a34f3ff3
parent 8709e67500
commit 28a34f3ff3
2 changed files with 59 additions and 86 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 This project was developed through the Data Science and Analytics course at the Mannheim University of Applied Sciences. A data science cycle was taught theoretically on the basis of lectures and implemented practically in the project. 

-## Analysis of cardiovascular diseases using ECG data
+# Analysis of cardiovascular diseases using ECG data


 ## Table of Contents
--- a/skripts/generate_data.py
+++ b/skripts/generate_data.py
@@ -1,105 +1,78 @@
-"""
-This script reads the WFDB records and extracts the diagnosis information from the comments.
-The diagnosis information is then used to classify the records into categories.
-The categories are defined by the diagnosis codes in the comments.
-The records are then saved to pickle files based on the categories.
-"""
-
 import wfdb
 import os
-import numpy as np
 import pickle
+import bz2
+import numpy as np
+import pandas as pd

-# Directories and file paths
-# --------------------------------------------------------------------------------
-# NOTE: Specify the directory where the WFDB records are stored
-project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
-data_dir = project_dir + '/WFDBRecords'
-path_diag_lookup = project_dir + "/ConditionNames_SNOMED-CT.csv"
- # --------------------------------------------------------------------------------
-# Functions
+# Funktionen zum Bearbeiten der Daten
 def get_diagnosis_ids(record):
-    """
-    Extracts diagnosis IDs from a record and returns them as a list.
-    Args:
-        record (object): The record object containing the diagnosis information.
-    Returns:
-        list: A list of diagnosis IDs extracted from the record.
-    """
-    # Get the diagnosis
    diagnosis = record.comments[2]
-    # clean the diagnosis
    diagnosis = diagnosis.replace('Dx: ', '')
    list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]
    return list_diagnosis
-# --------------------------------------------------------------------------------
-# Generate the data
-# --------------------------------------------------------------------------------
-if __name__ == '__main__':
-    """
-    The following categories are used to classify the records:

-    SB,    Sinusbradykardie
-    AFIB,  Vorhofflimmern und Vorhofflattern (AFL)
-    GSVT,  supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher
-    SR     Sinusrhythmus und Sinusunregelmäßigkeiten
-    """
-    categories = {
+def get_diagnosis_name(diagnosis):
+    name = [diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == x]['Full Name'].to_string(index=False) for x in diagnosis]
+    return name
+
+def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
+    filter_cod_li = list(filter_codes_df['Snomed_CT']) + [0]
+    filter_dict_diag = {k: v for k, v in diagnosis_dict.items() if all(i in filter_cod_li for i in v)}
+    filtered_df_dict = {i: df.loc[df.index.isin(filter_dict_diag.keys())] for i, df in df_dict.items()}
+    return filtered_df_dict
+
+# Verzeichnisse und Dateipfade
+project_dir = 'C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
+data_dir = project_dir + '/WFDBRecords'
+path_diag_lookup = "C:/Users/arman/PycharmProjects/pythonProject/DSA/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"
+
+# Daten erkunden
+diagnosis_lookup = pd.read_csv(path_diag_lookup)
+
+categories = {
    'SB':    [426177001],
    'AFIB':  [164889003, 164890007],
    'GSVT':  [426761007, 713422000, 233896004, 233897008, 713422000],
    'SR':    [426783006, 427393009]
-    }
+}

-    diag_dict = {k: [] for k in categories.keys()}
+diag_dict = {k: [] for k in categories.keys()}
+counter = 0
+max_counter = 100_000

-    # Create a counter for the number of records
-    counter = 0
-    max_counter = 100_000
-    failed_records = []
-    # Loop through the records
-    for dir_th in os.listdir(data_dir):
-        path_to_1000_records = data_dir + '/' + dir_th
-        for dir_hd in os.listdir(path_to_1000_records):
-            path_to_100_records = path_to_1000_records + '/' + dir_hd
-            for record_name in os.listdir(path_to_100_records):
-                # check if .hea is in the record_name 
-                if '.hea' not in record_name:
-                    continue
-                # Remove the .hea extension from record_name
-                record_name = record_name.replace('.hea', '')
-                try:
-                    # Read the record
-                    record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
-                    # Get the diagnosis
-                    diagnosis = np.array(get_diagnosis_ids(record))
-                    # check if diagnosis is a subset of one of the categories
-                    for category_name, category_codes in categories.items():
-                        # if any of the diagnosis codes is in the category_codes
-                        if any(i in category_codes for i in diagnosis):
-                            diag_dict[category_name].append(record)
-                            break
-                    # Increment the counter of how many records we have read
-                    counter += 1
-                    counter_bool = counter >= max_counter
-                    # Break the loop if we have read max_counter records
-                    if counter % 100 == 0:
-                        print(f"Read {counter} records")
-                    if counter_bool:
+for dir_th in os.listdir(data_dir):
+    path_to_1000_records = data_dir + '/' + dir_th
+    for dir_hd in os.listdir(path_to_1000_records):
+        path_to_100_records = path_to_1000_records + '/' + dir_hd
+        for record_name in os.listdir(path_to_100_records):
+            if '.hea' not in record_name:
+                continue
+            record_name = record_name.replace('.hea', '')
+            try:
+                record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
+                diagnosis = np.array(get_diagnosis_ids(record))
+                for category_name, category_codes in categories.items():
+                    if any(i in category_codes for i in diagnosis):
+                        diag_dict[category_name].append(record)
                        break
-                except Exception as e:
-                    failed_records.append(record_name)
-                    print(f"Failed to read record {record_name} due to ValueError. Sum of failed records: {len(failed_records)}")
-            if counter_bool:
-                break
+                counter += 1
+                counter_bool = counter >= max_counter
+                if counter % 100 == 0:
+                    print(f"Gelesen {counter} Datensätze")
+                if counter_bool:
+                    break
+            except Exception as e:
+                print(f"Fehler beim Lesen des Datensatzes {record_name}: {e}")
        if counter_bool:
            break
+    if counter_bool:
+        break

-    # write to pickle
-    for cat_name, records in diag_dict.items():
-        print(f"Writing {cat_name} to pickle with {len(records)} records")
-        # if path not exists create it
-        if not os.path.exists('./data'):
-            os.makedirs('./data')
-        with open(f'./data/{cat_name}.pkl', 'wb') as f:
-            pickle.dump(records, f)
+for cat_name, records in diag_dict.items():
+    print(f"Schreibe {cat_name} in eine komprimierte Datei mit {len(records)} Datensätzen")
+    if not os.path.exists('./data'):
+        os.makedirs('./data')
+    compressed_filename = f'./data/{cat_name}.pkl.bz2'
+    with bz2.open(compressed_filename, 'wb') as f:
+        pickle.dump(records, f)