From 588e20b7b37c2d6700df43724fe47fa20b3a4027 Mon Sep 17 00:00:00 2001
From: Felix Mucha <3016498@stud.hs-mannheim.de>
Date: Wed, 1 May 2024 09:56:36 +0200
Subject: [PATCH] first structure

---
 README.md             |  48 +++++++++++
 skripts/clean_data.py | 180 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 README.md
 create mode 100644 skripts/clean_data.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c686844
--- /dev/null
+++ b/README.md
@@ -0,0 +1,48 @@
+# HSMA Data Science and Analytics SS2024
+## ECG Data Classification and Segmentation
+
+[This project aims to classify ... and segment ... ECG data]
+
+## Table of Contents
+- [About](#about)
+- [Getting Started](#getting-started)
+- [Usage](#usage)
+- [Progress](#progress)
+- [Contributing](#contributing)
+- [License](#license)
+
+## About
+[Provide a brief overview of the project, including its purpose and any relevant background information.]
+
+## Getting Started
+[Instructions on how to get the project up and running on a local machine. Include prerequisites, installation steps, and any other necessary setup.]
+
+### Prerequisites
+[List any software or tools that need to be installed before running the project.]
+
+### Installation
+[Step-by-step guide on how to install the project.]
+
+## Usage
+[Provide examples and instructions for using the project. Include any relevant code snippets or screenshots.]
+
+## Progress
+- Data was searched for and found at: https://doi.org/10.13026/wgex-er52
+- Data was cleaned:
+  - Docker container with MongoDB, because the data set is about 10 GB and contains many arrays
+  - Diagnosis converted from a string to a list of SNOMED CT codes (see the sketch below)
+  - Data filtered to contain only healthy records and the needed diagnosis data
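+
+A minimal sketch of the string-to-list step above (mirroring `get_diagnosis_ids` in `skripts/clean_data.py`), assuming the WFDB header comment has the form `Dx: 164889003,59118001`:
+
+```python
+dx_line = 'Dx: 164889003,59118001'  # example value, not taken from the data set
+codes = [int(x.strip()) for x in dx_line.replace('Dx: ', '').split(',')]
+print(codes)  # [164889003, 59118001]
+```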
+
+
+## Contributing
+[Explain how others can contribute to the project. This might include guidelines for reporting bugs, submitting enhancements, or proposing new features.]
+
+## License
+[Specify the license under which the project is distributed. Include any additional terms or conditions.]
+
+## Acknowledgements
+[Optional section to thank individuals or organizations that have contributed to the project.]
+
+## Contact
+[Provide contact information for inquiries or support.]
\ No newline at end of file
diff --git a/skripts/clean_data.py b/skripts/clean_data.py
new file mode 100644
index 0000000..cfdf31d
--- /dev/null
+++ b/skripts/clean_data.py
@@ -0,0 +1,180 @@
+import wfdb
+import os
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+
+
+# --------------------------------------------------------------------------------
+# Directories and file paths
+# --------------------------------------------------------------------------------
+# Directory where the WFDB records are stored
+project_dir = 'C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0'
+data_dir = project_dir + '/WFDBRecords'
+# SNOMED CT lookup table shipped with the data set
+path_diag_lookup = project_dir + '/ConditionNames_SNOMED-CT.csv'
+
+# Warn early if the lookup table is missing
+if not os.path.exists(path_diag_lookup):
+    print(f"Lookup file {path_diag_lookup} does not exist")
+
+
+def get_diagnosis_ids(record):
+    # The diagnosis is stored in the third header comment as 'Dx: code1,code2,...'
+    diagnosis = record.comments[2]
+    # Strip the prefix and convert the comma-separated codes to a list of ints
+    diagnosis = diagnosis.replace('Dx: ', '')
+    list_diagnosis = [int(x.strip()) for x in diagnosis.split(',')]
+    return list_diagnosis
+
+
+def get_diagnosis_name(diagnosis):
+    # Resolve SNOMED CT codes to their full names via the lookup table
+    name = [diagnosis_lookup[diagnosis_lookup['Snomed_CT'] == x]['Full Name'].to_string(index=False) for x in diagnosis]
+    return name
+
+
+def filter_signal_df_on_diag(df_dict, diagnosis_dict, filter_codes_df):
+    # Create a list with the filter codes and add 0 for padding
+    filter_cod_li = list(filter_codes_df['Snomed_CT']) + [0]
+    # Keep only records whose diagnosis codes are all contained in the filter codes
+    filter_dict_diag = {k: v for k, v in diagnosis_dict.items() if all(i in filter_cod_li for i in v)}
+    # Filter the signal DataFrames based on the filtered diagnosis dictionary
+    filtered_df_dict = {i: df.loc[df.index.isin(filter_dict_diag.keys())] for i, df in df_dict.items()}
+    return filtered_df_dict
+
+
+# --------------------------------------------------------------------------------
+# Explore the data
+# --------------------------------------------------------------------------------
+# Read the diagnosis lookup table
+diagnosis_lookup = pd.read_csv(path_diag_lookup)
+# print(diagnosis_lookup.head())
+
+# Filter the data based on the diagnosis
+healthy_codes = [426177001, 426783006]
+
+# SNOMED CT codes grouped into diagnosis categories
+# (keys kept in German to match the recorded output below)
+categories = {
+    'Gesund': [426177001, 426783006],  # healthy
+    'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001],  # cardiac arrhythmias
+    'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002],  # conduction disorders
+    'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009],  # ECG wave changes
+    'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],  # voltage changes
+    'Hypertrophien': [164873001, 89792004],  # hypertrophies
+    'QT': [111975006],
+    'Repolarisation': [428417006],  # repolarisation
+    'Myokardinfarkt': [164865005]  # myocardial infarction
+}
+
+# Per-category record counter
+diag_dict = {k: 0 for k in categories.keys()}
+
+# Counter for the number of records and stop flag for the nested loops
+counter = 0
+counter_bool = False
+max_counter = 100_000
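+
+# --------------------------------------------------------------------------------
+# Sketch (not called yet): how filter_signal_df_on_diag above could be wired up
+# once the signals are loaded. 'signal_dfs' (a dict of signal DataFrames whose
+# index holds the record ids) and 'record_diagnoses' (record id -> list of
+# SNOMED CT codes) are hypothetical placeholders; only diagnosis_lookup and the
+# category code lists exist in this script so far.
+def build_filtered_signal_dfs(signal_dfs, record_diagnoses, allowed_codes):
+    # Restrict the lookup table to the allowed codes,
+    # e.g. allowed_codes = healthy_codes + categories['Myokardinfarkt']
+    filter_codes_df = diagnosis_lookup[diagnosis_lookup['Snomed_CT'].isin(allowed_codes)]
+    # Keep only records whose diagnoses all fall within those codes
+    return filter_signal_df_on_diag(signal_dfs, record_diagnoses, filter_codes_df)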
+
+# Loop through the records
+for dir_th in os.listdir(data_dir):
+    path_to_1000_records = data_dir + '/' + dir_th
+    for dir_hd in os.listdir(path_to_1000_records):
+        path_to_100_records = path_to_1000_records + '/' + dir_hd
+        for record_name in os.listdir(path_to_100_records):
+            # Only process header files
+            if not record_name.endswith('.hea'):
+                continue
+            # Remove the .hea extension from record_name
+            record_name = record_name.replace('.hea', '')
+            try:
+                # Read the record
+                record = wfdb.rdrecord(path_to_100_records + '/' + record_name)
+                # Get the diagnosis codes
+                diagnosis = np.array(get_diagnosis_ids(record))
+
+                # Check if the diagnosis is a subset of one of the categories
+                for category_name, category_codes in categories.items():
+                    if set(diagnosis).issubset(set(category_codes)):
+                        # Increment the counter for the category
+                        diag_dict[category_name] += 1
+                        break
+
+                # Increment the record counter
+                counter += 1
+                counter_bool = counter >= max_counter
+                if counter % 100 == 0:
+                    print(f"Read {counter} records")
+                # Break the loop if we have read max_counter records
+                if counter_bool:
+                    break
+            except Exception as e:
+                print(f"Failed to read record {record_name}: {e}")
+        if counter_bool:
+            break
+    if counter_bool:
+        break
+
+"""
+Counts from a previous run of this loop:
+ID: Herzrhythmusstörungen, Count: 22571
+ID: Leitungsstörungen, Count: 505
+ID: EKG-Welle, Count: 2067
+ID: Spannungsänderungen, Count: 613
+ID: Hypertrophien, Count: 5
+ID: QT, Count: 43
+ID: Repolarisation, Count: 73
+ID: Myokardinfarkt, Count: 1
+"""
+
+# # get the data
+# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict()
+
+# # get unique diagnosis codes
+# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
+# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
+# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())
+
+# print(unique_health_codes)
+# print(unique_afib_codes)
+# print(unique_mi_codes)
+
+# print(dict_healthy['JS00004'].__dict__)
+
+
+# print(diag_dict)
+for diag_id, count in diag_dict.items():
+    print(f"ID: {diag_id}, Count: {count}")
+
+print(f'Number of counted diagnosis categories: {len(diag_dict)}')
+print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
+print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
+
+# flatten the counters and count the unique values
+# healthy_counter = np.array(healthy_counter).flatten()
+# afib_counter = np.array(afib_counter).flatten()
+# mi_counter = np.array(mi_counter).flatten()
+
+# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True)
+# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True)
+# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True)
+
+# print(unique_health_codes)
+# print(counts_health)
+# print(unique_afib_codes)
+# print(counts_afib)
+# print(unique_mi_codes)
+# print(counts_mi)
+
+# # get the names of the diagnosis
+# names_health = get_diagnosis_name(unique_health_codes)
+# names_afib = get_diagnosis_name(unique_afib_codes)
+# names_mi = get_diagnosis_name(unique_mi_codes)
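+
+
+# --------------------------------------------------------------------------------
+# Sketch (not called yet): persisting a cleaned record into the MongoDB instance
+# mentioned in the README ("Docker container with MongoDB"). Assumptions: the
+# container exposes the default port on localhost, pymongo is installed, and the
+# database/collection names 'ecg'/'records' are placeholders chosen here.
+def store_record_in_mongodb(record_name, diagnosis_ids, signal, uri='mongodb://localhost:27017'):
+    # Local import so the script keeps running even if pymongo is not installed yet
+    from pymongo import MongoClient
+    collection = MongoClient(uri)['ecg']['records']
+    collection.insert_one({
+        '_id': record_name,                     # e.g. the WFDB record name
+        'diagnosis': [int(d) for d in diagnosis_ids],
+        'signal': np.asarray(signal).tolist(),  # 12-lead samples as nested lists
+    })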