From 45601729d721d815cebf2ff2e45fdec25bfa555b Mon Sep 17 00:00:00 2001
From: Felix Mucha <3016498@stud.hs-mannheim.de>
Date: Wed, 1 May 2024 12:53:33 +0200
Subject: [PATCH] generate data

---
 .gitignore                                  |   1 +
 notebooks/example.ipynb                     |  38 ++++++
 skripts/{clean_data.py => generate_data.py} | 122 +++++++++-----------
 3 files changed, 92 insertions(+), 69 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 notebooks/example.ipynb
 rename skripts/{clean_data.py => generate_data.py} (56%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5fac628
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/data/
\ No newline at end of file
diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb
new file mode 100644
index 0000000..0d5c9cc
--- /dev/null
+++ b/notebooks/example.ipynb
@@ -0,0 +1,38 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "# read the pickle files and print the number of records per category\n",
+    "\n",
+    "\n",
+    "categories = {\n",
+    "'SB': [426177001],\n",
+    "'AFIB': [164889003, 164890007],\n",
+    "'GSVT': [426761007, 713422000, 233896004, 233897008],\n",
+    "'SR': [426783006, 427393009]\n",
+    "}\n",
+    "\n",
+    "\n",
+    "data = {}\n",
+    "for cat_name in categories.keys():\n",
+    "    print(f\"Reading {cat_name}\")\n",
+    "    with open(f'{cat_name}.pkl', 'rb') as f:\n",
+    "        records = pickle.load(f)\n",
+    "        data[cat_name] = records\n",
+    "        print(f\"Length of {cat_name}: {len(records)}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/skripts/clean_data.py b/skripts/generate_data.py
similarity index 56%
rename from skripts/clean_data.py
rename to skripts/generate_data.py
index cfdf31d..6f8fedc 100644
--- a/skripts/clean_data.py
+++ b/skripts/generate_data.py
@@ -4,6 +4,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 import numpy as np
+import pickle
 
 
 # Directories and file paths
@@ -17,12 +18,6 @@ path_diag_lookup = "C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/ph
 # --------------------------------------------------------------------------------
 
 
-# print if project_dir exists
-if not os.path.exists("C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"):
-    print(f"Directory {path_diag_lookup} does not exist")
-
-
-
 def get_diagnosis_ids(record):
     # Get the diagnosis
     diagnosis = record.comments[2]
@@ -58,28 +53,34 @@ diagnosis_lookup = pd.read_csv(path_diag_lookup)
 # ----------------------------------------------


-healthy_codes = [426177001, 426783006]
+"""
+
+SB, sinus bradycardia
+AFIB, atrial fibrillation and atrial flutter (AFL)
+GSVT, supraventricular tachycardia, atrial tachycardia, AV nodal reentrant tachycardia, AV reentrant tachycardia, atrial pacemaker
+SR, sinus rhythm and sinus irregularity
+(atrial pacemaker = 713422000)
+"""
 
 categories = {
-    'Gesund': [426177001, 426783006], # '426177001', '426783006
-    'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001],
-    'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002],
-    'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009],
-    'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],
-    'Hypertrophien': [164873001, 89792004],
-    'QT': [111975006],
-    'Repolarisation': [428417006],
-    'Myokardinfarkt': [164865005]
+'SB': [426177001],
+'AFIB': [164889003, 164890007],
+'GSVT': [426761007, 713422000, 233896004, 233897008],
+'SR': [426783006, 427393009]
 }
 
-diag_dict = {k: 0 for k in categories.keys()}
+
+
+#diag_dict = {k: 0 for k in categories.keys()}
+
+diag_dict = {k: [] for k in categories.keys()}
 
 # Create a counter for the number of records
 counter = 0
-max_counter = 100_000#100_000
+max_counter = 100#100_000
 
 # Loop through the records
 for dir_th in os.listdir(data_dir):
@@ -100,9 +101,14 @@ for dir_th in os.listdir(data_dir):
 
             # check if diagnosis is a subset of one of the categories
             for category_name, category_codes in categories.items():
-                if set(diagnosis).issubset(set(category_codes)):
+                #if set(diagnosis).issubset(set(category_codes)):
+
+                # if any of the diagnosis codes is in the category_codes
+                if any(i in category_codes for i in diagnosis):
                     # Increment the counter for the category
-                    diag_dict[category_name] += 1
+                    #diag_dict[category_name] += 1
+                    # Add record to the category
+                    diag_dict[category_name].append(record)
                     break
 
             # Increment the counter
@@ -120,61 +126,39 @@ for dir_th in os.listdir(data_dir):
                 break
         if counter_bool:
            break
-
 """
-ID: Herzrhythmusstörungen, Count: 22571
-ID: Leitungsstörungen, Count: 505
-ID: EKG-Welle, Count: 2067
-ID: Spannungsänderungen, Count: 613
-ID: Hypertrophien, Count: 5
-ID: QT, Count: 43
-ID: Repolarisation, Count: 73
-ID: Myokardinfarkt, Count: 1
+if any(i in category_codes for i in diagnosis):
+ID: SB, Count: 16559
+ID: AFIB, Count: 9839
+ID: GSVT, Count: 948
+ID: SR, Count: 9720
+break
+The counter shows whether at least one diagnosis code of a record falls into the category.
+
+---------------------------------------------------------------------------------------------------------------------
+ set(diagnosis).issubset(set(category_codes)):
+ID: SB, Count: 8909
+ID: AFIB, Count: 1905
+ID: GSVT, Count: 431
+ID: SR, Count: 7299
+break
+
+The counter shows whether all diagnosis codes of a record fall into the category.
 """
 
-# # get the data
-# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict()
-
-# # get unique diagnosis codes
-# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
-# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
-# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())
-
-# print(unique_health_codes)
-# print(unique_afib_codes)
-# print(unique_mi_codes)
-
-# print(dict_healthy['JS00004'].__dict__)
+# for id, count in diag_dict.items():
+#     print(f"ID: {id}, Count: {count}")
 
 
-#print(diag_dict)
-for id, count in diag_dict.items():
-    print(f"ID: {id}, Count: {count}")
-
-print(f'Number of counter diagnoses: {len(diag_dict)}')
-print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
-print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
-
-# flatten the counters and count the unique values
-# healthy_counter = np.array(healthy_counter).flatten()
-# afib_counter = np.array(afib_counter).flatten()
-# mi_counter = np.array(mi_counter).flatten()
-
-# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True)
-# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True)
-# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True)
-
-# print(unique_health_codes)
-# print(counts_health)
-# print(unique_afib_codes)
-# print(counts_afib)
-# print(unique_mi_codes)
-# print(counts_mi)
-
-# # get the names of the diagnosis
-# names_health = get_diagnosis_name(unique_health_codes)
-# names_afib = get_diagnosis_name(unique_afib_codes)
-# names_mi = get_diagnosis_name(unique_mi_codes)
 
+# write to pickle
+for cat_name, records in diag_dict.items():
+    print(f"Writing {cat_name} to pickle with {len(records)} records")
+    # create the data directory if it does not exist
+    if not os.path.exists('./data'):
+        os.makedirs('./data')
+    with open(f'./data/{cat_name}.pkl', 'wb') as f:
+        pickle.dump(records, f)
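
Note on the matching change in generate_data.py: the old subset check only counted a record when every one of its diagnosis codes belonged to the category, while the new any() check assigns a record as soon as a single code matches, which explains the larger counts quoted in the results docstring. Below is a minimal sketch of the difference; the example diagnosis list and the helper names match_any/match_all are made up for illustration and are not part of the repository.

    # Sketch of the two matching strategies compared in this patch.
    categories = {
        'SB': [426177001],
        'AFIB': [164889003, 164890007],
        'GSVT': [426761007, 713422000, 233896004, 233897008],
        'SR': [426783006, 427393009],
    }

    def match_any(diagnosis, category_codes):
        # new behaviour: at least one diagnosis code is in the category
        return any(code in category_codes for code in diagnosis)

    def match_all(diagnosis, category_codes):
        # old behaviour: every diagnosis code is in the category
        return set(diagnosis).issubset(set(category_codes))

    # hypothetical record: atrial fibrillation (164889003) plus an unrelated finding (59118001)
    diagnosis = [164889003, 59118001]
    print(match_any(diagnosis, categories['AFIB']))  # True  -> record is assigned to AFIB
    print(match_all(diagnosis, categories['AFIB']))  # False -> record would have been skipped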
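
The notebook added above opens f'{cat_name}.pkl' relative to its own working directory, while generate_data.py writes the files to ./data/<category>.pkl, so the path may need adjusting depending on where the notebook is started. A small sketch of inspecting one generated file follows; it assumes ./data/SB.pkl already exists and that the pickled entries are wfdb record objects (the script reads diagnoses from record.comments), which is an assumption about code not shown in this patch.

    import pickle

    # path as written by generate_data.py
    with open('./data/SB.pkl', 'rb') as f:
        records = pickle.load(f)

    print(f"SB records: {len(records)}")
    first = records[0]
    print(type(first))
    # diagnosis codes are expected in the header comments, as used by get_diagnosis_ids()
    print(getattr(first, 'comments', None))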