generate data

main
Felix Jan Michael Mucha 2024-05-01 12:53:33 +02:00
parent 588e20b7b3
commit 45601729d7
3 changed files with 92 additions and 69 deletions

1
.gitignore vendored 100644
View File

@ -0,0 +1 @@
/data/

View File

@ -0,0 +1,38 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"# read pickle files and check len and print first record and first record keys\n",
"\n",
"\n",
"categories = {\n",
"'SB': [426177001],\n",
"'AFIB': [164889003, 164890007],\n",
"'GSVT': [426761007, 713422000, 233896004, 233897008],\n",
"'SR': [426783006, 427393009]\n",
"}\n",
"\n",
"\n",
"data = {}\n",
"for cat_name in categories.keys():\n",
" print(f\"Reading {cat_name}\")\n",
" with open(f'{cat_name}.pkl', 'rb') as f:\n",
" records = pickle.load(f)\n",
" data[cat_name] = records\n",
" print(f\"Length of {cat_name}: {len(records)}\")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -4,6 +4,7 @@ import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import pickle
# Directories and file paths # Directories and file paths
@ -17,12 +18,6 @@ path_diag_lookup = "C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/ph
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
# print if project_dir exists
if not os.path.exists("C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"):
print(f"Directory {path_diag_lookup} does not exist")
def get_diagnosis_ids(record): def get_diagnosis_ids(record):
# Get the diagnosis # Get the diagnosis
diagnosis = record.comments[2] diagnosis = record.comments[2]
@ -58,28 +53,34 @@ diagnosis_lookup = pd.read_csv(path_diag_lookup)
# ---------------------------------------------- # ----------------------------------------------
healthy_codes = [426177001, 426783006] """
SB, Sinusbradykardie
AFIB, Vorhofflimmern und Vorhofflattern (AFL)
GSVT, supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher
SR, Sinusrhythmus und Sinusunregelmäßigkeiten
(Vorhofschrittmacher = 713422000)
"""
categories = { categories = {
'Gesund': [426177001, 426783006], # '426177001', '426783006 'SB': [426177001],
'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001], 'AFIB': [164889003, 164890007],
'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002], 'GSVT': [426761007, 713422000, 233896004, 233897008],
'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009], 'SR': [426783006, 427393009]
'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],
'Hypertrophien': [164873001, 89792004],
'QT': [111975006],
'Repolarisation': [428417006],
'Myokardinfarkt': [164865005]
} }
diag_dict = {k: 0 for k in categories.keys()}
#diag_dict = {k: 0 for k in categories.keys()}
diag_dict = {k: [] for k in categories.keys()}
# Create a counter for the number of records # Create a counter for the number of records
counter = 0 counter = 0
max_counter = 100_000#100_000 max_counter = 100#100_000
# Loop through the records # Loop through the records
for dir_th in os.listdir(data_dir): for dir_th in os.listdir(data_dir):
@ -100,9 +101,14 @@ for dir_th in os.listdir(data_dir):
# check if diagnosis is a subset of one of the categories # check if diagnosis is a subset of one of the categories
for category_name, category_codes in categories.items(): for category_name, category_codes in categories.items():
if set(diagnosis).issubset(set(category_codes)): #if set(diagnosis).issubset(set(category_codes)):
# if any of the diagnosis codes is in the category_codes
if any(i in category_codes for i in diagnosis):
# Increment the counter for the category # Increment the counter for the category
diag_dict[category_name] += 1 #diag_dict[category_name] += 1
# Add record to the category
diag_dict[category_name].append(record)
break break
# Increment the counter # Increment the counter
@ -120,61 +126,39 @@ for dir_th in os.listdir(data_dir):
break break
if counter_bool: if counter_bool:
break break
""" """
ID: Herzrhythmusstörungen, Count: 22571 if any(i in category_codes for i in diagnosis):
ID: Leitungsstörungen, Count: 505 ID: SB, Count: 16559
ID: EKG-Welle, Count: 2067 ID: AFIB, Count: 9839
ID: Spannungsänderungen, Count: 613 ID: GSVT, Count: 948
ID: Hypertrophien, Count: 5 ID: SR, Count: 9720
ID: QT, Count: 43 break
ID: Repolarisation, Count: 73
ID: Myokardinfarkt, Count: 1
Der Counter gibt an ob eine Diagnose in einer Kategorie ist
---------------------------------------------------------------------------------------------------------------------
set(diagnosis).issubset(set(category_codes)):
ID: SB, Count: 8909
ID: AFIB, Count: 1905
ID: GSVT, Count: 431
ID: SR, Count: 7299
break
Der Counter gibt an ob alle Diagnosen in einer Kategorie sind
""" """
# # get the data # for id, count in diag_dict.items():
# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict() # print(f"ID: {id}, Count: {count}")
# # get unique diagnosis codes
# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())
# print(unique_health_codes)
# print(unique_afib_codes)
# print(unique_mi_codes)
# print(dict_healthy['JS00004'].__dict__)
#print(diag_dict)
for id, count in diag_dict.items():
print(f"ID: {id}, Count: {count}")
print(f'Number of counter diagnoses: {len(diag_dict)}') # write to pickle
print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
# flatten the counters and count the unique values
# healthy_counter = np.array(healthy_counter).flatten()
# afib_counter = np.array(afib_counter).flatten()
# mi_counter = np.array(mi_counter).flatten()
# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True)
# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True)
# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True)
# print(unique_health_codes)
# print(counts_health)
# print(unique_afib_codes)
# print(counts_afib)
# print(unique_mi_codes)
# print(counts_mi)
# # get the names of the diagnosis
# names_health = get_diagnosis_name(unique_health_codes)
# names_afib = get_diagnosis_name(unique_afib_codes)
# names_mi = get_diagnosis_name(unique_mi_codes)
for cat_name, records in diag_dict.items():
print(f"Writing {cat_name} to pickle with {len(records)} records")
# if path not exists create it
if not os.path.exists('./data'):
os.makedirs('./data')
with open(f'./data/{cat_name}.pkl', 'wb') as f:
pickle.dump(records, f)