From 45601729d721d815cebf2ff2e45fdec25bfa555b Mon Sep 17 00:00:00 2001
From: Felix Mucha <3016498@stud.hs-mannheim.de>
Date: Wed, 1 May 2024 12:53:33 +0200
Subject: [PATCH] generate data

---
 .gitignore                                  |   1 +
 notebooks/example.ipynb                     |  38 ++++++
 skripts/{clean_data.py => generate_data.py} | 122 +++++++++-----------
 3 files changed, 92 insertions(+), 69 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 notebooks/example.ipynb
 rename skripts/{clean_data.py => generate_data.py} (56%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5fac628
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/data/
\ No newline at end of file
diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb
new file mode 100644
index 0000000..0d5c9cc
--- /dev/null
+++ b/notebooks/example.ipynb
@@ -0,0 +1,38 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "# read the pickle files and print the number of records per category\n",
+    "\n",
+    "\n",
+    "categories = {\n",
+    "'SB': [426177001],\n",
+    "'AFIB': [164889003, 164890007],\n",
+    "'GSVT': [426761007, 713422000, 233896004, 233897008],\n",
+    "'SR': [426783006, 427393009]\n",
+    "}\n",
+    "\n",
+    "\n",
+    "data = {}\n",
+    "for cat_name in categories.keys():\n",
+    "    print(f\"Reading {cat_name}\")\n",
+    "    with open(f'{cat_name}.pkl', 'rb') as f:\n",
+    "        records = pickle.load(f)\n",
+    "        data[cat_name] = records\n",
+    "        print(f\"Length of {cat_name}: {len(records)}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/skripts/clean_data.py b/skripts/generate_data.py
similarity index 56%
rename from skripts/clean_data.py
rename to skripts/generate_data.py
index cfdf31d..6f8fedc 100644
--- a/skripts/clean_data.py
+++ b/skripts/generate_data.py
@@ -4,6 +4,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 import numpy as np
+import pickle
 
 
 # Directories and file paths
@@ -17,12 +18,6 @@ path_diag_lookup = "C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/ph
 # --------------------------------------------------------------------------------
 
 
-# print if project_dir exists
-if not os.path.exists("C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"):
-    print(f"Directory {path_diag_lookup} does not exist")
-
-
-
 def get_diagnosis_ids(record):
     # Get the diagnosis
     diagnosis = record.comments[2]
@@ -58,28 +53,34 @@ diagnosis_lookup = pd.read_csv(path_diag_lookup)
 # ----------------------------------------------


-healthy_codes = [426177001, 426783006]
+"""
+
+SB, sinus bradycardia
+AFIB, atrial fibrillation and atrial flutter (AFL)
+GSVT, supraventricular tachycardia, atrial tachycardia, AV nodal reentrant tachycardia, AV reentrant tachycardia, atrial pacemaker
+SR, sinus rhythm and sinus irregularity
+(atrial pacemaker = 713422000)
+"""
 
 categories = {
-    'Gesund': [426177001, 426783006], # '426177001', '426783006
-    'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001],
-    'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002],
-    'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009],
-    'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],
-    'Hypertrophien': [164873001, 89792004],
-    'QT': [111975006],
-    'Repolarisation': [428417006],
-    'Myokardinfarkt': [164865005]
+'SB': [426177001],
+'AFIB': [164889003, 164890007],
+'GSVT': [426761007, 713422000, 233896004, 233897008],
+'SR': [426783006, 427393009]
 }
 
-diag_dict = {k: 0 for k in categories.keys()}
+
+
+#diag_dict = {k: 0 for k in categories.keys()}
+
+diag_dict = {k: [] for k in categories.keys()}
 
 # Create a counter for the number of records
 counter = 0
-max_counter = 100_000#100_000
+max_counter = 100#100_000
 
 # Loop through the records
 for dir_th in os.listdir(data_dir):
@@ -100,9 +101,14 @@ for dir_th in os.listdir(data_dir):
 
             # check if diagnosis is a subset of one of the categories
             for category_name, category_codes in categories.items():
-                if set(diagnosis).issubset(set(category_codes)):
+                #if set(diagnosis).issubset(set(category_codes)):
+
+                # if any of the diagnosis codes is in the category_codes
+                if any(i in category_codes for i in diagnosis):
                     # Increment the counter for the category
-                    diag_dict[category_name] += 1
+                    #diag_dict[category_name] += 1
+                    # Add record to the category
+                    diag_dict[category_name].append(record)
                     break
 
             # Increment the counter
@@ -120,61 +126,39 @@ for dir_th in os.listdir(data_dir):
                 break
         if counter_bool:
            break
-
 """
-ID: Herzrhythmusstörungen, Count: 22571
-ID: Leitungsstörungen, Count: 505
-ID: EKG-Welle, Count: 2067
-ID: Spannungsänderungen, Count: 613
-ID: Hypertrophien, Count: 5
-ID: QT, Count: 43
-ID: Repolarisation, Count: 73
-ID: Myokardinfarkt, Count: 1
+if any(i in category_codes for i in diagnosis):
+ID: SB, Count: 16559
+ID: AFIB, Count: 9839
+ID: GSVT, Count: 948
+ID: SR, Count: 9720
+break
+The counter shows whether at least one diagnosis code of a record falls into the category.
+
+---------------------------------------------------------------------------------------------------------------------
+ set(diagnosis).issubset(set(category_codes)):
+ID: SB, Count: 8909
+ID: AFIB, Count: 1905
+ID: GSVT, Count: 431
+ID: SR, Count: 7299
+break
+
+The counter shows whether all diagnosis codes of a record fall into the category.
 """
 
-# # get the data
-# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict()
-
-# # get unique diagnosis codes
-# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
-# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
-# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())
-
-# print(unique_health_codes)
-# print(unique_afib_codes)
-# print(unique_mi_codes)
-
-# print(dict_healthy['JS00004'].__dict__)
+# for id, count in diag_dict.items():
+#     print(f"ID: {id}, Count: {count}")
 
 
-#print(diag_dict)
-for id, count in diag_dict.items():
-    print(f"ID: {id}, Count: {count}")
-
-print(f'Number of counter diagnoses: {len(diag_dict)}')
-print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
-print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
-
-# flatten the counters and count the unique values
-# healthy_counter = np.array(healthy_counter).flatten()
-# afib_counter = np.array(afib_counter).flatten()
-# mi_counter = np.array(mi_counter).flatten()
-
-# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True)
-# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True)
-# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True)
-
-# print(unique_health_codes)
-# print(counts_health)
-# print(unique_afib_codes)
-# print(counts_afib)
-# print(unique_mi_codes)
-# print(counts_mi)
-
-# # get the names of the diagnosis
-# names_health = get_diagnosis_name(unique_health_codes)
-# names_afib = get_diagnosis_name(unique_afib_codes)
-# names_mi = get_diagnosis_name(unique_mi_codes)
 
+# write to pickle
+for cat_name, records in diag_dict.items():
+    print(f"Writing {cat_name} to pickle with {len(records)} records")
+    # create the data directory if it does not exist
+    if not os.path.exists('./data'):
+        os.makedirs('./data')
+    with open(f'./data/{cat_name}.pkl', 'wb') as f:
+        pickle.dump(records, f)
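
Note on the matching change in generate_data.py: the old subset check only counted a record when every one of its diagnosis codes belonged to the category, while the new any() check assigns a record as soon as a single code matches, which explains the larger counts quoted in the results docstring. Below is a minimal sketch of the difference; the example diagnosis list and the helper names match_any/match_all are made up for illustration and are not part of the repository.

    # Sketch of the two matching strategies compared in this patch.
    categories = {
        'SB': [426177001],
        'AFIB': [164889003, 164890007],
        'GSVT': [426761007, 713422000, 233896004, 233897008],
        'SR': [426783006, 427393009],
    }

    def match_any(diagnosis, category_codes):
        # new behaviour: at least one diagnosis code is in the category
        return any(code in category_codes for code in diagnosis)

    def match_all(diagnosis, category_codes):
        # old behaviour: every diagnosis code is in the category
        return set(diagnosis).issubset(set(category_codes))

    # hypothetical record: atrial fibrillation (164889003) plus an unrelated finding (59118001)
    diagnosis = [164889003, 59118001]
    print(match_any(diagnosis, categories['AFIB']))  # True  -> record is assigned to AFIB
    print(match_all(diagnosis, categories['AFIB']))  # False -> record would have been skipped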
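
The notebook added above opens f'{cat_name}.pkl' relative to its own working directory, while generate_data.py writes the files to ./data/<category>.pkl, so the path may need adjusting depending on where the notebook is started. A small sketch of inspecting one generated file follows; it assumes ./data/SB.pkl already exists and that the pickled entries are wfdb record objects (the script reads diagnoses from record.comments), which is an assumption about code not shown in this patch.

    import pickle

    # path as written by generate_data.py
    with open('./data/SB.pkl', 'rb') as f:
        records = pickle.load(f)

    print(f"SB records: {len(records)}")
    first = records[0]
    print(type(first))
    # diagnosis codes are expected in the header comments, as used by get_diagnosis_ids()
    print(getattr(first, 'comments', None))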