generate data
parent
588e20b7b3
commit
45601729d7
|
@ -0,0 +1 @@
|
||||||
|
/data/
|
|
@ -0,0 +1,38 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pickle\n",
|
||||||
|
"# read pickle files and check len and print first record and first record keys\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"categories = {\n",
|
||||||
|
"'SB': [426177001],\n",
|
||||||
|
"'AFIB': [164889003, 164890007],\n",
|
||||||
|
"'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],\n",
|
||||||
|
"'SR': [426783006, 427393009]\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"data = {}\n",
|
||||||
|
"for cat_name in categories.keys():\n",
|
||||||
|
" print(f\"Reading {cat_name}\")\n",
|
||||||
|
" with open(f'{cat_name}.pkl', 'rb') as f:\n",
|
||||||
|
" records = pickle.load(f)\n",
|
||||||
|
" data[cat_name] = records\n",
|
||||||
|
" print(f\"Length of {cat_name}: {len(records)}\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
# Directories and file paths
|
# Directories and file paths
|
||||||
|
@ -17,12 +18,6 @@ path_diag_lookup = "C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/ph
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------------
|
# --------------------------------------------------------------------------------
|
||||||
# print if project_dir exists
|
|
||||||
if not os.path.exists("C:/Users/felix/OneDrive/Studium/Master MDS/1 Semester/DSA/physionet/large_12_ecg_data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/ConditionNames_SNOMED-CT.csv"):
|
|
||||||
print(f"Directory {path_diag_lookup} does not exist")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_diagnosis_ids(record):
|
def get_diagnosis_ids(record):
|
||||||
# Get the diagnosis
|
# Get the diagnosis
|
||||||
diagnosis = record.comments[2]
|
diagnosis = record.comments[2]
|
||||||
|
@ -58,28 +53,34 @@ diagnosis_lookup = pd.read_csv(path_diag_lookup)
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------
|
# ----------------------------------------------
|
||||||
healthy_codes = [426177001, 426783006]
|
"""
|
||||||
|
|
||||||
|
SB, Sinusbradykardie
|
||||||
|
AFIB, Vorhofflimmern und Vorhofflattern (AFL)
|
||||||
|
GSVT, supraventrikulärer Tachykardie, Vorhoftachykardie, AV-Knoten-Reentry-Tachykardie, AV-Reentry-Tachykardie, Vorhofschrittmacher
|
||||||
|
SR Sinusrhythmus und Sinusunregelmäßigkeiten
|
||||||
|
|
||||||
|
|
||||||
|
(Vorhofschrittmacher = 713422000)
|
||||||
|
"""
|
||||||
categories = {
|
categories = {
|
||||||
'Gesund': [426177001, 426783006], # '426177001', '426783006
|
'SB': [426177001],
|
||||||
'Herzrhythmusstörungen': [164890007, 427084000, 164889003, 426761007, 713422000, 427393009, 284470004, 17338001],
|
'AFIB': [164889003, 164890007],
|
||||||
'Leitungsstörungen': [270492004, 233917008, 59118001, 164909002, 698252002],
|
'GSVT': [426761007, 713422000, 233896004, 233897008, 713422000],
|
||||||
'EKG-Welle': [164934002, 59931005, 428750005, 164917005, 429622005, 164930006, 164931005, 164912004, 164937009],
|
'SR': [426783006, 427393009]
|
||||||
'Spannungsänderungen': [39732003, 47665007, 251146004, 251199005],
|
|
||||||
'Hypertrophien': [164873001, 89792004],
|
|
||||||
'QT': [111975006],
|
|
||||||
'Repolarisation': [428417006],
|
|
||||||
'Myokardinfarkt': [164865005]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
diag_dict = {k: 0 for k in categories.keys()}
|
|
||||||
|
|
||||||
|
#diag_dict = {k: 0 for k in categories.keys()}
|
||||||
|
|
||||||
|
diag_dict = {k: [] for k in categories.keys()}
|
||||||
|
|
||||||
# Create a counter for the number of records
|
# Create a counter for the number of records
|
||||||
counter = 0
|
counter = 0
|
||||||
max_counter = 100_000#100_000
|
max_counter = 100#100_000
|
||||||
|
|
||||||
# Loop through the records
|
# Loop through the records
|
||||||
for dir_th in os.listdir(data_dir):
|
for dir_th in os.listdir(data_dir):
|
||||||
|
@ -100,9 +101,14 @@ for dir_th in os.listdir(data_dir):
|
||||||
|
|
||||||
# check if diagnosis is a subset of one of the categories
|
# check if diagnosis is a subset of one of the categories
|
||||||
for category_name, category_codes in categories.items():
|
for category_name, category_codes in categories.items():
|
||||||
if set(diagnosis).issubset(set(category_codes)):
|
#if set(diagnosis).issubset(set(category_codes)):
|
||||||
|
|
||||||
|
# if any of the diagnosis codes is in the category_codes
|
||||||
|
if any(i in category_codes for i in diagnosis):
|
||||||
# Increment the counter for the category
|
# Increment the counter for the category
|
||||||
diag_dict[category_name] += 1
|
#diag_dict[category_name] += 1
|
||||||
|
# Add record to the category
|
||||||
|
diag_dict[category_name].append(record)
|
||||||
break
|
break
|
||||||
|
|
||||||
# Increment the counter
|
# Increment the counter
|
||||||
|
@ -120,61 +126,39 @@ for dir_th in os.listdir(data_dir):
|
||||||
break
|
break
|
||||||
if counter_bool:
|
if counter_bool:
|
||||||
break
|
break
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ID: Herzrhythmusstörungen, Count: 22571
|
if any(i in category_codes for i in diagnosis):
|
||||||
ID: Leitungsstörungen, Count: 505
|
ID: SB, Count: 16559
|
||||||
ID: EKG-Welle, Count: 2067
|
ID: AFIB, Count: 9839
|
||||||
ID: Spannungsänderungen, Count: 613
|
ID: GSVT, Count: 948
|
||||||
ID: Hypertrophien, Count: 5
|
ID: SR, Count: 9720
|
||||||
ID: QT, Count: 43
|
break
|
||||||
ID: Repolarisation, Count: 73
|
|
||||||
ID: Myokardinfarkt, Count: 1
|
|
||||||
|
|
||||||
|
Der Counter gibt an ob eine Diagnose in einer Kategorie ist
|
||||||
|
|
||||||
|
---------------------------------------------------------------------------------------------------------------------
|
||||||
|
set(diagnosis).issubset(set(category_codes)):
|
||||||
|
ID: SB, Count: 8909
|
||||||
|
ID: AFIB, Count: 1905
|
||||||
|
ID: GSVT, Count: 431
|
||||||
|
ID: SR, Count: 7299
|
||||||
|
break
|
||||||
|
|
||||||
|
Der Counter gibt an ob alle Diagnosen in einer Kategorie sind
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# # get the data
|
# for id, count in diag_dict.items():
|
||||||
# dict_healthy, dict_afib, dict_mi = get_diag_filtered_data_dict()
|
# print(f"ID: {id}, Count: {count}")
|
||||||
|
|
||||||
# # get unique diagnosis codes
|
|
||||||
# unique_health_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_healthy.values()]).flatten())
|
|
||||||
# unique_afib_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_afib.values()]).flatten())
|
|
||||||
# unique_mi_codes = np.unique(np.array([np.array(get_diagnosis_ids(d)) for d in dict_mi.values()]).flatten())
|
|
||||||
|
|
||||||
# print(unique_health_codes)
|
|
||||||
# print(unique_afib_codes)
|
|
||||||
# print(unique_mi_codes)
|
|
||||||
|
|
||||||
# print(dict_healthy['JS00004'].__dict__)
|
|
||||||
|
|
||||||
|
|
||||||
#print(diag_dict)
|
|
||||||
for id, count in diag_dict.items():
|
|
||||||
print(f"ID: {id}, Count: {count}")
|
|
||||||
|
|
||||||
print(f'Number of counter diagnoses: {len(diag_dict)}')
|
# write to pickle
|
||||||
print(f'Number of diagnoses in the lookup table: {len(diagnosis_lookup)}')
|
|
||||||
print('found in the lookup table: ', len(diag_dict) == len(diagnosis_lookup))
|
|
||||||
|
|
||||||
# flatten the counters and count the unique values
|
|
||||||
# healthy_counter = np.array(healthy_counter).flatten()
|
|
||||||
# afib_counter = np.array(afib_counter).flatten()
|
|
||||||
# mi_counter = np.array(mi_counter).flatten()
|
|
||||||
|
|
||||||
# unique_health_codes, counts_health = np.unique(healthy_counter, return_counts=True)
|
|
||||||
# unique_afib_codes, counts_afib = np.unique(afib_counter, return_counts=True)
|
|
||||||
# unique_mi_codes, counts_mi = np.unique(mi_counter, return_counts=True)
|
|
||||||
|
|
||||||
# print(unique_health_codes)
|
|
||||||
# print(counts_health)
|
|
||||||
# print(unique_afib_codes)
|
|
||||||
# print(counts_afib)
|
|
||||||
# print(unique_mi_codes)
|
|
||||||
# print(counts_mi)
|
|
||||||
|
|
||||||
# # get the names of the diagnosis
|
|
||||||
# names_health = get_diagnosis_name(unique_health_codes)
|
|
||||||
# names_afib = get_diagnosis_name(unique_afib_codes)
|
|
||||||
# names_mi = get_diagnosis_name(unique_mi_codes)
|
|
||||||
|
|
||||||
|
for cat_name, records in diag_dict.items():
|
||||||
|
print(f"Writing {cat_name} to pickle with {len(records)} records")
|
||||||
|
# if path not exists create it
|
||||||
|
if not os.path.exists('./data'):
|
||||||
|
os.makedirs('./data')
|
||||||
|
with open(f'./data/{cat_name}.pkl', 'wb') as f:
|
||||||
|
pickle.dump(records, f)
|
||||||
|
|
Loading…
Reference in New Issue