DSA_SS24/notebooks/data_quality.ipynb

155 lines
4.2 KiB
Plaintext
Raw Normal View History

2024-05-08 17:45:29 +02:00
{
"cells": [
2024-05-15 20:20:01 +02:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Quality\n",
"\n",
"This notebook checks the quality of the data before it is used in further evaluations. To do so, it examines how much of the data is incomplete. Only a small share of the records should be affected; otherwise the missing values could distort the subsequent analyses."
]
},
2024-05-08 17:45:29 +02:00
{
"cell_type": "code",
2024-05-29 09:25:12 +02:00
"execution_count": 1,
2024-05-08 17:45:29 +02:00
"metadata": {},
2024-05-15 20:20:01 +02:00
"outputs": [],
"source": [
"import pickle"
]
},
2024-05-08 17:45:29 +02:00
{
"cell_type": "code",
"execution_count": 1,
2024-05-08 17:45:29 +02:00
"metadata": {},
2024-05-15 20:20:01 +02:00
"outputs": [],
"source": [
"path = \"C:/Studium/dsa/data\"\n",
"#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
2024-05-08 17:45:29 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading SB\n",
2024-05-29 09:25:12 +02:00
"Length of SB: 50\n",
2024-05-08 17:45:29 +02:00
"Reading AFIB\n",
2024-05-29 09:25:12 +02:00
"Length of AFIB: 27\n",
2024-05-08 17:45:29 +02:00
"Reading GSVT\n",
2024-05-29 09:25:12 +02:00
"Length of GSVT: 0\n",
2024-05-08 17:45:29 +02:00
"Reading SR\n",
2024-05-29 09:25:12 +02:00
"Length of SR: 13\n"
2024-05-08 17:45:29 +02:00
]
}
],
"source": [
"import pickle\n",
"from matplotlib import pyplot as plt\n",
"import wfdb\n",
"# Read one pickle file per category and print how many records it contains\n",
"\n",
2024-05-29 09:25:12 +02:00
"#path = \"C:/Studium/dsa/data\"\n",
2024-05-08 17:45:29 +02:00
"#path = \"C:/Users/Nils/Documents/HS-Mannheim/0000_MASTER/DSA/EKG_Prog/data\"\n",
2024-05-29 09:25:12 +02:00
"path = \"C:/Users/klara/projects/DSA/data\"\n",
"\n",
2024-05-08 17:45:29 +02:00
"\n",
"# Diagnosis (Dx) codes per category (presumably SNOMED CT codes - TODO confirm).\n",
"# Only the category names (the keys) are used in this cell, as pickle file names.\n",
"categories_dict = {\n",
"    'SB': [426177001],\n",
"    'AFIB': [164889003, 164890007],\n",
"    # 713422000 used to be listed twice; the duplicate entry was dropped.\n",
"    'GSVT': [426761007, 713422000, 233896004, 233897008],\n",
"    'SR': [426783006, 427393009],\n",
"}\n",
"\n",
"\n",
"# Load the records of every category into `data`, keyed by category name.\n",
"# NOTE: pickle.load can execute arbitrary code while unpickling, so only\n",
"# point `path` at pickle files that were created by this project.\n",
"data = {}\n",
"for cat_name in categories_dict:\n",
"    print(f\"Reading {cat_name}\")\n",
"    with open(f'{path}/{cat_name}.pkl', 'rb') as f:\n",
"        records = pickle.load(f)\n",
"    data[cat_name] = records\n",
"    print(f\"Length of {cat_name}: {len(records)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-05-15 20:20:01 +02:00
"## Check for missing data"
2024-05-08 17:45:29 +02:00
]
},
{
"cell_type": "code",
2024-05-15 20:20:01 +02:00
"execution_count": 4,
2024-05-08 17:45:29 +02:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-05-12 13:31:54 +02:00
"First record in SB: ['Age: 59', 'Sex: Female', 'Dx: 426177001,164934002', 'Rx: Unknown', 'Hx: Unknown', 'Sx: Unknown']\n",
"Missing timeseries in 0 records\n",
"Missing age in 55 records\n",
"Missing sex in 21 records\n"
2024-05-08 17:45:29 +02:00
]
}
],
"source": [
2024-05-12 13:31:54 +02:00
"# Print the comments (age, sex, diagnosis, ...) of the first SB record\n",
"print(f\"First record in SB: {data['SB'][0].comments}\")\n",
"\n",
2024-05-08 17:45:29 +02:00
"missing_timeseries = []\n",
2024-05-12 13:31:54 +02:00
"missing_age = []\n",
"missing_sex = []\n",
2024-05-08 17:45:29 +02:00
"# Number of samples a complete recording is expected to contain.\n",
"EXPECTED_SAMPLES = 5000\n",
"\n",
"# Collect the records with an incomplete signal, a missing age or a missing sex.\n",
"for cat_name, records in data.items():\n",
"    for record in records:\n",
"        # A record without any physical signal (p_signal is None) is counted as\n",
"        # missing instead of crashing on len(None).\n",
"        if record.p_signal is None or len(record.p_signal) != EXPECTED_SAMPLES:\n",
"            missing_timeseries.append(record)\n",
"            print(f\"Missing timeseries in {record.record_name}\")\n",
"\n",
"        # The comments are read by position: index 0 is expected to hold the age\n",
"        # entry and index 1 the sex entry. A record with too few comments is\n",
"        # treated as missing the value instead of raising IndexError.\n",
"        comments = list(record.comments or [])\n",
"        age_comment = comments[0] if len(comments) > 0 else ''\n",
"        sex_comment = comments[1] if len(comments) > 1 else ''\n",
"\n",
"        if 'Age: ' not in age_comment or age_comment == 'Age: NaN':\n",
"            missing_age.append(record)\n",
"        if sex_comment == 'Sex: Unknown' or sex_comment == '':\n",
"            missing_sex.append(record)\n",
2024-05-15 20:20:01 +02:00
" \n",
2024-05-12 13:31:54 +02:00
"print(f\"Missing timeseries in {len(missing_timeseries)} records\")\n",
"print(f\"Missing age in {len(missing_age)} records\")\n",
"print(f\"Missing sex in {len(missing_sex)} records\")"
2024-05-08 17:45:29 +02:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-05-29 09:25:12 +02:00
"version": "3.11.9"
2024-05-08 17:45:29 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}