DSA_SS24/notebooks/data_quality.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Quality\n",
    "\n",
    "This notebook is used to ensure data quality for further evaluations. For this reason, it is examined how much of the data is incomplete. It is important that this only affects a small part of the data in order to avoid any distortion of the data in further analyses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('../scripts')\n",
    "import data_helper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading GSVT\n",
      "Reading AFIB\n",
      "Reading SR\n",
      "Reading SB\n",
      "Number of patients per category:\n",
      "GSVT: 1027\n",
      "AFIB: 9787\n",
      "SR: 10426\n",
      "SB: 15826\n"
     ]
    }
   ],
   "source": [
    "data = data_helper.load_data(only_demographic=False)\n",
    "\n",
    "print(\"Number of patients per category:\")\n",
    "for cat_name in data.keys():\n",
    "    print(f\"{cat_name}: {len(data[cat_name])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check for missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First record in SB: ['Age: 59', 'Sex: Female', 'Dx: 426177001,164934002', 'Rx: Unknown', 'Hx: Unknown', 'Sx: Unknown']\n",
      "Missing timeseries in 0 records\n",
      "Missing age in 55 records\n",
      "Missing sex in 21 records\n"
     ]
    }
   ],
   "source": [
    "# print first record and first record keys\n",
    "print(f\"First record in SB: {data['SB'][0].comments}\")\n",
    "\n",
    "missing_timeseries = []\n",
    "missing_age = []\n",
    "missing_sex = []\n",
    "for cat_name, records in data.items():\n",
    "    for record in records:\n",
    "        if len(record.p_signal) != 5000:\n",
    "            missing_timeseries.append(record)\n",
    "            print(f\"Missing timeseries in {record.record_name}\")\n",
    "        #if record.comments[2]== '':\n",
    "        if 'Age: ' not in record.comments[0] or record.comments[0] == 'Age: NaN':\n",
    "            missing_age.append(record)\n",
    "        if record.comments[1] == 'Sex: Unknown' or record.comments[1] == '':\n",
    "            missing_sex.append(record)\n",
    "                      \n",
    "print(f\"Missing timeseries in {len(missing_timeseries)} records\")\n",
    "print(f\"Missing age in {len(missing_age)} records\")\n",
    "print(f\"Missing sex in {len(missing_sex)} records\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}