{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extreme Gradient Boosting (XGBoost) Training and Analysis\n",
    "\n",
    "Loads the pre-split `train`, `validation` and `test` tables from `../features.db`, encodes the four target classes as integers, and tunes an `XGBClassifier` with a cross-validated grid search."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "import os\n",
    "from datetime import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import confusion_matrix, f1_score\n",
    "import seaborn as sns\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import Data from Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# connect to the database\n",
    "conn = sqlite3.connect('../features.db')\n",
    "try:\n",
    "    # get training, validation and test data\n",
    "    train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n",
    "    valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n",
    "    test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n",
    "finally:\n",
    "    # close the connection even when one of the queries fails\n",
    "    conn.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Format Data for Machine Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_x shape: (3502, 10)\n",
      "test_x shape: (438, 10)\n",
      "valid_x shape: (438, 10)\n",
      "features: ['age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
      "number of classes: 4\n"
     ]
    }
   ],
   "source": [
    "# label encoding shared by every split (single source of truth)\n",
    "LABEL_MAP = {'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3}\n",
    "\n",
    "\n",
    "def split_features_target(frame):\n",
    "    \"\"\"Split one table into (features, integer-encoded target).\n",
    "\n",
    "    The features are `frame` without the `y` and `id` columns; the target\n",
    "    is `y` encoded with LABEL_MAP.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `y` holds a value that is not a key of LABEL_MAP\n",
    "            (it would otherwise silently become a NaN target).\n",
    "    \"\"\"\n",
    "    target = frame['y'].map(LABEL_MAP)\n",
    "    unmapped = target.isna()\n",
    "    if unmapped.any():\n",
    "        unknown = sorted(frame.loc[unmapped, 'y'].astype(str).unique())\n",
    "        raise ValueError(f'labels missing from LABEL_MAP: {unknown}')\n",
    "    return frame.drop(columns=['y', 'id']), target\n",
    "\n",
    "\n",
    "# get the features and the target of every split\n",
    "train_x, train_y = split_features_target(train)\n",
    "valid_x, valid_y = split_features_target(valid)\n",
    "test_x, test_y = split_features_target(test)\n",
    "\n",
    "print('train_x shape:', train_x.shape)\n",
    "print('test_x shape:', test_x.shape)\n",
    "print('valid_x shape:', valid_x.shape)\n",
    "\n",
    "# print column names\n",
    "print('features:', train_x.columns.to_list())\n",
    "\n",
    "# use xgboost\n",
    "dtrain = xgb.DMatrix(train_x, label=train_y)\n",
    "dvalid = xgb.DMatrix(valid_x, label=valid_y)\n",
    "dtest = xgb.DMatrix(test_x, label=test_y)\n",
    "\n",
    "# the class count comes from the label map, not from whichever labels\n",
    "# happen to be present in the validation split\n",
    "num_classes = len(LABEL_MAP)\n",
    "print('number of classes:', num_classes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test Grid for Hyperparameter Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `eta` is XGBoost's alias for `learning_rate`, so only one of the two is\n",
    "# searched: listing both would triple the number of fits and hand the\n",
    "# booster two conflicting values for the same step-size parameter\n",
    "param_grid = {\n",
    "    'max_depth': [3, 4, 5],\n",
    "    'min_child_weight': [1, 2, 3],\n",
    "    'learning_rate': [0.1, 0.2, 0.3],\n",
    "    'n_estimators': [100, 200, 300]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a XGBClassifier object\n",
    "model = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes, eval_metric='merror')\n",
    "\n",
    "# Create the grid search object\n",
    "grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: total: 2h 15min 58s\n",
      "Wall time: 10min\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
GridSearchCV(cv=3,\n", " estimator=XGBClassifier(base_score=None, booster=None,\n", " callbacks=None, colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric='merror', gamma=None,\n", " gpu_id=None, grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=None, max_bin=None,\n", " ma...\n", " max_leaves=None, min_child_weight=None,\n", " missing=nan, monotone_constraints=None,\n", " n_estimators=100, n_jobs=None, num_class=4,\n", " num_parallel_tree=None,\n", " objective='multi:softmax', predictor=None,\n", " random_state=None, ...),\n", " param_grid={'eta': [0.1, 0.2, 0.3],\n", " 'learning_rate': [0.1, 0.2, 0.3],\n", " 'max_depth': [3, 4, 5], 'min_child_weight': [1, 2, 3],\n", " 'n_estimators': [100, 200, 300]},\n", " scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3,\n", " estimator=XGBClassifier(base_score=None, booster=None,\n", " callbacks=None, colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric='merror', gamma=None,\n", " gpu_id=None, grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=None, max_bin=None,\n", " ma...\n", " max_leaves=None, min_child_weight=None,\n", " missing=nan, monotone_constraints=None,\n", " n_estimators=100, n_jobs=None, num_class=4,\n", " num_parallel_tree=None,\n", " objective='multi:softmax', predictor=None,\n", " random_state=None, ...),\n", " param_grid={'eta': [0.1, 0.2, 0.3],\n", " 'learning_rate': [0.1, 0.2, 0.3],\n", " 'max_depth': [3, 4, 5], 'min_child_weight': [1, 2, 3],\n", " 'n_estimators': [100, 200, 300]},\n", " scoring='accuracy')
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric='merror', gamma=None,\n", " gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n", " max_leaves=None, min_child_weight=None, missing=nan,\n", " monotone_constraints=None, n_estimators=100, n_jobs=None,\n", " num_class=4, num_parallel_tree=None, objective='multi:softmax',\n", " predictor=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric='merror', gamma=None,\n", " gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n", " max_leaves=None, min_child_weight=None, missing=nan,\n", " monotone_constraints=None, n_estimators=100, n_jobs=None,\n", " num_class=4, num_parallel_tree=None, objective='multi:softmax',\n", " predictor=None, random_state=None, ...)