{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Gradient Boosting Tree (GBT) Training and Analysis" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import os\n", "from datetime import datetime\n", "from joblib import dump, load\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import xgboost as xgb\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.metrics import confusion_matrix, f1_score\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Import Data from Database" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# connect to the database\n", "conn = sqlite3.connect('../features.db')\n", "c = conn.cursor()\n", "# get training, validation and test data\n", "train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n", "valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n", "test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n", "# close the connection\n", "conn.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Format Data for Machine Learning" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train_x shape: (3502, 10)\n", "test_x shape: (438, 10)\n", "valid_x shape: (438, 10)\n", "features: ['age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n", "number of classes: 4\n" ] } ], "source": [ "# get the target and features\n", "train_y = train['y']\n", "train_y = train_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "train_x = train.drop(columns=['y'])\n", "\n", "valid_y = valid['y']\n", "valid_y = valid_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "valid_x = valid.drop(columns=['y'])\n", "\n", "test_y = test['y']\n", "test_y = test_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "test_x = test.drop(columns=['y'])\n", "\n", "# drop id column\n", "train_x = train_x.drop(columns=['id'])\n", "valid_x = valid_x.drop(columns=['id'])\n", "test_x = test_x.drop(columns=['id'])\n", "\n", "print('train_x shape:', train_x.shape)\n", "print('test_x shape:', test_x.shape)\n", "print('valid_x shape:', valid_x.shape)\n", "# print column names\n", "print('features:', train_x.columns.to_list())\n", "feature_names = train_x.columns.to_list()\n", "\n", "# Create an imputer object with a mean filling strategy\n", "imputer = SimpleImputer(strategy='mean')\n", "\n", "train_x = imputer.fit_transform(train_x)\n", "valid_x = imputer.transform(valid_x)\n", "test_x = imputer.transform(test_x)\n", "\n", "# Scale Data between 0 and 1\n", "scaler = MinMaxScaler()\n", "# Fit the scaler to your data and then transform it\n", "train_x = scaler.fit_transform(train_x)\n", "valid_x = scaler.transform(valid_x)\n", "test_x = scaler.transform(test_x)\n", "\n", "\n", "\n", "# use xgboost\n", "dtrain = xgb.DMatrix(train_x, label=train_y)\n", "dvalid = xgb.DMatrix(valid_x, label=valid_y)\n", "dtest = xgb.DMatrix(test_x, label=test_y)\n", "\n", "num_classes= len(set(valid_y.to_list()))\n", "print('number of classes:', num_classes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test Grid for Hyperparameter Analysis" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "param_grid = {\n", " 'n_estimators': [100, 200, 300],\n", " 'learning_rate': [0.1, 0.2, 0.3],\n", " 'max_depth': [1, 3, 5],\n", "}# 'random_stat': 42" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Create a XGBClassifier object\n", "model = GradientBoostingClassifier()\n", "\n", "# Create the grid search object\n", "grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 49s\n", "Wall time: 4min 28s\n" ] }, { "data": { "text/html": [ "
GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),\n", " param_grid={'learning_rate': [0.1, 0.2, 0.3],\n", " 'max_depth': [1, 3, 5],\n", " 'n_estimators': [100, 200, 300]},\n", " scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),\n", " param_grid={'learning_rate': [0.1, 0.2, 0.3],\n", " 'max_depth': [1, 3, 5],\n", " 'n_estimators': [100, 200, 300]},\n", " scoring='accuracy')
GradientBoostingClassifier()
GradientBoostingClassifier()
GradientBoostingClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier()