{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Decison Tree" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Data from Database" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# connect to the database\n", "conn = sqlite3.connect('../features.db')\n", "c = conn.cursor()\n", "# get training, validation and test data\n", "train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n", "valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n", "test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n", "# close the connection\n", "conn.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Format Data for Machine Learning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# get the target and features\n", "train_y = train['y']\n", "train_y = train_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "train_x = train.drop(columns=['y'])\n", "\n", "valid_y = valid['y']\n", "valid_y = valid_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "valid_x = valid.drop(columns=['y'])\n", "\n", "test_y = test['y']\n", "test_y = test_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n", "test_x = test.drop(columns=['y'])\n", "\n", "# drop id column\n", "train_x = train_x.drop(columns=['id'])\n", "valid_x = valid_x.drop(columns=['id'])\n", "test_x = test_x.drop(columns=['id'])\n", "\n", "print('train_x shape:', train_x.shape)\n", "print('test_x shape:', test_x.shape)\n", "print('valid_x shape:', valid_x.shape)\n", "# print column names\n", "print('features:', train_x.columns.to_list())\n", "feature_names = train_x.columns.to_list()\n", "\n", "# Create an imputer object with a mean filling strategy\n", "imputer = SimpleImputer(strategy='mean')\n", "\n", "train_x = imputer.fit_transform(train_x)\n", "valid_x = imputer.transform(valid_x)\n", "test_x = imputer.transform(test_x)\n", "\n", "# Scale Data between 0 and 1\n", "scaler = MinMaxScaler()\n", "# Fit the scaler to your data and then transform it\n", "train_x = scaler.fit_transform(train_x)\n", "valid_x = scaler.transform(valid_x)\n", "test_x = scaler.transform(test_x)\n", "\n", "\n", "\n", "# use xgboost\n", "dtrain = xgb.DMatrix(train_x, label=train_y)\n", "dvalid = xgb.DMatrix(valid_x, label=valid_y)\n", "dtest = xgb.DMatrix(test_x, label=test_y)\n", "\n", "num_classes= len(set(valid_y.to_list()))\n", "print('number of classes:', num_classes)" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }