diff --git a/notebooks/decision_tree.ipynb b/notebooks/decision_tree.ipynb
new file mode 100644
index 0000000..471cafa
--- /dev/null
+++ b/notebooks/decision_tree.ipynb
@@ -0,0 +1,152 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Decision Tree"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports and Configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sqlite3\n",
+    "from contextlib import closing\n",
+    "\n",
+    "import pandas as pd\n",
+    "import xgboost as xgb\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
+    "\n",
+    "# location of the SQLite feature database, relative to this notebook\n",
+    "DB_PATH = '../features.db'\n",
+    "# string label -> integer class id used as the training target\n",
+    "LABEL_MAP = {'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Data from Database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# closing() guarantees the connection is released even if a query fails;\n",
+    "# sqlite3's own context manager only commits/rolls back, it does not close\n",
+    "with closing(sqlite3.connect(DB_PATH)) as conn:\n",
+    "    # get training, validation and test data\n",
+    "    train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n",
+    "    valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n",
+    "    test = pd.read_sql_query(\"SELECT * FROM test\", conn)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Format Data for Machine Learning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_features_target(frame):\n",
+    "    \"\"\"Split a raw table into features and an integer-encoded target.\n",
+    "\n",
+    "    Drops the 'y' and 'id' columns to form the features and encodes 'y'\n",
+    "    with LABEL_MAP. Raises ValueError if 'y' holds a label missing from\n",
+    "    LABEL_MAP, because Series.map would otherwise silently yield NaN.\n",
+    "    \"\"\"\n",
+    "    target = frame['y'].map(LABEL_MAP)\n",
+    "    unmapped = target.isna()\n",
+    "    if unmapped.any():\n",
+    "        unknown = sorted(frame.loc[unmapped, 'y'].astype(str).unique())\n",
+    "        raise ValueError(f'unmapped labels in y: {unknown}')\n",
+    "    features = frame.drop(columns=['y', 'id'])\n",
+    "    return features, target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get the target and features\n",
+    "train_x, train_y = split_features_target(train)\n",
+    "valid_x, valid_y = split_features_target(valid)\n",
+    "test_x, test_y = split_features_target(test)\n",
+    "\n",
+    "print('train_x shape:', train_x.shape)\n",
+    "print('test_x shape:', test_x.shape)\n",
+    "print('valid_x shape:', valid_x.shape)\n",
+    "# print column names\n",
+    "feature_names = train_x.columns.to_list()\n",
+    "print('features:', feature_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create an imputer object with a mean filling strategy\n",
+    "imputer = SimpleImputer(strategy='mean')\n",
+    "# fit on the training split only so validation/test statistics do not leak\n",
+    "train_x = imputer.fit_transform(train_x)\n",
+    "valid_x = imputer.transform(valid_x)\n",
+    "test_x = imputer.transform(test_x)\n",
+    "\n",
+    "# Scale features with the min/max learned from the training split\n",
+    "scaler = MinMaxScaler()\n",
+    "train_x = scaler.fit_transform(train_x)\n",
+    "valid_x = scaler.transform(valid_x)\n",
+    "test_x = scaler.transform(test_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use xgboost\n",
+    "dtrain = xgb.DMatrix(train_x, label=train_y)\n",
+    "dvalid = xgb.DMatrix(valid_x, label=valid_y)\n",
+    "dtest = xgb.DMatrix(test_x, label=test_y)\n",
+    "\n",
+    "# class ids are assigned by LABEL_MAP, so the class count is its size;\n",
+    "# counting distinct labels in one split undercounts if a class is absent\n",
+    "num_classes = len(LABEL_MAP)\n",
+    "print('number of classes:', num_classes)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}