194 lines
5.4 KiB
Plaintext
194 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Decison Tree"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sqlite3\n",
|
|
"import os\n",
|
|
"from datetime import datetime\n",
|
|
"from joblib import dump, load\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import xgboost as xgb\n",
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
|
"from sklearn.metrics import confusion_matrix, f1_score\n",
|
|
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from sklearn.preprocessing import MinMaxScaler\n",
|
|
"\n",
|
|
"import seaborn as sns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Import Data from Database"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# connect to the database\n",
|
|
"conn = sqlite3.connect('../features.db')\n",
|
|
"c = conn.cursor()\n",
|
|
"# get training, validation and test data\n",
|
|
"train = pd.read_sql_query(\"SELECT * FROM train\", conn)\n",
|
|
"valid = pd.read_sql_query(\"SELECT * FROM validation\", conn)\n",
|
|
"test = pd.read_sql_query(\"SELECT * FROM test\", conn)\n",
|
|
"# close the connection\n",
|
|
"conn.close()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Format Data for Machine Learning"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"train_x shape: (3502, 10)\n",
|
|
"test_x shape: (438, 10)\n",
|
|
"valid_x shape: (438, 10)\n",
|
|
"features: ['age', 'gender', 'artial_rate', 'ventricular_rate', 'qrs_duration', 'qt_length', 'qrs_count', 'q_peak', 'r_axis', 't_axis']\n",
|
|
"number of classes: 4\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# get the target and features\n",
|
|
"train_y = train['y']\n",
|
|
"train_y = train_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
|
|
"train_x = train.drop(columns=['y'])\n",
|
|
"\n",
|
|
"valid_y = valid['y']\n",
|
|
"valid_y = valid_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
|
|
"valid_x = valid.drop(columns=['y'])\n",
|
|
"\n",
|
|
"test_y = test['y']\n",
|
|
"test_y = test_y.map({'GSVT': 0, 'AFIB': 1, 'SR': 2, 'SB': 3})\n",
|
|
"test_x = test.drop(columns=['y'])\n",
|
|
"\n",
|
|
"# drop id column\n",
|
|
"train_x = train_x.drop(columns=['id'])\n",
|
|
"valid_x = valid_x.drop(columns=['id'])\n",
|
|
"test_x = test_x.drop(columns=['id'])\n",
|
|
"\n",
|
|
"print('train_x shape:', train_x.shape)\n",
|
|
"print('test_x shape:', test_x.shape)\n",
|
|
"print('valid_x shape:', valid_x.shape)\n",
|
|
"# print column names\n",
|
|
"print('features:', train_x.columns.to_list())\n",
|
|
"feature_names = train_x.columns.to_list()\n",
|
|
"\n",
|
|
"# Create an imputer object with a mean filling strategy\n",
|
|
"imputer = SimpleImputer(strategy='mean')\n",
|
|
"\n",
|
|
"train_x = imputer.fit_transform(train_x)\n",
|
|
"valid_x = imputer.transform(valid_x)\n",
|
|
"test_x = imputer.transform(test_x)\n",
|
|
"\n",
|
|
"# Scale Data between 0 and 1\n",
|
|
"scaler = MinMaxScaler()\n",
|
|
"# Fit the scaler to your data and then transform it\n",
|
|
"train_x = scaler.fit_transform(train_x)\n",
|
|
"valid_x = scaler.transform(valid_x)\n",
|
|
"test_x = scaler.transform(test_x)\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"# use xgboost\n",
|
|
"dtrain = xgb.DMatrix(train_x, label=train_y)\n",
|
|
"dvalid = xgb.DMatrix(valid_x, label=valid_y)\n",
|
|
"dtest = xgb.DMatrix(test_x, label=test_y)\n",
|
|
"\n",
|
|
"num_classes= len(set(valid_y.to_list()))\n",
|
|
"print('number of classes:', num_classes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Validierungsgenauigkeit: 0.7557077625570776\n",
|
|
"Testgenauigkeit: 0.7922374429223744\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"\n",
|
|
"\n",
|
|
"# Beispiel: Begrenzung der Tiefe des Baumes\n",
|
|
"dt_classifier = DecisionTreeClassifier(max_depth=5)\n",
|
|
"\n",
|
|
"# Schritt 3: Trainieren des Modells mit Trainingsdaten\n",
|
|
"dt_classifier.fit(train_x, train_y)\n",
|
|
"\n",
|
|
"# Schritt 4: Bewertung des Modells mit Validierungsdaten\n",
|
|
"valid_pred = dt_classifier.predict(valid_x)\n",
|
|
"valid_accuracy = accuracy_score(valid_y, valid_pred)\n",
|
|
"print(f'Validierungsgenauigkeit: {valid_accuracy}')\n",
|
|
"\n",
|
|
"# Schritt 5: Hyperparameter-Optimierung\n",
|
|
"\n",
|
|
"# Schritt 6: Endgültige Bewertung mit Testdaten\n",
|
|
"test_pred = dt_classifier.predict(test_x)\n",
|
|
"test_accuracy = accuracy_score(test_y, test_pred)\n",
|
|
"print(f'Testgenauigkeit: {test_accuracy}')\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|