From 77bf491aa63874a89c86758354bce392af626c1d Mon Sep 17 00:00:00 2001 From: Felix Mucha <3016498@stud.hs-mannheim.de> Date: Mon, 27 Jan 2025 07:11:32 +0100 Subject: [PATCH] extended ml pipeline, todo: model architecture --- transformer.py => transformer_1a.py | 9 +- transformer_1b.py | 199 +++++++++++++++ transformer_evaluation.ipynb | 363 ++++++++++++++++++++++++++++ 3 files changed, 570 insertions(+), 1 deletion(-) rename transformer.py => transformer_1a.py (96%) create mode 100644 transformer_1b.py create mode 100644 transformer_evaluation.ipynb diff --git a/transformer.py b/transformer_1a.py similarity index 96% rename from transformer.py rename to transformer_1a.py index 01faaac..bc8de3e 100644 --- a/transformer.py +++ b/transformer_1a.py @@ -42,6 +42,13 @@ import time import torchvision torchvision.disable_beta_transforms_warning() + +def get_device(verbose=False): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if verbose: + print('Using device:', device) + return device + # Test if GPU is available DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print('Using device:', DEVICE) @@ -69,7 +76,7 @@ def pad_sequences(sequences, MAX_LEN): class HumorDataset(torch.utils.data.Dataset): def __init__(self, encodings, labels): self.encodings = encodings - self.labels = labels + self.labels = labels.reset_index(drop=True) def __getitem__(self, idx): item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)} diff --git a/transformer_1b.py b/transformer_1b.py new file mode 100644 index 0000000..05ffd62 --- /dev/null +++ b/transformer_1b.py @@ -0,0 +1,199 @@ +""" +This file contains the transformer model. 
+""" + + +# TODO refactor the code +# TODO create ml helper script +# TODO create ml evaluation script + +# TODO track overfitting better +# TODO validate model in training (accuracy, loss, etc) + +# TODO set length to a constant value which is the max length of the sentences or nearly + + +# TODO user gloVe embeddings + +#TODO: add attention mask +# TODO: add positional encoding +#TODO: add dropout (if needed) + +import time +import json + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader +from transformers import AdamW + +from sklearn.metrics import accuracy_score + +import ml_helper +import ml_history + +class TransformerBinaryClassifier(nn.Module): + def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1): + super(TransformerBinaryClassifier, self).__init__() + self.embedding = nn.Embedding(vocab_size, embed_dim) + self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout) + self.fc = nn.Linear(embed_dim, 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_ids): + input_ids = input_ids.long() + embedded = self.embedding(input_ids) + transformer_output = self.transformer(embedded, embedded) + pooled_output = transformer_output.mean(dim=1) + logits = self.fc(pooled_output) + return self.sigmoid(logits) + + + +if __name__ == "__main__": + + # Load the data + data_path = 'data/idx_based_padded' + + train_dataset = torch.load(data_path + '/train.pt') + test_dataset = torch.load(data_path + '/test.pt') + val_dataset = torch.load(data_path + '/val.pt') + + # +2 for padding and unk tokens + vocab_size = train_dataset.vocab_size + 2 + embed_dim = 100 #train_dataset.emb_dim + + # NOTE: Info comes from data explore notebook: 280 is max length, + # 139 contains 80% and 192 contains 95% of the data + max_len = 280 + + device = ml_helper.get_device(verbose=True) + + # Model hyperparameters + num_heads = 2 + 
num_layers = 2 + hidden_dim = 256 + + model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim) + + # Training parameters + epochs = 3 #3 + batch_size = 8 + learning_rate = 2e-5 + + # Optimizer and loss function + optimizer = AdamW(model.parameters(), lr=learning_rate) + criterion = nn.BCEWithLogitsLoss() + + + # Data loaders + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) + + + ################################################################################################ + # Training + ################################################################################################ + + # Initialize the history + history = ml_history.History() + + # Model to device + model.to(device) + + print("Starting training...") + start_training_time = time.time() + + # Training loop + model.train() + for epoch in range(epochs): + # init batch tracking + epoch_start_time = time.time() + history.batch_reset() + + for batch in train_loader: + optimizer.zero_grad() + # prepare batch + input_ids = batch['input_ids'].to(device) + labels = batch['labels'].unsqueeze(1).to(device) + # forward pass + outputs = model(input_ids) + loss = criterion(outputs, labels) + # backward pass + loss.backward() + optimizer.step() + # calculate accuracy train + preds = outputs.round() + train_acc = accuracy_score(labels.cpu().detach().numpy(), + preds.cpu().detach().numpy()) + # update batch history + history.batch_update_train(loss.item(), train_acc) + + # calculate accuracy val + model.eval() + with torch.no_grad(): + for val_batch in val_loader: + val_input_ids = val_batch['input_ids'].to(device) + val_labels_batch = val_batch['labels'].unsqueeze(1).to(device) + val_outputs = model(val_input_ids) + val_acc = accuracy_score(val_outputs.round().cpu().numpy(), + 
val_labels_batch.cpu().numpy()) + history.batch_update_val(val_acc) + model.train() + + # update epoch history + history.update() + + epoch_end_time = time.time() + + print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}") + + end_training_time = time.time() + print(f"Training finished in {end_training_time - start_training_time:.2f} seconds") + + + ################################################################################################ + # Evaluation + ################################################################################################ + print("Starting evaluation...") + + model.eval() + predictions, true_labels = [], [] + with torch.no_grad(): + for batch in test_loader: + input_ids = batch['input_ids'].to(device) + labels = batch['labels'].unsqueeze(1).to(device) + + outputs = model(input_ids) + preds = outputs.round() + predictions.extend(preds.cpu().numpy()) + true_labels.extend(labels.cpu().numpy()) + + accuracy = accuracy_score(true_labels, predictions) + print(f"Accuracy: {accuracy}") + + + ################################################################################################ + # Save model and hyperparameters + ################################################################################################ + timestamp = time.strftime("%Y%m%d-%H%M%S") + + ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp, + max_len=max_len, + vocab_size=vocab_size, + embed_dim=embed_dim, + num_heads=num_heads, + num_layers=num_layers, + hidden_dim=hidden_dim, + epochs=epochs, + batch_size=batch_size, + learning_rate=learning_rate) + + #save history + + history_path = f'models/transformer_history_{timestamp}.json' + with open(history_path, 'w') as f: + json.dump(history.get_history(), f) \ No newline at end of file diff --git 
a/transformer_evaluation.ipynb b/transformer_evaluation.ipynb new file mode 100644 index 0000000..412f20e --- /dev/null +++ b/transformer_evaluation.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import json\n", + "import numpy as np\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import ml_helper" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TODO: \n", + "- clean up and refactor, maybe into an ml_plot.py with plot functions" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading model from: models/transformer_acc_0.5056_20250127-061459.pth\n", + "Loading history from: models/transformer_history_20250127-061459.json\n", + "Loading hyperparameters from: models/transformer_para_acc_0.5056_20250127-061459.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_16796\\1644685603.py:5: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model = torch.load(model_path)\n" + ] + } + ], + "source": [ + "# load newest model\n", + "path = 'models/'\n", + "model_path = ml_helper.get_newest_model_path(path)\n", + "print(\"Loading model from: \", model_path)\n", + "model = torch.load(model_path)\n", + "\n", + "# load history\n", + "history_path = ml_helper.get_newest_model_path(path, name=\"history\", extension=\".json\")\n", + "print(\"Loading history from: \", history_path)\n", + "with open(history_path) as f:\n", + " history = json.load(f)\n", + "\n", + "# load hyperparameters\n", + "hyperparameters_path = ml_helper.get_newest_model_path(path, name=\"para\", extension=\".json\")\n", + "print(\"Loading hyperparameters from: \", hyperparameters_path)\n", + "with open(hyperparameters_path) as f:\n", + " params = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "History:\n", + "{\n", + " \"loss\": [\n", + " 0.6977859839254063,\n", + " 0.6934245683644947,\n", + " 0.6932587604291043\n", + " ],\n", + " \"train_acc\": [\n", + " 0.5086032388663968,\n", + " 0.5080971659919028,\n", + " 0.5063259109311741\n", + " ],\n", + " \"val_acc\": [\n", + " 0.5093117408906882,\n", + " 0.5093117408906882,\n", + " 0.5093117408906882\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "# print history\n", + "print(\"History:\")\n", + "print(json.dumps(history, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot training accuracy vs validation accuracy\n", + "plt.plot(history['train_acc'], label='train_acc')\n", + "plt.plot(history['val_acc'], label='val_acc')\n", + "plt.legend()\n", + "plt.title('Training accuracy vs Validation accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "# set y-axis limits to 0-1\n", + "#plt.ylim(0, 1)\n", + "# set x-axis limits to integer steps\n", + "plt.xticks(np.arange(0, len(history['train_acc']), 1))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(8000, 6)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the data from csv\n", + "df = pd.read_csv('data/hack.csv')\n", + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_16796\\4202493223.py:4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " train_dataset = torch.load(data_path + '/train.pt')\n", + "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_16796\\4202493223.py:5: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " test_dataset = torch.load(data_path + '/test.pt')\n", + "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_16796\\4202493223.py:6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " val_dataset = torch.load(data_path + '/val.pt')\n" + ] + } + ], + "source": [ + "# Load the data\n", + "data_path = 'data/idx_based_padded'\n", + "\n", + "train_dataset = torch.load(data_path + '/train.pt')\n", + "test_dataset = torch.load(data_path + '/test.pt') \n", + "val_dataset = torch.load(data_path + '/val.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "textn" + ] + } + ], + "source": [ + "print(train_dataset.original_indices)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'max_len': 280, 'vocab_size': 10556, 'embed_dim': 100, 'num_heads': 2, 'num_layers': 2, 'hidden_dim': 256, 'epochs': 3, 'batch_size': 8, 'learning_rate': 2e-05, 'accuracy': 0.5056}\n" + ] + } + ], + "source": [ + "print(params)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", + " warnings.warn(\n", + "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_16796\\3082896325.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_path))\n" + ] + } + ], + "source": [ + "import transformer_1b\n", + "\n", + "model = transformer_1b.TransformerBinaryClassifier(params['vocab_size'], params['embed_dim'], \n", + " params['num_heads'], params['num_layers'], \n", + " params['hidden_dim'])\n", + "\n", + "model.load_state_dict(torch.load(model_path))\n", + "\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "predictions = []\n", + "labels = []\n", + "\n", + "# Predict on test set\n", + "model.eval()\n", + "with torch.no_grad():\n", + " data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)\n", + " for batch in data_loader:\n", + " input_ids = batch['input_ids'].to(device)\n", + " labels_batch = batch['labels'].unsqueeze(1).to(device)\n", + " outputs = model(input_ids)\n", + " outputs = outputs.cpu().round().numpy()\n", + " labels_batch = labels_batch.cpu().numpy()\n", + "\n", + " predictions.append(outputs)\n", + " labels.append(labels_batch)\n", + "\n", + "# Concatenate all predictions\n", + "predictions = np.concatenate(predictions)\n", + "labels = np.concatenate(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + 
"metadata": {}, + "outputs": [], + "source": [ + "# get df data for original indices\n", + "df_test = df.iloc[test_dataset.original_indices].copy()\n", + "df_test['prediction'] = predictions\n", + "df_test['label'] = labels\n", + "df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])\n", + "\n", + "df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "997\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import patches as mpatches\n", + "\n", + "median_rating = df['humor_rating'].median()\n", + "# get first index where humor_rating is greater than median_rating\n", + "median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]\n", + "print(median_idx)\n", + "# range idx for len df_test\n", + "range_idx = range(len(df_test))\n", + "colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})\n", + "# bar plot for each df_test humor_rating value \n", + "plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)\n", + "# vertical line for True/False cut off\n", + "plt.axvline(x=median_idx, color='black', linestyle='--')\n", + "# Create a legend handles\n", + "green_patch = mpatches.Patch(color='g', label='Correct Prediction')\n", + "red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')\n", + "line_patch = mpatches.Patch(color='black', label='humor_rating cut off')\n", + "plt.title('Humor Rating vs Prediction for Test Set')\n", + "plt.xlabel('Index')\n", + "plt.ylabel('Humor Rating')\n", + "plt.legend(handles=[green_patch, red_patch, line_patch])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NOTE: \n", + "- model currently only predicts 0 therefore one site is green and other red\n", + "- plot can be helpfull to identify if around the cut off the model gets confused" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}