diff --git a/transformer.ipynb b/transformer.ipynb new file mode 100644 index 0000000..cdbeff2 --- /dev/null +++ b/transformer.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KuFFT6LrB6Fe" + }, + "outputs": [], + "source": [ + "import time\n", + "import json\n", + "import math\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import DataLoader\n", + "from torch.optim.lr_scheduler import ReduceLROnPlateau\n", + "\n", + "from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n", + "from sklearn.model_selection import KFold\n", + "# local imports\n", + "import ml_evaluation as ml_eval\n", + "import ml_helper\n", + "import ml_history\n", + "import dataset_generator as data_gen\n", + "# class imports\n", + "import HumorDataset as humor_ds\n", + "import EarlyStopping\n", + "import BalancedCELoss\n", + "\n", + "\n", + "# architecture inspired:\n", + "# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n", + "\n", + "# TODO: maybe KFold for cross validation?\n", + "\n", + "\n", + "#TODO: softmax for output layer or loss wih logits\n", + "#TODO: model.train() in training loop\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generalisation ideas:\n", + "- **data augmentation** with Pseudo-labelling (if acc is it better)\n", + "- **Adversarial Training** adding small perturbations to the input data during training\n", + "- **Multi Sample Dropout** During training, multiple forward passes are performed with different dropout masks, and the resulting predictions are averaged.\n", + "\n", + "### Learnings from papers:\n", + "- all used pretrained models berta like 
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Based on https://pytorch.org/tutorials/beginner/transformer_tutorial.html,
    with the table stored as (1, positions, d_model) so it broadcasts over the
    batch dimension.

    Args:
        d_model: Width of the embeddings (size of the last input dimension).
        vocab_size: Number of positions to precompute, i.e. the longest
            sequence that can be encoded. The name is kept for backward
            compatibility; it is a sequence-length limit, not a vocabulary size.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        # Leading singleton dimension lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions."""
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)


class TransformerBinaryClassifier(nn.Module):
    """Text classifier built on a PyTorch ``TransformerEncoder``.

    Token indices are embedded with the given (trainable) pretrained vectors,
    combined with sinusoidal positions, encoded, mean-pooled over the sequence
    dimension, batch-normalised and projected to two class logits.

    Args:
        embeddings: 2-D tensor of pretrained vectors, (vocab_size, d_model).
        nhead: Number of attention heads; must divide d_model.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        num_layers: Number of stacked encoder layers.
        positional_dropout: Dropout applied after the positional encoding.
        classifier_dropout: Dropout used inside the encoder layers.
        activation: Activation of the encoder layers' feed-forward part.
        max_len: Optional number of positions to precompute. ``None`` keeps
            the historical behaviour of one position per vocabulary entry, so
            the ``pe`` buffer shape of existing checkpoints is unchanged; pass
            the real maximum sequence length to avoid a needlessly large table.
    """

    def __init__(
        self,
        embeddings,
        nhead=8,
        dim_feedforward=2048,
        num_layers=6,
        positional_dropout=0.1,
        classifier_dropout=0.1,
        activation="relu",
        max_len=None,
    ):

        super().__init__()

        vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)

        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=positional_dropout,
            vocab_size=vocab_size if max_len is None else max_len,
        )

        # batch_first=True is required: forward() feeds (batch, seq, d_model)
        # tensors. Without it the encoder would treat the batch dimension as
        # the sequence and attend across unrelated samples.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=classifier_dropout,
            activation=activation,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        # Normalise the pooled representation before the linear head.
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.classifier = nn.Linear(d_model, 2)
        self.d_model = d_model
        # Not applied in forward(); kept so existing attribute access still works.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map token indices of shape (batch, seq) to raw logits of shape (batch, 2)."""
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        # Mean-pool over the sequence dimension.
        x = x.mean(dim=1)
        x = self.batch_norm(x)
        # Raw logits are returned; no softmax is applied here.
        return self.classifier(x)
'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n", + " 'lr_scheduler_patience': 3, # 3 (2 to 5)\n", + "\n", + " # model parameters\n", + " 'nhead': 2, # 5\n", + " 'num_layers': 3, # 6\n", + " 'hidden_dim': 10, # 50\n", + "\n", + " # regularization parameters\n", + " 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n", + " 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n", + " 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n", + "}\n", + "\n", + "# Model initialization\n", + "model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n", + " nhead=params['nhead'], \n", + " num_layers=params['num_layers'], \n", + " dim_feedforward=params['hidden_dim'],\n", + " positional_dropout=params['positional_dropout'],\n", + " classifier_dropout=params['classifier_dropout']\n", + " )\n", + "model.to(device)\n", + "print('model created')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### create datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "datasets length: 3945 493\n", + "train: 124, val: 16, test: 16\n" + ] + } + ], + "source": [ + "# NOTE: Info comes from data explore notebook: 280 is max length,\n", + "# 139 contains 80% and 192 contains 95% of the data\n", + "max_len = 280\n", + "\n", + "train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n", + "val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n", + "test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n", + "\n", + "print('datasets length:', len(train_dataset), len(val_dataset))\n", + "#NOTE: overfitting test\n", + "#train_dataset.labels = train_dataset.labels[:100]\n", + "#train_dataset.texts = train_dataset.texts[:100]\n", + "\n", + "train_loader = DataLoader(train_dataset, 
batch_size=params['batch_size'], shuffle=True)\n", + "val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n", + "test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n", + "\n", + "# NOTE: samller because of batches not all data\n", + "print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set training requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "#criterion = nn.CrossEntropyLoss()\n", + "#criterion = nn.BCEWithLogitsLoss()\n", + "# alpha determines the weight of the imbalance penalty\n", + "criterion = BalancedCELoss.BalancedCELoss(alpha=params['equalize_classes_loss_factor'])\n", + "\n", + "\n", + "optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n", + " lr=params['lr']) #, \n", + " #weight_decay=params['weight_decay'])\n", + "\"\"\"\n", + "scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n", + " factor=params['lr_scheduler_factor'],\n", + " patience=params['lr_scheduler_patience'],\n", + " verbose=True)\n", + "\"\"\"\n", + "early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training loop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/100 - 2.87s - loss: 0.7935 - accuracy: 0.4969 - val_loss: 2.3949 - val_accuracy: 0.4742\n", + "Epoch 2/100 - 2.70s - loss: 0.8777 - accuracy: 0.5114 - val_loss: 0.7464 - val_accuracy: 0.5005\n", + "Epoch 3/100 - 2.67s - loss: 0.8741 - accuracy: 0.4914 - val_loss: 0.7593 - val_accuracy: 0.5044\n", + "Epoch 4/100 - 2.65s - loss: 0.8460 - accuracy: 0.4999 - 
val_loss: 0.8106 - val_accuracy: 0.5287\n", + "Epoch 5/100 - 2.65s - loss: 0.8504 - accuracy: 0.5051 - val_loss: 0.8417 - val_accuracy: 0.4917\n", + "Epoch 6/100 - 2.70s - loss: 0.8398 - accuracy: 0.5114 - val_loss: 0.7328 - val_accuracy: 0.5365\n", + "Epoch 7/100 - 2.67s - loss: 0.8218 - accuracy: 0.5227 - val_loss: 0.8496 - val_accuracy: 0.5258\n", + "Epoch 8/100 - 2.63s - loss: 0.8164 - accuracy: 0.5248 - val_loss: 1.0123 - val_accuracy: 0.4742\n", + "Epoch 9/100 - 2.70s - loss: 0.8178 - accuracy: 0.5225 - val_loss: 0.8091 - val_accuracy: 0.5248\n", + "Epoch 10/100 - 2.75s - loss: 0.8085 - accuracy: 0.5308 - val_loss: 0.8591 - val_accuracy: 0.5258\n", + "Epoch 11/100 - 2.72s - loss: 0.8109 - accuracy: 0.5396 - val_loss: 0.7448 - val_accuracy: 0.5766\n", + "Epoch 12/100 - 2.61s - loss: 0.7906 - accuracy: 0.5392 - val_loss: 0.7868 - val_accuracy: 0.5367\n", + "Epoch 13/100 - 2.62s - loss: 0.7814 - accuracy: 0.5413 - val_loss: 0.7026 - val_accuracy: 0.5805\n", + "Epoch 14/100 - 2.64s - loss: 0.7705 - accuracy: 0.5549 - val_loss: 0.7042 - val_accuracy: 0.5874\n", + "Epoch 15/100 - 2.67s - loss: 0.7804 - accuracy: 0.5703 - val_loss: 0.7103 - val_accuracy: 0.5972\n", + "Epoch 16/100 - 2.64s - loss: 0.7535 - accuracy: 0.5731 - val_loss: 0.7967 - val_accuracy: 0.5551\n", + "Epoch 17/100 - 2.61s - loss: 0.7696 - accuracy: 0.5748 - val_loss: 0.8019 - val_accuracy: 0.5406\n", + "Epoch 18/100 - 2.65s - loss: 0.7589 - accuracy: 0.5816 - val_loss: 0.9402 - val_accuracy: 0.5278\n", + "Epoch 19/100 - 2.73s - loss: 0.7952 - accuracy: 0.5621 - val_loss: 0.7648 - val_accuracy: 0.5581\n", + "Epoch 20/100 - 2.72s - loss: 0.7454 - accuracy: 0.5914 - val_loss: 0.8648 - val_accuracy: 0.5239\n", + "Epoch 21/100 - 2.61s - loss: 0.7232 - accuracy: 0.6027 - val_loss: 0.8158 - val_accuracy: 0.5395\n", + "Epoch 22/100 - 2.59s - loss: 0.7286 - accuracy: 0.6176 - val_loss: 0.6909 - val_accuracy: 0.6303\n", + "Epoch 23/100 - 2.74s - loss: 0.7111 - accuracy: 0.6171 - val_loss: 0.6731 - 
val_accuracy: 0.6322\n", + "Epoch 24/100 - 2.68s - loss: 0.7176 - accuracy: 0.6195 - val_loss: 0.8146 - val_accuracy: 0.5532\n", + "Epoch 25/100 - 2.65s - loss: 0.7193 - accuracy: 0.6134 - val_loss: 0.7866 - val_accuracy: 0.5727\n", + "Epoch 26/100 - 2.65s - loss: 0.6916 - accuracy: 0.6349 - val_loss: 0.7352 - val_accuracy: 0.5883\n", + "Epoch 27/100 - 2.68s - loss: 0.7175 - accuracy: 0.6147 - val_loss: 0.6878 - val_accuracy: 0.6157\n", + "Epoch 28/100 - 2.70s - loss: 0.6831 - accuracy: 0.6403 - val_loss: 0.8379 - val_accuracy: 0.5454\n", + "Epoch 29/100 - 2.68s - loss: 0.6874 - accuracy: 0.6381 - val_loss: 0.7423 - val_accuracy: 0.6059\n", + "Epoch 30/100 - 2.62s - loss: 0.6790 - accuracy: 0.6509 - val_loss: 0.7564 - val_accuracy: 0.5816\n", + "Epoch 31/100 - 2.66s - loss: 0.6696 - accuracy: 0.6509 - val_loss: 0.6831 - val_accuracy: 0.5983\n", + "Epoch 32/100 - 2.63s - loss: 0.6541 - accuracy: 0.6646 - val_loss: 0.6756 - val_accuracy: 0.6333\n", + "Epoch 33/100 - 2.69s - loss: 0.6396 - accuracy: 0.6744 - val_loss: 0.6731 - val_accuracy: 0.6274\n", + "Epoch 34/100 - 2.63s - loss: 0.6543 - accuracy: 0.6666 - val_loss: 0.6772 - val_accuracy: 0.6176\n", + "Epoch 35/100 - 2.70s - loss: 0.6387 - accuracy: 0.6775 - val_loss: 0.7236 - val_accuracy: 0.6157\n", + "Epoch 36/100 - 2.67s - loss: 0.6173 - accuracy: 0.6958 - val_loss: 0.6906 - val_accuracy: 0.6196\n", + "Epoch 37/100 - 2.75s - loss: 0.6283 - accuracy: 0.6857 - val_loss: 0.7336 - val_accuracy: 0.6235\n", + "Epoch 38/100 - 2.75s - loss: 0.6012 - accuracy: 0.7007 - val_loss: 0.7334 - val_accuracy: 0.6244\n", + "Epoch 39/100 - 2.71s - loss: 0.5942 - accuracy: 0.7105 - val_loss: 0.6830 - val_accuracy: 0.6352\n", + "Epoch 40/100 - 2.69s - loss: 0.5850 - accuracy: 0.7192 - val_loss: 0.7207 - val_accuracy: 0.6313\n", + "Epoch 41/100 - 2.62s - loss: 0.5905 - accuracy: 0.7138 - val_loss: 0.7521 - val_accuracy: 0.6137\n", + "Epoch 42/100 - 2.67s - loss: 0.5869 - accuracy: 0.7198 - val_loss: 0.7848 - val_accuracy: 
0.5894\n", + "Epoch 43/100 - 2.71s - loss: 0.5666 - accuracy: 0.7316 - val_loss: 0.7337 - val_accuracy: 0.6274\n", + "Epoch 44/100 - 2.68s - loss: 0.5588 - accuracy: 0.7350 - val_loss: 0.7804 - val_accuracy: 0.6118\n", + "Epoch 45/100 - 2.66s - loss: 0.5488 - accuracy: 0.7442 - val_loss: 0.7241 - val_accuracy: 0.6020\n", + "Epoch 46/100 - 2.72s - loss: 0.5505 - accuracy: 0.7423 - val_loss: 0.7173 - val_accuracy: 0.6331\n", + "Epoch 47/100 - 2.76s - loss: 0.5269 - accuracy: 0.7574 - val_loss: 0.7258 - val_accuracy: 0.6283\n", + "Epoch 48/100 - 2.68s - loss: 0.5336 - accuracy: 0.7543 - val_loss: 0.7117 - val_accuracy: 0.6351\n", + "Epoch 49/100 - 2.64s - loss: 0.5198 - accuracy: 0.7622 - val_loss: 0.7552 - val_accuracy: 0.6264\n", + "Epoch 50/100 - 2.68s - loss: 0.5027 - accuracy: 0.7742 - val_loss: 0.7510 - val_accuracy: 0.6089\n", + "Epoch 51/100 - 2.73s - loss: 0.4982 - accuracy: 0.7701 - val_loss: 0.7405 - val_accuracy: 0.6137\n", + "Epoch 52/100 - 2.69s - loss: 0.4766 - accuracy: 0.7866 - val_loss: 0.7566 - val_accuracy: 0.6098\n", + "Epoch 53/100 - 2.65s - loss: 0.4801 - accuracy: 0.7846 - val_loss: 0.7657 - val_accuracy: 0.5972\n", + "Epoch 54/100 - 2.64s - loss: 0.4782 - accuracy: 0.7889 - val_loss: 0.7740 - val_accuracy: 0.6107\n", + "Epoch 55/100 - 2.77s - loss: 0.4609 - accuracy: 0.8001 - val_loss: 0.7642 - val_accuracy: 0.6342\n", + "Epoch 56/100 - 2.69s - loss: 0.4562 - accuracy: 0.7939 - val_loss: 0.7496 - val_accuracy: 0.6313\n", + "Epoch 57/100 - 2.69s - loss: 0.4294 - accuracy: 0.8175 - val_loss: 0.7696 - val_accuracy: 0.6187\n", + "Epoch 58/100 - 2.60s - loss: 0.4270 - accuracy: 0.8222 - val_loss: 0.7816 - val_accuracy: 0.6226\n", + "Epoch 59/100 - 2.61s - loss: 0.4097 - accuracy: 0.8323 - val_loss: 0.8205 - val_accuracy: 0.6107\n", + "Epoch 60/100 - 2.68s - loss: 0.4083 - accuracy: 0.8251 - val_loss: 0.8314 - val_accuracy: 0.6098\n", + "Epoch 61/100 - 2.65s - loss: 0.3924 - accuracy: 0.8430 - val_loss: 0.8286 - val_accuracy: 0.6040\n", + "Epoch 
62/100 - 2.63s - loss: 0.4108 - accuracy: 0.8326 - val_loss: 0.8096 - val_accuracy: 0.6089\n", + "Epoch 63/100 - 2.63s - loss: 0.4049 - accuracy: 0.8311 - val_loss: 0.8225 - val_accuracy: 0.6079\n", + "Epoch 64/100 - 2.69s - loss: 0.3702 - accuracy: 0.8442 - val_loss: 0.8243 - val_accuracy: 0.6274\n", + "Epoch 65/100 - 2.69s - loss: 0.3736 - accuracy: 0.8471 - val_loss: 0.8599 - val_accuracy: 0.6059\n", + "Epoch 66/100 - 2.68s - loss: 0.3671 - accuracy: 0.8536 - val_loss: 0.8497 - val_accuracy: 0.5922\n", + "Epoch 67/100 - 2.70s - loss: 0.3560 - accuracy: 0.8590 - val_loss: 0.8470 - val_accuracy: 0.6196\n", + "Epoch 68/100 - 2.69s - loss: 0.3339 - accuracy: 0.8693 - val_loss: 0.8498 - val_accuracy: 0.6196\n", + "Epoch 69/100 - 2.68s - loss: 0.3347 - accuracy: 0.8651 - val_loss: 0.8548 - val_accuracy: 0.6255\n", + "Epoch 70/100 - 2.68s - loss: 0.3335 - accuracy: 0.8681 - val_loss: 0.8906 - val_accuracy: 0.6020\n", + "Epoch 71/100 - 2.65s - loss: 0.3275 - accuracy: 0.8753 - val_loss: 0.9036 - val_accuracy: 0.6148\n", + "Epoch 72/100 - 2.69s - loss: 0.3119 - accuracy: 0.8796 - val_loss: 0.9040 - val_accuracy: 0.5894\n", + "Epoch 73/100 - 2.64s - loss: 0.3095 - accuracy: 0.8837 - val_loss: 0.9050 - val_accuracy: 0.5933\n", + "Epoch 74/100 - 2.71s - loss: 0.3097 - accuracy: 0.8819 - val_loss: 0.9232 - val_accuracy: 0.5894\n", + "Epoch 75/100 - 2.71s - loss: 0.3006 - accuracy: 0.8859 - val_loss: 0.9982 - val_accuracy: 0.5699\n", + "Epoch 76/100 - 2.72s - loss: 0.2991 - accuracy: 0.8881 - val_loss: 0.9575 - val_accuracy: 0.5796\n", + "Epoch 77/100 - 2.66s - loss: 0.2826 - accuracy: 0.8967 - val_loss: 0.9735 - val_accuracy: 0.5864\n", + "Epoch 78/100 - 2.65s - loss: 0.2756 - accuracy: 0.8984 - val_loss: 0.9550 - val_accuracy: 0.5981\n", + "Epoch 79/100 - 2.72s - loss: 0.2848 - accuracy: 0.8934 - val_loss: 0.9656 - val_accuracy: 0.5903\n", + "Epoch 80/100 - 2.72s - loss: 0.2697 - accuracy: 0.9001 - val_loss: 1.0160 - val_accuracy: 0.5855\n", + "Epoch 81/100 - 2.65s - loss: 
0.2580 - accuracy: 0.9013 - val_loss: 0.9867 - val_accuracy: 0.6137\n", + "Epoch 82/100 - 2.63s - loss: 0.2551 - accuracy: 0.9027 - val_loss: 1.0064 - val_accuracy: 0.5883\n", + "Epoch 83/100 - 2.72s - loss: 0.2568 - accuracy: 0.9042 - val_loss: 1.0603 - val_accuracy: 0.6059\n", + "Epoch 84/100 - 2.83s - loss: 0.2601 - accuracy: 0.9046 - val_loss: 1.0153 - val_accuracy: 0.5805\n", + "Epoch 85/100 - 2.70s - loss: 0.2430 - accuracy: 0.9098 - val_loss: 1.0606 - val_accuracy: 0.5864\n", + "Epoch 86/100 - 2.61s - loss: 0.2392 - accuracy: 0.9098 - val_loss: 1.0427 - val_accuracy: 0.5903\n", + "Epoch 87/100 - 2.58s - loss: 0.2298 - accuracy: 0.9148 - val_loss: 1.0372 - val_accuracy: 0.5894\n", + "Epoch 88/100 - 2.68s - loss: 0.2203 - accuracy: 0.9186 - val_loss: 1.1209 - val_accuracy: 0.5844\n", + "Epoch 89/100 - 2.64s - loss: 0.2277 - accuracy: 0.9214 - val_loss: 1.0755 - val_accuracy: 0.5766\n", + "Epoch 90/100 - 2.58s - loss: 0.2218 - accuracy: 0.9205 - val_loss: 1.1082 - val_accuracy: 0.5777\n", + "Epoch 91/100 - 2.59s - loss: 0.2071 - accuracy: 0.9275 - val_loss: 1.1067 - val_accuracy: 0.5825\n", + "Epoch 92/100 - 2.63s - loss: 0.2030 - accuracy: 0.9276 - val_loss: 1.1445 - val_accuracy: 0.5953\n", + "Epoch 93/100 - 2.80s - loss: 0.2049 - accuracy: 0.9335 - val_loss: 1.1118 - val_accuracy: 0.6010\n", + "Epoch 94/100 - 2.74s - loss: 0.1959 - accuracy: 0.9332 - val_loss: 1.1190 - val_accuracy: 0.5834\n", + "Epoch 95/100 - 2.68s - loss: 0.1939 - accuracy: 0.9350 - val_loss: 1.1235 - val_accuracy: 0.5883\n", + "Epoch 96/100 - 2.59s - loss: 0.2025 - accuracy: 0.9292 - val_loss: 1.1709 - val_accuracy: 0.5699\n", + "Epoch 97/100 - 2.69s - loss: 0.1860 - accuracy: 0.9361 - val_loss: 1.2127 - val_accuracy: 0.5844\n", + "Epoch 98/100 - 2.63s - loss: 0.1900 - accuracy: 0.9346 - val_loss: 1.1717 - val_accuracy: 0.5864\n", + "Epoch 99/100 - 2.64s - loss: 0.1750 - accuracy: 0.9398 - val_loss: 1.2376 - val_accuracy: 0.6029\n", + "Epoch 100/100 - 2.63s - loss: 0.1702 - accuracy: 
hist = ml_history.History()

# Training loop
for epoch in range(params['epochs']):
    epoch_start_time = time.time()
    hist.batch_reset()

    # Switch back to training mode at the start of EVERY epoch: the validation
    # pass below puts the model into eval mode, which would otherwise leave
    # dropout disabled and BatchNorm frozen from the second epoch onwards.
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        # Gradient clipping is opt-in; a max norm of 0 disables it.
        if params['clipping_max_norm'] > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])
        optimizer.step()
        # Update history
        hist.batch_update_train(loss.item(), outputs, labels)

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            # Update history
            hist.batch_update_val(loss.item(), outputs, labels)

    # Fold the per-batch values into the per-epoch history
    hist.update()

    epoch_end_time = time.time()
    hist.print_history(epoch + 1, params['epochs'], epoch_end_time - epoch_start_time)

    # NOTE: the LR scheduler is currently disabled. If it is re-enabled, step it
    # here with the latest validation loss, hist.history['val_loss'][-1].

    # Check early stopping.
    # NOTE(review): presumably this also checkpoints the best model to
    # best_model_filename (the evaluation cell loads that file) - confirm in
    # EarlyStopping.
    early_stopping(hist.history['val_loss'][-1], model, best_model_filename)
    # Stopping itself is intentionally left disabled so all epochs are recorded:
    # if early_stopping.early_stop:
    #     print("Early stopping")
    #     break
# Restore the checkpoint stored under best_model_filename.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs; map_location makes the load independent of the
# device the checkpoint was saved from.
state_dict = torch.load(best_model_filename, map_location=device, weights_only=True)
model.load_state_dict(state_dict)

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs, dim=1)

        all_preds.extend(predictions.cpu())
        all_labels.extend(labels.cpu())
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(f'🚀 Finale Test Accuracy: {ml_eval.get_accuracy(all_preds, all_labels):.4f}')\n", + "print(f'🚀 Finale Test F1 Score: {ml_eval.get_f1_score(all_preds, all_labels):.4f}')\n", + "\n", + "# Confusion matrix\n", + "con_plt = ml_eval.plot_confusion_matrix(all_preds, all_labels, ['0', '1'])\n", + "con_plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Class 0: 0.64\n", + "Class 1: 0.36\n" + ] + } + ], + "source": [ + "ml_eval.get_label_distribution(all_labels, all_preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ml_eval.plot_training_history(hist).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ml_eval.plot_rating_preds(all_preds, all_labels, test_dataset).show()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/transformer_1a.py b/transformer_1a.py deleted file mode 100644 index bc8de3e..0000000 --- a/transformer_1a.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -This file contains the transformer model. -""" - - -# TODO refactor the code -# TODO create ml helper script -# TODO create ml evaluation script - -# TODO track overfitting better -# TODO validate model in training (accuracy, loss, etc) - -# TODO set length to a constant value which is the max length of the sentences or nearly - - -# TODO user gloVe embeddings - -#TODO: add attention mask -# TODO: add positional encoding -#TODO: add dropout (if needed) - - -import torch -import torch.nn as nn -import torch.optim as optim - -import pandas as pd -import numpy as np -from sklearn.model_selection import train_test_split -from nltk.tokenize import word_tokenize -from transformers import BertTokenizer, BertModel - -from torch.utils.data import DataLoader -from transformers import AdamW -from sklearn.metrics import accuracy_score - -import gensim - -import time - -# Disable the warning for beta transformers -import torchvision -torchvision.disable_beta_transforms_warning() - - -def get_device(verbose=False): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - if verbose: - print('Using device:', device) - return device - -# Test if GPU is available -DEVICE = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') -print('Using device:', DEVICE) -# Input maximum length -MAX_LEN = 100 - -# download nltk data -import nltk -nltk.download('punkt') -nltk.download('punkt_tab') - -def get_embedding(model, word): - if word in model.wv: - return model.wv.key_to_index[word] - else: - return unk_index - -def encode_tokens(tokens): - return [get_embedding(model_embedding, token) for token in tokens] - -def pad_sequences(sequences, MAX_LEN): - return np.array([np.pad(seq, (0, MAX_LEN - len(seq)), mode='constant', constant_values=unk_index) if len(seq) < MAX_LEN else seq[:MAX_LEN] for seq in sequences]) - - -class HumorDataset(torch.utils.data.Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels.reset_index(drop=True) - - def __getitem__(self, idx): - item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)} - item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float) - return item - - def __len__(self): - return len(self.labels) - - -class TransformerBinaryClassifier(nn.Module): - def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1): - super(TransformerBinaryClassifier, self).__init__() - self.embedding = nn.Embedding(vocab_size, embed_dim) - self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout) - self.fc = nn.Linear(embed_dim, 1) - self.sigmoid = nn.Sigmoid() - - def forward(self, input_ids): - input_ids = input_ids.long() - embedded = self.embedding(input_ids) - transformer_output = self.transformer(embedded, embedded) - pooled_output = transformer_output.mean(dim=1) - logits = self.fc(pooled_output) - return self.sigmoid(logits) - -if __name__ == "__main__": - # Load the data from csv - df = pd.read_csv('data/hack.csv') - print(df.shape) - - # transfrom data into dataset - X = df['text'] - y = df['is_humor'] - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42) - - # Tokenize the data with nltk - train_tokens = [word_tokenize(text.lower()) for text in X_train] - test_tokens = [word_tokenize(text.lower()) for text in X_test] - - # Embed the data with word2vec - model_embedding = gensim.models.Word2Vec(train_tokens, window=5, min_count=1, workers=4) - - # Add a special token for out-of-vocabulary words - model_embedding.wv.add_vector('', np.zeros(model_embedding.vector_size)) - unk_index = model_embedding.wv.key_to_index[''] - - # Encode the tokens - train_encodings = [encode_tokens(tokens) for tokens in train_tokens] - test_encodings = [encode_tokens(tokens) for tokens in test_tokens] - - # Define the maximum sequence length - train_encodings = pad_sequences(train_encodings, MAX_LEN) - test_encodings = pad_sequences(test_encodings, MAX_LEN) - - train_dataset = HumorDataset(train_encodings, y_train.reset_index(drop=True)) - test_dataset = HumorDataset(test_encodings, y_test.reset_index(drop=True)) - - - vocab_size = len(model_embedding.wv.key_to_index) - embed_dim = model_embedding.vector_size - num_heads = 2 - num_layers = 2 - hidden_dim = 256 - - print(f"Vocabulary size: {vocab_size}") - print(f"Embedding dimension: {embed_dim}") - - model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim) - - # Training parameters - epochs = 30 #3 - batch_size = 8 - learning_rate = 2e-5 - - # Optimizer and loss function - optimizer = AdamW(model.parameters(), lr=learning_rate) - criterion = nn.BCEWithLogitsLoss() - - - # Data loaders - train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) - - for td in train_dataset: - print(td['input_ids'].shape) - print(td['labels']) - break - - for batch in train_loader: - print(batch['input_ids'].shape) - print(batch['labels']) - break - - # Model to device - model.to(DEVICE) - - print("Starting training...") - start_training_time = time.time() - 
losses = [] - # Training loop - model.train() - for epoch in range(epochs): - epoch_start_time = time.time() - batch_losses = [] - for batch in train_loader: - optimizer.zero_grad() - - input_ids = batch['input_ids'].to(DEVICE) - labels = batch['labels'].unsqueeze(1).to(DEVICE) - - outputs = model(input_ids) - loss = criterion(outputs, labels) - - loss.backward() - optimizer.step() - batch_losses.append(loss.item()) - losses.append(np.mean(batch_losses)) - epoch_end_time = time.time() - print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {losses[-1]:.5f}") - end_training_time = time.time() - print(f"Training finished in {end_training_time - start_training_time:.2f} seconds") - - print("Starting evaluation...") - # Evaluation - model.eval() - predictions, true_labels = [], [] - with torch.no_grad(): - for batch in test_loader: - input_ids = batch['input_ids'].to(DEVICE) - labels = batch['labels'].unsqueeze(1).to(DEVICE) - - outputs = model(input_ids) - preds = outputs.round() - predictions.extend(preds.cpu().numpy()) - true_labels.extend(labels.cpu().numpy()) - - accuracy = accuracy_score(true_labels, predictions) - print(f"Accuracy: {accuracy}") - - # Save the model - timestamp = time.strftime("%Y%m%d-%H%M%S") - torch.save(model.state_dict(), f'models/transformer_acc_{accuracy}_{timestamp}.pth') - print("Model saved.") - - # Save model hyperparameters as json - hyperparameters = { - 'max_len': MAX_LEN, - 'vocab_size': vocab_size, - 'embed_dim': embed_dim, - 'num_heads': num_heads, - 'num_layers': num_layers, - 'hidden_dim': hidden_dim, - 'epochs': epochs, - 'batch_size': batch_size, - 'learning_rate': learning_rate, - 'accuracy': accuracy - } - pd.DataFrame(hyperparameters, index=[0]).to_json(f'models/transformer_acc_{accuracy}_{timestamp}.json') - \ No newline at end of file diff --git a/transformer_1b.py b/transformer_1b.py deleted file mode 100644 index 05ffd62..0000000 --- a/transformer_1b.py +++ /dev/null @@ -1,199 +0,0 
@@ -""" -This file contains the transformer model. -""" - - -# TODO refactor the code -# TODO create ml helper script -# TODO create ml evaluation script - -# TODO track overfitting better -# TODO validate model in training (accuracy, loss, etc) - -# TODO set length to a constant value which is the max length of the sentences or nearly - - -# TODO user gloVe embeddings - -#TODO: add attention mask -# TODO: add positional encoding -#TODO: add dropout (if needed) - -import time -import json - -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim -from torch.utils.data import DataLoader -from transformers import AdamW - -from sklearn.metrics import accuracy_score - -import ml_helper -import ml_history - -class TransformerBinaryClassifier(nn.Module): - def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1): - super(TransformerBinaryClassifier, self).__init__() - self.embedding = nn.Embedding(vocab_size, embed_dim) - self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout) - self.fc = nn.Linear(embed_dim, 1) - self.sigmoid = nn.Sigmoid() - - def forward(self, input_ids): - input_ids = input_ids.long() - embedded = self.embedding(input_ids) - transformer_output = self.transformer(embedded, embedded) - pooled_output = transformer_output.mean(dim=1) - logits = self.fc(pooled_output) - return self.sigmoid(logits) - - - -if __name__ == "__main__": - - # Load the data - data_path = 'data/idx_based_padded' - - train_dataset = torch.load(data_path + '/train.pt') - test_dataset = torch.load(data_path + '/test.pt') - val_dataset = torch.load(data_path + '/val.pt') - - # +2 for padding and unk tokens - vocab_size = train_dataset.vocab_size + 2 - embed_dim = 100 #train_dataset.emb_dim - - # NOTE: Info comes from data explore notebook: 280 is max length, - # 139 contains 80% and 192 contains 95% of the data - max_len = 280 - - device = ml_helper.get_device(verbose=True) - - 
# Model hyperparameters - num_heads = 2 - num_layers = 2 - hidden_dim = 256 - - model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim) - - # Training parameters - epochs = 3 #3 - batch_size = 8 - learning_rate = 2e-5 - - # Optimizer and loss function - optimizer = AdamW(model.parameters(), lr=learning_rate) - criterion = nn.BCEWithLogitsLoss() - - - # Data loaders - train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) - val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) - - - ################################################################################################ - # Training - ################################################################################################ - - # Initialize the history - history = ml_history.History() - - # Model to device - model.to(device) - - print("Starting training...") - start_training_time = time.time() - - # Training loop - model.train() - for epoch in range(epochs): - # init batch tracking - epoch_start_time = time.time() - history.batch_reset() - - for batch in train_loader: - optimizer.zero_grad() - # prepare batch - input_ids = batch['input_ids'].to(device) - labels = batch['labels'].unsqueeze(1).to(device) - # forward pass - outputs = model(input_ids) - loss = criterion(outputs, labels) - # backward pass - loss.backward() - optimizer.step() - # calculate accuracy train - preds = outputs.round() - train_acc = accuracy_score(labels.cpu().detach().numpy(), - preds.cpu().detach().numpy()) - # update batch history - history.batch_update_train(loss.item(), train_acc) - - # calculate accuracy val - model.eval() - with torch.no_grad(): - for val_batch in val_loader: - val_input_ids = val_batch['input_ids'].to(device) - val_labels_batch = val_batch['labels'].unsqueeze(1).to(device) - val_outputs = model(val_input_ids) - val_acc = 
accuracy_score(val_outputs.round().cpu().numpy(), - val_labels_batch.cpu().numpy()) - history.batch_update_val(val_acc) - model.train() - - # update epoch history - history.update() - - epoch_end_time = time.time() - - print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}") - - end_training_time = time.time() - print(f"Training finished in {end_training_time - start_training_time:.2f} seconds") - - - ################################################################################################ - # Evaluation - ################################################################################################ - print("Starting evaluation...") - - model.eval() - predictions, true_labels = [], [] - with torch.no_grad(): - for batch in test_loader: - input_ids = batch['input_ids'].to(device) - labels = batch['labels'].unsqueeze(1).to(device) - - outputs = model(input_ids) - preds = outputs.round() - predictions.extend(preds.cpu().numpy()) - true_labels.extend(labels.cpu().numpy()) - - accuracy = accuracy_score(true_labels, predictions) - print(f"Accuracy: {accuracy}") - - - ################################################################################################ - # Save model and hyperparameters - ################################################################################################ - timestamp = time.strftime("%Y%m%d-%H%M%S") - - ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp, - max_len=max_len, - vocab_size=vocab_size, - embed_dim=embed_dim, - num_heads=num_heads, - num_layers=num_layers, - hidden_dim=hidden_dim, - epochs=epochs, - batch_size=batch_size, - learning_rate=learning_rate) - - #save history - - history_path = f'models/transformer_history_{timestamp}.json' - with open(history_path, 'w') as f: - json.dump(history.get_history(), f) \ No 
newline at end of file diff --git a/transformer_evaluation.ipynb b/transformer_evaluation.ipynb deleted file mode 100644 index a36a2e1..0000000 --- a/transformer_evaluation.ipynb +++ /dev/null @@ -1,414 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import json\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import ml_helper" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TODO: \n", - "- clean and refactor maybe ml_plot.py whith plot functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading model from: models/transformer_acc_0.5056_20250127-061459.pth\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_5648\\1644685603.py:5: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model = torch.load(model_path)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading history from: models/transformer_history_20250127-061459.json\n", - "Loading hyperparameters from: models/transformer_para_acc_0.5056_20250127-061459.json\n" - ] - } - ], - "source": [ - "# load newest model\n", - "path = 'models/'\n", - "model_path = ml_helper.get_newest_model_path(path)\n", - "print(\"Loading model from: \", model_path)\n", - "model = torch.load(model_path)\n", - "\n", - "# load history\n", - "history_path = ml_helper.get_newest_model_path(path, name=\"history\", extension=\".json\")\n", - "print(\"Loading history from: \", history_path)\n", - "with open(history_path) as f:\n", - " history = json.load(f)\n", - "\n", - "# load hyperparameters\n", - "hyperparameters_path = ml_helper.get_newest_model_path(path, name=\"para\", extension=\".json\")\n", - "print(\"Loading hyperparameters from: \", hyperparameters_path)\n", - "with open(hyperparameters_path) as f:\n", - " params = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "History:\n", - "{\n", - " \"loss\": [\n", - " 0.6977859839254063,\n", - " 0.6934245683644947,\n", - " 0.6932587604291043\n", - " ],\n", - " \"train_acc\": [\n", - " 0.5086032388663968,\n", - " 0.5080971659919028,\n", - " 0.5063259109311741\n", - " ],\n", - " \"val_acc\": [\n", - " 0.5093117408906882,\n", - " 0.5093117408906882,\n", - " 0.5093117408906882\n", - " ]\n", - "}\n" - ] - } - ], - "source": [ - "# print history\n", - "print(\"History:\")\n", - "print(json.dumps(history, indent=4))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# plot training accuracy vs validation accuracy\n", - "plt.plot(history['train_acc'], label='train_acc')\n", - "plt.plot(history['val_acc'], label='val_acc')\n", - "plt.legend()\n", - "plt.title('Training accuracy vs Validation accuracy')\n", - "plt.xlabel('Epoch')\n", - "plt.ylabel('Accuracy')\n", - "# set y-axis limits to 0-1\n", - "#plt.ylim(0, 1)\n", - "# set x-axis limits to integer steps\n", - "plt.xticks(np.arange(0, len(history['train_acc']), 1))\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(8000, 6)\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# Load the data from csv\n", - "df = pd.read_csv('data/hack.csv')\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_5648\\4202493223.py:4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " train_dataset = torch.load(data_path + '/train.pt')\n", - "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_5648\\4202493223.py:5: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " test_dataset = torch.load(data_path + '/test.pt')\n", - "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_5648\\4202493223.py:6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " val_dataset = torch.load(data_path + '/val.pt')\n" - ] - } - ], - "source": [ - "# Load the data\n", - "data_path = 'data/idx_based_padded'\n", - "\n", - "train_dataset = torch.load(data_path + '/train.pt')\n", - "test_dataset = torch.load(data_path + '/test.pt') \n", - "val_dataset = torch.load(data_path + '/val.pt')" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{False: 2001, True: 1944}\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# count train_dataset labels\n", - "train_labels = train_dataset.labels\n", - "unique, counts = np.unique(train_labels, return_counts=True)\n", - "print(dict(zip(unique, counts)))\n", - "\n", - "idx_range = range(0, len(train_dataset))\n", - "# plot label distribution\n", - "plt.bar(idx_range, train_labels)\n", - "plt.title('Label distribution')\n", - "plt.xlabel('Index')\n", - "plt.ylabel('Label')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- If distribution wouldnt be random it could screw up the training process" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "textn" - ] - } - ], - "source": [ - "print(train_dataset.original_indices)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'max_len': 280, 'vocab_size': 10556, 'embed_dim': 100, 'num_heads': 2, 'num_layers': 2, 'hidden_dim': 256, 'epochs': 3, 'batch_size': 8, 'learning_rate': 2e-05, 'accuracy': 0.5056}\n" - ] - } - ], - "source": [ - "print(params)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", - " warnings.warn(\n", - "C:\\Users\\felix\\AppData\\Local\\Temp\\ipykernel_5648\\3082896325.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_path))\n" - ] - } - ], - "source": [ - "import transformer_1b\n", - "\n", - "model = transformer_1b.TransformerBinaryClassifier(params['vocab_size'], params['embed_dim'], \n", - " params['num_heads'], params['num_layers'], \n", - " params['hidden_dim'])\n", - "\n", - "model.load_state_dict(torch.load(model_path))\n", - "\n", - "\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "model.to(device)\n", - "\n", - "predictions = []\n", - "labels = []\n", - "\n", - "# Predict on test set\n", - "model.eval()\n", - "with torch.no_grad():\n", - " data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)\n", - " for batch in data_loader:\n", - " input_ids = batch['input_ids'].to(device)\n", - " labels_batch = batch['labels'].unsqueeze(1).to(device)\n", - " outputs = model(input_ids)\n", - " outputs = outputs.cpu().round().numpy()\n", - " labels_batch = labels_batch.cpu().numpy()\n", - "\n", - " predictions.append(outputs)\n", - " labels.append(labels_batch)\n", - "\n", - "# Concatenate all predictions\n", - "predictions = np.concatenate(predictions)\n", - "labels = np.concatenate(labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - 
"metadata": {}, - "outputs": [], - "source": [ - "# get df data for original indices\n", - "df_test = df.iloc[test_dataset.original_indices].copy()\n", - "df_test['prediction'] = predictions\n", - "df_test['label'] = labels\n", - "df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])\n", - "\n", - "df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "997\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from matplotlib import patches as mpatches\n", - "\n", - "median_rating = df['humor_rating'].median()\n", - "# get first index where humor_rating is greater than median_rating\n", - "median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]\n", - "print(median_idx)\n", - "# range idx for len df_test\n", - "range_idx = range(len(df_test))\n", - "colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})\n", - "# bar plot for each df_test humor_rating value \n", - "plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)\n", - "# vertical line for True/False cut off\n", - "plt.axvline(x=median_idx, color='black', linestyle='--')\n", - "# Create a legend handles\n", - "green_patch = mpatches.Patch(color='g', label='Correct Prediction')\n", - "red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')\n", - "line_patch = mpatches.Patch(color='black', label='humor_rating cut off')\n", - "plt.title('Humor Rating vs Prediction for Test Set')\n", - "plt.xlabel('Index')\n", - "plt.ylabel('Humor Rating')\n", - "plt.legend(handles=[green_patch, red_patch, line_patch])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# NOTE: \n", - "- model currently only predicts 0 therefore one site is green and other red\n", - "- plot can be helpfull to identify if around the cut off the model gets confused" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/transformer_reg.ipynb b/transformer_reg.ipynb new file mode 100644 
index 0000000..62ea12e --- /dev/null +++ b/transformer_reg.ipynb @@ -0,0 +1,584 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KuFFT6LrB6Fe" + }, + "outputs": [], + "source": [ + "import time\n", + "import json\n", + "import math\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import DataLoader\n", + "from torch.optim.lr_scheduler import ReduceLROnPlateau\n", + "\n", + "from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n", + "from sklearn.model_selection import KFold\n", + "# local imports\n", + "import ml_evaluation as ml_eval\n", + "import ml_helper\n", + "import ml_history\n", + "import dataset_generator as data_gen\n", + "# class imports\n", + "import HumorDataset as humor_ds\n", + "import EarlyStopping\n", + "import BalancedCELoss\n", + "\n", + "\n", + "# architecture inspired:\n", + "# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n", + "\n", + "# TODO: maybe KFold for cross validation?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] + } + ], + "source": [ + "torch.manual_seed(0)\n", + "np.random.seed(0)\n", + "\n", + "\n", + "best_model_filename = 'best_transformer_reg_model.pt'\n", + "\n", + "device = ml_helper.get_device(verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "400002\n", + "vocab_size: 400002, d_model: 100\n", + 
"vocab_size: 400002, d_model: 100\n" + ] + } + ], + "source": [ + "embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n", + "\n", + "vocab_size = len(embedding_matrix)\n", + "d_model = len(embedding_matrix[0])\n", + "vocab_size, d_model = embedding_matrix.size()\n", + "print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class PositionalEncoding(nn.Module):\n", + " \"\"\"\n", + " https://pytorch.org/tutorials/beginner/transformer_tutorial.html\n", + " \"\"\"\n", + "\n", + " def __init__(self, d_model, vocab_size=5000, dropout=0.1):\n", + " super().__init__()\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " pe = torch.zeros(vocab_size, d_model)\n", + " position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)\n", + " div_term = torch.exp(\n", + " torch.arange(0, d_model, 2).float()\n", + " * (-math.log(10000.0) / d_model)\n", + " )\n", + " pe[:, 0::2] = torch.sin(position * div_term)\n", + " pe[:, 1::2] = torch.cos(position * div_term)\n", + " pe = pe.unsqueeze(0)\n", + " self.register_buffer(\"pe\", pe)\n", + "\n", + " def forward(self, x):\n", + " x = x + self.pe[:, : x.size(1), :]\n", + " return self.dropout(x)\n", + "\n", + "\n", + "class TransformerBinaryClassifier(nn.Module):\n", + " \"\"\"\n", + " Text classifier based on a pytorch TransformerEncoder.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " embeddings,\n", + " nhead=8,\n", + " dim_feedforward=2048,\n", + " num_layers=6,\n", + " positional_dropout=0.1,\n", + " classifier_dropout=0.1,\n", + " activation=\"relu\",\n", + " ):\n", + "\n", + " super().__init__()\n", + "\n", + " vocab_size, d_model = embeddings.size()\n", + " assert d_model % nhead == 0, \"nheads must divide evenly into d_model\"\n", + "\n", + " self.emb 
= nn.Embedding.from_pretrained(embeddings, freeze=False)\n", + "\n", + " self.pos_encoder = PositionalEncoding(\n", + " d_model=d_model,\n", + " dropout=positional_dropout,\n", + " vocab_size=vocab_size,\n", + " )\n", + "\n", + " encoder_layer = nn.TransformerEncoderLayer(\n", + " d_model=d_model,\n", + " nhead=nhead,\n", + " dim_feedforward=dim_feedforward,\n", + " dropout=classifier_dropout,\n", + " )\n", + " self.transformer_encoder = nn.TransformerEncoder(\n", + " encoder_layer,\n", + " num_layers=num_layers,\n", + " )\n", + " # normalize to stabilize and stop overfitting\n", + " self.batch_norm = nn.BatchNorm1d(d_model)\n", + " self.classifier = nn.Linear(d_model, 1)\n", + " self.d_model = d_model\n", + " #self.softmax = nn.Softmax(dim=1)\n", + " #self.sigmoid = nn.Sigmoid()\n", + "\n", + " def forward(self, x):\n", + " x = self.emb(x) * math.sqrt(self.d_model)\n", + " x = self.pos_encoder(x)\n", + " x = self.transformer_encoder(x)\n", + " x = x.mean(dim=1)\n", + " # normalize to stabilize and stop overfitting\n", + " #x = self.batch_norm(x)\n", + "\n", + " #NOTE: no activation function for regression\n", + " # sigmoid would only distort the output\n", + " x = self.classifier(x)\n", + " \n", + " return x\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def load_preprocess_data(path_data='data/hack.csv'):\n", + " df = pd.read_csv(path_data)\n", + " df = df.dropna(subset=['humor_rating'])\n", + "\n", + " df['y'] = df['humor_rating']\n", + " X = df['text']\n", + " y = df['y']\n", + " return X, y" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train 3945 3945\n", + "test 494 494\n", + "val 493 493\n" + ] + } + ], + "source": [ + "X,y = load_preprocess_data()\n", + "\n", + "ret_dict = 
data_gen.split_data(X, y)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set hyper params" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model created\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "params = {\n", + " # used for class balancing\n", + " 'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)\n", + " # training parameters\n", + " 'batch_size': 32, # 32 (16 to 64)\n", + " 'epochs': 100, # 100\n", + " 'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)\n", + " \n", + " # NOTE: used for gradient clipping (needed for lstm and transformer)\n", + " # use 0 to disable\n", + " 'clipping_max_norm': 0, # 0 (0.5 to 2.0)\n", + " \n", + " # patience for early stopping\n", + " 'early_stopping_patience': 5, # 5 (3 to 10)\n", + "\n", + " # learning rate scheduler\n", + " 'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n", + " 'lr_scheduler_patience': 3, # 3 (2 to 5)\n", + "\n", + " # model parameters\n", + " 'nhead': 2, # 5\n", + " 'num_layers': 3, # 6\n", + " 'hidden_dim': 10, # 50\n", + "\n", + " # regularization parameters\n", + " 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n", + " 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n", + " 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n", + "}\n", + "\n", + "# Model initialization\n", + "model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n", + " nhead=params['nhead'], \n", + " num_layers=params['num_layers'], \n", + " dim_feedforward=params['hidden_dim'],\n", + " 
positional_dropout=params['positional_dropout'],\n", + " classifier_dropout=params['classifier_dropout']\n", + " )\n", + "model.to(device)\n", + "print('model created')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### create datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "datasets length: 3945 493\n", + "train: 124, val: 16, test: 16\n" + ] + } + ], + "source": [ + "# NOTE: Info comes from data explore notebook: 280 is max length,\n", + "# 139 contains 80% and 192 contains 95% of the data\n", + "max_len = 280\n", + "\n", + "train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n", + "val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n", + "test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n", + "\n", + "print('datasets length:', len(train_dataset), len(val_dataset))\n", + "#NOTE: overfitting test\n", + "#train_dataset.labels = train_dataset.labels[:100]\n", + "#train_dataset.texts = train_dataset.texts[:100]\n", + "\n", + "train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n", + "val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n", + "test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n", + "\n", + "# NOTE: samller because of batches not all data\n", + "print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set training requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#TODO: change to RMSE\n", + "\"\"\"\n", + "criterion = nn.MSELoss()\n", + 
"loss = torch.sqrt(criterion(x, y))\n", + "loss.backward()\n", + "print(x.grad)\n", + "\"\"\"\n", + "criterion = nn.MSELoss()\n", + "\n", + "optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n", + " lr=params['lr']) #, \n", + " #weight_decay=params['weight_decay'])\n", + "\"\"\"\n", + "scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n", + " factor=params['lr_scheduler_factor'],\n", + " patience=params['lr_scheduler_patience'],\n", + " verbose=True)\n", + "\"\"\"\n", + "early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training loop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s\n", + "Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s\n", + "Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s\n", + "Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n", + "Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s\n", + "Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s\n", + "Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s\n", + "Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s\n", + "Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s\n", + "Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s\n", + "Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s\n", + "Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s\n", + "Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s\n", + "Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s\n", + "Epoch 16/100, Train Loss: 1.8097, Val Loss: 
1.8873, Time: 2.28s\n", + "Epoch 17/100, Train Loss: 1.8081, Val Loss: 1.8873, Time: 2.44s\n", + "Epoch 18/100, Train Loss: 1.8078, Val Loss: 1.8873, Time: 2.17s\n", + "Epoch 19/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.15s\n", + "Epoch 20/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 21/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 22/100, Train Loss: 1.8103, Val Loss: 1.8873, Time: 2.09s\n", + "Epoch 23/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.16s\n", + "Epoch 24/100, Train Loss: 1.8034, Val Loss: 1.8873, Time: 2.24s\n", + "Epoch 25/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.46s\n", + "Epoch 26/100, Train Loss: 1.8084, Val Loss: 1.8873, Time: 2.38s\n", + "Epoch 27/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.35s\n", + "Epoch 28/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.15s\n", + "Epoch 29/100, Train Loss: 1.8136, Val Loss: 1.8873, Time: 2.24s\n", + "Epoch 30/100, Train Loss: 1.8051, Val Loss: 1.8873, Time: 2.28s\n", + "Epoch 31/100, Train Loss: 1.8026, Val Loss: 1.8873, Time: 2.19s\n", + "Epoch 32/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.16s\n", + "Epoch 33/100, Train Loss: 1.8121, Val Loss: 1.8873, Time: 2.13s\n", + "Epoch 34/100, Train Loss: 1.8098, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 35/100, Train Loss: 1.8036, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 36/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.19s\n", + "Epoch 37/100, Train Loss: 1.8108, Val Loss: 1.8873, Time: 2.50s\n", + "Epoch 38/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.45s\n", + "Epoch 39/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.38s\n", + "Epoch 40/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.22s\n", + "Epoch 41/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.26s\n", + "Epoch 42/100, Train Loss: 1.8088, Val Loss: 1.8873, Time: 2.30s\n", + "Epoch 43/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.28s\n", + "Epoch 44/100, Train Loss: 1.8029, Val Loss: 1.8873, 
Time: 2.14s\n", + "Epoch 45/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.14s\n", + "Epoch 46/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.22s\n", + "Epoch 47/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.19s\n", + "Epoch 48/100, Train Loss: 1.8069, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 49/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.22s\n", + "Epoch 50/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 51/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.17s\n", + "Epoch 52/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.08s\n", + "Epoch 53/100, Train Loss: 1.8075, Val Loss: 1.8873, Time: 2.00s\n", + "Epoch 54/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.12s\n", + "Epoch 55/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.02s\n", + "Epoch 56/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.17s\n", + "Epoch 57/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.34s\n", + "Epoch 58/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.17s\n", + "Epoch 59/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.11s\n", + "Epoch 60/100, Train Loss: 1.8100, Val Loss: 1.8873, Time: 2.05s\n", + "Epoch 61/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.08s\n", + "Epoch 62/100, Train Loss: 1.8068, Val Loss: 1.8873, Time: 2.22s\n", + "Epoch 63/100, Train Loss: 1.8012, Val Loss: 1.8873, Time: 2.32s\n", + "Epoch 64/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.35s\n", + "Epoch 65/100, Train Loss: 1.8109, Val Loss: 1.8873, Time: 2.36s\n", + "Epoch 66/100, Train Loss: 1.8030, Val Loss: 1.8873, Time: 2.28s\n", + "Epoch 67/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.24s\n", + "Epoch 68/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.20s\n", + "Epoch 69/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.18s\n", + "Epoch 70/100, Train Loss: 1.8019, Val Loss: 1.8873, Time: 2.15s\n", + "Epoch 71/100, Train Loss: 1.8025, Val Loss: 1.8873, Time: 2.19s\n", + "Epoch 72/100, Train Loss: 1.8124, Val Loss: 1.8873, Time: 
# Training loop
#
# NOTE(review): `early_stopping` is constructed in the setup cell but is never
# consulted in this loop. Its call signature is not visible from this notebook,
# so it is deliberately not wired in here -- TODO confirm the API and add the
# check after the validation pass.
# NOTE(review): the output recorded with this cell shows the same validation
# loss (1.8873) for every epoch, i.e. the eval-mode predictions apparently
# never changed. Worth investigating before trusting any results.


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids)``.
        loader: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. The weighting
            below assumes it averages over elements, which is the default
            behaviour of the ``nn.MSELoss()`` used in this notebook.
        device: Device the batch tensors are moved to.
        optimizer: If given, the pass trains the model (gradients enabled,
            one optimizer step per batch). If ``None``, the pass is a pure
            evaluation in eval mode with gradients disabled.

    Returns:
        float: Loss averaged over all label elements seen in the pass, or
        ``nan`` if the loader produced no batches.
    """
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    n_elements = 0

    with torch.set_grad_enabled(is_training):
        for input_ids, labels in loader:
            input_ids = input_ids.to(device)
            labels = labels.to(device).float()

            if is_training:
                optimizer.zero_grad()

            # Reshape to the label shape instead of a bare ``squeeze()``:
            # ``squeeze()`` would also drop the batch dimension for a batch of
            # one sample and can lead to silent broadcasting inside the loss.
            # A genuine size mismatch raises a RuntimeError here instead.
            outputs = model(input_ids).float().reshape(labels.shape)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])
                optimizer.step()

            # Weight each batch by its element count so that a smaller final
            # batch does not get the same weight as a full one.
            batch_elements = labels.numel()
            loss_sum += loss.item() * batch_elements
            n_elements += batch_elements

    return loss_sum / n_elements if n_elements else float('nan')


for epoch in range(params['epochs']):
    epoch_start_time = time.time()

    train_loss = _run_epoch(model, train_loader, criterion, device, optimizer=optimizer)

    # Validation
    val_loss = _run_epoch(model, val_loader, criterion, device)

    epoch_end_time = time.time()

    print(f'Epoch {epoch+1}/{params["epochs"]}, '
          f'Train Loss: {train_loss:.4f}, '
          f'Val Loss: {val_loss:.4f}, '
          f'Time: {epoch_end_time - epoch_start_time:.2f}s')