updated transformer models

main
Felix Jan Michael Mucha 2025-02-09 15:31:47 +01:00
parent e1e9ac57ba
commit 394167488f
5 changed files with 1324 additions and 853 deletions

740
transformer.ipynb 100644

File diff suppressed because one or more lines are too long

View File

@@ -1,240 +0,0 @@
"""
This file contains the transformer model.
"""
# TODO refactor the code
# TODO create ml helper script
# TODO create ml evaluation script
# TODO track overfitting better
# TODO validate the model during training (accuracy, loss, etc.)
# TODO set the maximum length to a constant close to the longest sentence
# TODO use GloVe embeddings
# TODO add attention mask
# TODO add positional encoding
# TODO add dropout (if needed)
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader
from torch.optim import AdamW  # torch's AdamW (transformers.AdamW is deprecated)
from sklearn.metrics import accuracy_score
import gensim
import time
# Disable the torchvision beta transforms warning
import torchvision
torchvision.disable_beta_transforms_warning()
def get_device(verbose=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if verbose:
        print('Using device:', device)
    return device
# Select the GPU if available
DEVICE = get_device(verbose=True)
# Input maximum length
MAX_LEN = 100
# download nltk data
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
def get_embedding(model, word):
    # Returns the vocabulary index of the word (not its vector); OOV words map to the <UNK> index
    if word in model.wv:
        return model.wv.key_to_index[word]
    else:
        return unk_index

def encode_tokens(tokens):
    return [get_embedding(model_embedding, token) for token in tokens]

def pad_sequences(sequences, MAX_LEN):
    # Pad short sequences with the <UNK> index and truncate long ones to MAX_LEN
    return np.array([np.pad(seq, (0, MAX_LEN - len(seq)), mode='constant', constant_values=unk_index) if len(seq) < MAX_LEN else seq[:MAX_LEN] for seq in sequences])
class HumorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)
class TransformerBinaryClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout);
        # batch_first=True because the inputs below are (batch, seq_len, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout, batch_first=True)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        input_ids = input_ids.long()
        embedded = self.embedding(input_ids)
        # the same embeddings are passed as source and target (encoder-decoder over one sequence)
        transformer_output = self.transformer(embedded, embedded)
        # mean-pool over the sequence dimension
        pooled_output = transformer_output.mean(dim=1)
        logits = self.fc(pooled_output)
        # returns probabilities, so pair with nn.BCELoss (not BCEWithLogitsLoss)
        return self.sigmoid(logits)
if __name__ == "__main__":
# Load the data from csv
df = pd.read_csv('data/hack.csv')
print(df.shape)
# transform the data into features and labels
X = df['text']
y = df['is_humor']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the data with nltk
train_tokens = [word_tokenize(text.lower()) for text in X_train]
test_tokens = [word_tokenize(text.lower()) for text in X_test]
# Embed the data with word2vec
model_embedding = gensim.models.Word2Vec(train_tokens, window=5, min_count=1, workers=4)
# Add a special token for out-of-vocabulary words
model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
unk_index = model_embedding.wv.key_to_index['<UNK>']
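# NOTE: only the Word2Vec vocabulary indices are used from here on; the trained vectors are
# never copied into the model's nn.Embedding below, which is randomly initialized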
# Encode the tokens
train_encodings = [encode_tokens(tokens) for tokens in train_tokens]
test_encodings = [encode_tokens(tokens) for tokens in test_tokens]
# Define the maximum sequence length
train_encodings = pad_sequences(train_encodings, MAX_LEN)
test_encodings = pad_sequences(test_encodings, MAX_LEN)
train_dataset = HumorDataset(train_encodings, y_train.reset_index(drop=True))
test_dataset = HumorDataset(test_encodings, y_test.reset_index(drop=True))
vocab_size = len(model_embedding.wv.key_to_index)
embed_dim = model_embedding.vector_size
num_heads = 2
num_layers = 2
hidden_dim = 256
print(f"Vocabulary size: {vocab_size}")
print(f"Embedding dimension: {embed_dim}")
model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)
# Training parameters
epochs = 30 #3
batch_size = 8
learning_rate = 2e-5
# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()  # the model's forward already applies a sigmoid
# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
for td in train_dataset:
print(td['input_ids'].shape)
print(td['labels'])
break
for batch in train_loader:
print(batch['input_ids'].shape)
print(batch['labels'])
break
# Model to device
model.to(DEVICE)
print("Starting training...")
start_training_time = time.time()
losses = []
# Training loop
model.train()
for epoch in range(epochs):
epoch_start_time = time.time()
batch_losses = []
for batch in train_loader:
optimizer.zero_grad()
input_ids = batch['input_ids'].to(DEVICE)
labels = batch['labels'].unsqueeze(1).to(DEVICE)
outputs = model(input_ids)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
batch_losses.append(loss.item())
losses.append(np.mean(batch_losses))
epoch_end_time = time.time()
print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {losses[-1]:.5f}")
end_training_time = time.time()
print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")
print("Starting evaluation...")
# Evaluation
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(DEVICE)
labels = batch['labels'].unsqueeze(1).to(DEVICE)
outputs = model(input_ids)
preds = outputs.round()
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
# Save the model
timestamp = time.strftime("%Y%m%d-%H%M%S")
torch.save(model.state_dict(), f'models/transformer_acc_{accuracy}_{timestamp}.pth')
print("Model saved.")
# Save model hyperparameters as json
hyperparameters = {
'max_len': MAX_LEN,
'vocab_size': vocab_size,
'embed_dim': embed_dim,
'num_heads': num_heads,
'num_layers': num_layers,
'hidden_dim': hidden_dim,
'epochs': epochs,
'batch_size': batch_size,
'learning_rate': learning_rate,
'accuracy': accuracy
}
pd.DataFrame(hyperparameters, index=[0]).to_json(f'models/transformer_acc_{accuracy}_{timestamp}.json')
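
Note on the loss pairing above: torch's BCEWithLogitsLoss applies a sigmoid internally and therefore expects raw logits, whereas a forward pass that already ends in nn.Sigmoid should be paired with plain nn.BCELoss. A minimal, self-contained illustration (toy tensors, not the project's data) showing that the two consistent pairings produce the same loss:

import torch
import torch.nn as nn

logits = torch.tensor([[0.3], [-1.2]])   # raw model outputs (toy values)
labels = torch.tensor([[1.0], [0.0]])

# Option A: the model returns raw logits and BCEWithLogitsLoss applies the sigmoid itself
loss_a = nn.BCEWithLogitsLoss()(logits, labels)

# Option B: the model applies nn.Sigmoid in forward (as above) and is paired with plain BCELoss
loss_b = nn.BCELoss()(torch.sigmoid(logits), labels)

print(loss_a.item(), loss_b.item())      # the two values match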

View File

@@ -1,199 +0,0 @@
"""
This file contains the transformer model.
"""
# TODO refactor the code
# TODO create ml helper script
# TODO create ml evaluation script
# TODO track overfitting better
# TODO validate the model during training (accuracy, loss, etc.)
# TODO set the maximum length to a constant close to the longest sentence
# TODO use GloVe embeddings
# TODO add attention mask
# TODO add positional encoding
# TODO add dropout (if needed)
import time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim import AdamW  # torch's AdamW (transformers.AdamW is deprecated)
from sklearn.metrics import accuracy_score
import ml_helper
import ml_history
class TransformerBinaryClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout);
        # batch_first=True because the inputs below are (batch, seq_len, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout, batch_first=True)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        input_ids = input_ids.long()
        embedded = self.embedding(input_ids)
        # the same embeddings are passed as source and target (encoder-decoder over one sequence)
        transformer_output = self.transformer(embedded, embedded)
        # mean-pool over the sequence dimension
        pooled_output = transformer_output.mean(dim=1)
        logits = self.fc(pooled_output)
        # returns probabilities, so pair with nn.BCELoss (not BCEWithLogitsLoss)
        return self.sigmoid(logits)
if __name__ == "__main__":
# Load the data
data_path = 'data/idx_based_padded'
train_dataset = torch.load(data_path + '/train.pt')
test_dataset = torch.load(data_path + '/test.pt')
val_dataset = torch.load(data_path + '/val.pt')
# +2 for padding and unk tokens
vocab_size = train_dataset.vocab_size + 2
embed_dim = 100 #train_dataset.emb_dim
# NOTE: Info comes from data explore notebook: 280 is max length,
# 139 contains 80% and 192 contains 95% of the data
max_len = 280
device = ml_helper.get_device(verbose=True)
# Model hyperparameters
num_heads = 2
num_layers = 2
hidden_dim = 256
model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)
# Training parameters
epochs = 3 #3
batch_size = 8
learning_rate = 2e-5
# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()  # the model's forward already applies a sigmoid
# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
################################################################################################
# Training
################################################################################################
# Initialize the history
history = ml_history.History()
# Model to device
model.to(device)
print("Starting training...")
start_training_time = time.time()
# Training loop
model.train()
for epoch in range(epochs):
# init batch tracking
epoch_start_time = time.time()
history.batch_reset()
for batch in train_loader:
optimizer.zero_grad()
# prepare batch
input_ids = batch['input_ids'].to(device)
labels = batch['labels'].unsqueeze(1).to(device)
# forward pass
outputs = model(input_ids)
loss = criterion(outputs, labels)
# backward pass
loss.backward()
optimizer.step()
# calculate accuracy train
preds = outputs.round()
train_acc = accuracy_score(labels.cpu().detach().numpy(),
preds.cpu().detach().numpy())
# update batch history
history.batch_update_train(loss.item(), train_acc)
# calculate accuracy val
model.eval()
with torch.no_grad():
for val_batch in val_loader:
val_input_ids = val_batch['input_ids'].to(device)
val_labels_batch = val_batch['labels'].unsqueeze(1).to(device)
val_outputs = model(val_input_ids)
val_acc = accuracy_score(val_outputs.round().cpu().numpy(),
val_labels_batch.cpu().numpy())
history.batch_update_val(val_acc)
model.train()
# update epoch history
history.update()
epoch_end_time = time.time()
print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}")
end_training_time = time.time()
print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")
################################################################################################
# Evaluation
################################################################################################
print("Starting evaluation...")
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(device)
labels = batch['labels'].unsqueeze(1).to(device)
outputs = model(input_ids)
preds = outputs.round()
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
################################################################################################
# Save model and hyperparameters
################################################################################################
timestamp = time.strftime("%Y%m%d-%H%M%S")
ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp,
max_len=max_len,
vocab_size=vocab_size,
embed_dim=embed_dim,
num_heads=num_heads,
num_layers=num_layers,
hidden_dim=hidden_dim,
epochs=epochs,
batch_size=batch_size,
learning_rate=learning_rate)
# save the training history
history_path = f'models/transformer_history_{timestamp}.json'
with open(history_path, 'w') as f:
json.dump(history.get_history(), f)
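
The ml_history module imported above is not included in this diff; the sketch below reconstructs a History class that would satisfy the calls made in the training loop (batch_reset, batch_update_train, batch_update_val, update, get_history). It is an assumption inferred from usage, not the module's actual implementation.

import numpy as np

class History:
    # hypothetical stand-in for ml_history.History, inferred from how it is called above
    def __init__(self):
        self.history = {'loss': [], 'train_acc': [], 'val_acc': []}
        self.batch_reset()

    def batch_reset(self):
        # per-epoch buffers, cleared at the start of every epoch
        self._losses, self._train_accs, self._val_accs = [], [], []

    def batch_update_train(self, loss, acc):
        self._losses.append(loss)
        self._train_accs.append(acc)

    def batch_update_val(self, acc):
        self._val_accs.append(acc)

    def update(self):
        # collapse the epoch's batch values into one entry per metric
        self.history['loss'].append(float(np.mean(self._losses)))
        self.history['train_acc'].append(float(np.mean(self._train_accs)))
        self.history['val_acc'].append(float(np.mean(self._val_accs)))

    def get_history(self):
        return self.history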

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,584 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KuFFT6LrB6Fe"
},
"outputs": [],
"source": [
"import time\n",
"import json\n",
"import math\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import DataLoader\n",
"from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
"\n",
"from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n",
"from sklearn.model_selection import KFold\n",
"# local imports\n",
"import ml_evaluation as ml_eval\n",
"import ml_helper\n",
"import ml_history\n",
"import dataset_generator as data_gen\n",
"# class imports\n",
"import HumorDataset as humor_ds\n",
"import EarlyStopping\n",
"import BalancedCELoss\n",
"\n",
"\n",
"# architecture inspired:\n",
"# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n",
"\n",
"# TODO: maybe KFold for cross validation?\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
}
],
"source": [
"torch.manual_seed(0)\n",
"np.random.seed(0)\n",
"\n",
"\n",
"best_model_filename = 'best_transformer_reg_model.pt'\n",
"\n",
"device = ml_helper.get_device(verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"400002\n",
"vocab_size: 400002, d_model: 100\n",
"vocab_size: 400002, d_model: 100\n"
]
}
],
"source": [
"embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n",
"\n",
"vocab_size = len(embedding_matrix)\n",
"d_model = len(embedding_matrix[0])\n",
"vocab_size, d_model = embedding_matrix.size()\n",
"print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class PositionalEncoding(nn.Module):\n",
" \"\"\"\n",
" https://pytorch.org/tutorials/beginner/transformer_tutorial.html\n",
" \"\"\"\n",
"\n",
" def __init__(self, d_model, vocab_size=5000, dropout=0.1):\n",
" super().__init__()\n",
" self.dropout = nn.Dropout(p=dropout)\n",
"\n",
" pe = torch.zeros(vocab_size, d_model)\n",
" position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)\n",
" div_term = torch.exp(\n",
" torch.arange(0, d_model, 2).float()\n",
" * (-math.log(10000.0) / d_model)\n",
" )\n",
" pe[:, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 1::2] = torch.cos(position * div_term)\n",
" pe = pe.unsqueeze(0)\n",
" self.register_buffer(\"pe\", pe)\n",
"\n",
" def forward(self, x):\n",
" x = x + self.pe[:, : x.size(1), :]\n",
" return self.dropout(x)\n",
"\n",
"\n",
"class TransformerBinaryClassifier(nn.Module):\n",
" \"\"\"\n",
" Text classifier based on a pytorch TransformerEncoder.\n",
" \"\"\"\n",
"\n",
" def __init__(\n",
" self,\n",
" embeddings,\n",
" nhead=8,\n",
" dim_feedforward=2048,\n",
" num_layers=6,\n",
" positional_dropout=0.1,\n",
" classifier_dropout=0.1,\n",
" activation=\"relu\",\n",
" ):\n",
"\n",
" super().__init__()\n",
"\n",
" vocab_size, d_model = embeddings.size()\n",
" assert d_model % nhead == 0, \"nheads must divide evenly into d_model\"\n",
"\n",
" self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)\n",
"\n",
" self.pos_encoder = PositionalEncoding(\n",
" d_model=d_model,\n",
" dropout=positional_dropout,\n",
" vocab_size=vocab_size,\n",
" )\n",
"\n",
" encoder_layer = nn.TransformerEncoderLayer(\n",
" d_model=d_model,\n",
" nhead=nhead,\n",
" dim_feedforward=dim_feedforward,\n",
" dropout=classifier_dropout,\n",
" )\n",
" self.transformer_encoder = nn.TransformerEncoder(\n",
" encoder_layer,\n",
" num_layers=num_layers,\n",
" )\n",
" # normalize to stabilize and stop overfitting\n",
" self.batch_norm = nn.BatchNorm1d(d_model)\n",
" self.classifier = nn.Linear(d_model, 1)\n",
" self.d_model = d_model\n",
" #self.softmax = nn.Softmax(dim=1)\n",
" #self.sigmoid = nn.Sigmoid()\n",
"\n",
" def forward(self, x):\n",
" x = self.emb(x) * math.sqrt(self.d_model)\n",
" x = self.pos_encoder(x)\n",
" x = self.transformer_encoder(x)\n",
" x = x.mean(dim=1)\n",
" # normalize to stabilize and stop overfitting\n",
" #x = self.batch_norm(x)\n",
"\n",
" #NOTE: no activation function for regression\n",
" # sigmoid would only distort the output\n",
" x = self.classifier(x)\n",
" \n",
" return x\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def load_preprocess_data(path_data='data/hack.csv'):\n",
" df = pd.read_csv(path_data)\n",
" df = df.dropna(subset=['humor_rating'])\n",
"\n",
" df['y'] = df['humor_rating']\n",
" X = df['text']\n",
" y = df['y']\n",
" return X, y"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train 3945 3945\n",
"test 494 494\n",
"val 493 493\n"
]
}
],
"source": [
"X,y = load_preprocess_data()\n",
"\n",
"ret_dict = data_gen.split_data(X, y)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set hyper params"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model created\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
" warnings.warn(\n"
]
}
],
"source": [
"params = {\n",
" # used for class balancing\n",
" 'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)\n",
" # training parameters\n",
" 'batch_size': 32, # 32 (16 to 64)\n",
" 'epochs': 100, # 100\n",
" 'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)\n",
" \n",
" # NOTE: used for gradient clipping (needed for lstm and transformer)\n",
" # use 0 to disable\n",
" 'clipping_max_norm': 0, # 0 (0.5 to 2.0)\n",
" \n",
" # patience for early stopping\n",
" 'early_stopping_patience': 5, # 5 (3 to 10)\n",
"\n",
" # learning rate scheduler\n",
" 'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n",
" 'lr_scheduler_patience': 3, # 3 (2 to 5)\n",
"\n",
" # model parameters\n",
" 'nhead': 2, # 5\n",
" 'num_layers': 3, # 6\n",
" 'hidden_dim': 10, # 50\n",
"\n",
" # regularization parameters\n",
" 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n",
"}\n",
"\n",
"# Model initialization\n",
"model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n",
" nhead=params['nhead'], \n",
" num_layers=params['num_layers'], \n",
" dim_feedforward=params['hidden_dim'],\n",
" positional_dropout=params['positional_dropout'],\n",
" classifier_dropout=params['classifier_dropout']\n",
" )\n",
"model.to(device)\n",
"print('model created')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### create datasets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"datasets length: 3945 493\n",
"train: 124, val: 16, test: 16\n"
]
}
],
"source": [
"# NOTE: Info comes from data explore notebook: 280 is max length,\n",
"# 139 contains 80% and 192 contains 95% of the data\n",
"max_len = 280\n",
"\n",
"train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n",
"val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n",
"test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n",
"\n",
"print('datasets length:', len(train_dataset), len(val_dataset))\n",
"#NOTE: overfitting test\n",
"#train_dataset.labels = train_dataset.labels[:100]\n",
"#train_dataset.texts = train_dataset.texts[:100]\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n",
"val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"\n",
"# NOTE: samller because of batches not all data\n",
"print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set training requirements"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#TODO: change to RMSE\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"loss = torch.sqrt(criterion(x, y))\n",
"loss.backward()\n",
"print(x.grad)\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"\n",
"optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n",
" lr=params['lr']) #, \n",
" #weight_decay=params['weight_decay'])\n",
"\"\"\"\n",
"scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n",
" factor=params['lr_scheduler_factor'],\n",
" patience=params['lr_scheduler_patience'],\n",
" verbose=True)\n",
"\"\"\"\n",
"early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training loop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s\n",
"Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s\n",
"Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s\n",
"Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 16/100, Train Loss: 1.8097, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 17/100, Train Loss: 1.8081, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 18/100, Train Loss: 1.8078, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 19/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 20/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 21/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 22/100, Train Loss: 1.8103, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 23/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 24/100, Train Loss: 1.8034, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 25/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.46s\n",
"Epoch 26/100, Train Loss: 1.8084, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 27/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 28/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 29/100, Train Loss: 1.8136, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 30/100, Train Loss: 1.8051, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 31/100, Train Loss: 1.8026, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 32/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 33/100, Train Loss: 1.8121, Val Loss: 1.8873, Time: 2.13s\n",
"Epoch 34/100, Train Loss: 1.8098, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 35/100, Train Loss: 1.8036, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 36/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 37/100, Train Loss: 1.8108, Val Loss: 1.8873, Time: 2.50s\n",
"Epoch 38/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.45s\n",
"Epoch 39/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 40/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 41/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 42/100, Train Loss: 1.8088, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 43/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 44/100, Train Loss: 1.8029, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 45/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 46/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 47/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 48/100, Train Loss: 1.8069, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 49/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 50/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 51/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 52/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 53/100, Train Loss: 1.8075, Val Loss: 1.8873, Time: 2.00s\n",
"Epoch 54/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 55/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.02s\n",
"Epoch 56/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 57/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 58/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 59/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 60/100, Train Loss: 1.8100, Val Loss: 1.8873, Time: 2.05s\n",
"Epoch 61/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 62/100, Train Loss: 1.8068, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 63/100, Train Loss: 1.8012, Val Loss: 1.8873, Time: 2.32s\n",
"Epoch 64/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 65/100, Train Loss: 1.8109, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 66/100, Train Loss: 1.8030, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 67/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 68/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 69/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 70/100, Train Loss: 1.8019, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 71/100, Train Loss: 1.8025, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 72/100, Train Loss: 1.8124, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 73/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 74/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 75/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 76/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 77/100, Train Loss: 1.8141, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 78/100, Train Loss: 1.8092, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 79/100, Train Loss: 1.8106, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 80/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 81/100, Train Loss: 1.8142, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 82/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 83/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 84/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 85/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 86/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 87/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 88/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 89/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 90/100, Train Loss: 1.8047, Val Loss: 1.8873, Time: 2.42s\n",
"Epoch 91/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 92/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.37s\n",
"Epoch 93/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 94/100, Train Loss: 1.8031, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 95/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.07s\n",
"Epoch 96/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.20s\n"
]
}
],
"source": [
"# Training loop\n",
"\n",
"for epoch in range(params['epochs']):\n",
" epoch_start_time = time.time()\n",
" model.train()\n",
" \n",
" train_loss = 0.0\n",
" \n",
" for batch in train_loader:\n",
" optimizer.zero_grad()\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
"\n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])\n",
" optimizer.step()\n",
" preds = outputs\n",
" \n",
" train_loss += loss.item()\n",
"\n",
" train_loss /= len(train_loader)\n",
" \n",
" # Validation\n",
" model.eval()\n",
" val_loss = 0.0\n",
" \n",
" with torch.no_grad():\n",
" for batch in val_loader:\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" preds = outputs\n",
" \n",
" val_loss += loss.item()\n",
"\n",
" val_loss /= len(val_loader)\n",
" \n",
" epoch_end_time = time.time()\n",
" \n",
" print(f'Epoch {epoch+1}/{params[\"epochs\"]}, '\n",
" f'Train Loss: {train_loss:.4f}, '\n",
" f'Val Loss: {val_loss:.4f}, '\n",
" f'Time: {epoch_end_time - epoch_start_time:.2f}s')\n",
"\n",
" "
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}