# ANLP_WS24_CA2/transformer_bootstrap_agg.py

import time
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
# local imports
import ml_evaluation as ml_eval
import ml_helper
import ml_history
import dataset_generator as data_gen
# class imports
import HumorDataset as humor_ds
import EarlyStopping
import BalancedCELoss
torch.manual_seed(0)
np.random.seed(0)
best_model_filename = 'best_transformer_reg_model.pt'
device = ml_helper.get_device(verbose=True)
embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()
# Derive the dimensions directly from the embedding tensor shape.
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)
class TransformerBinaryClassifier(nn.Module):
    # Despite the class name, this script uses the single scalar output as a
    # regression head for the continuous humor_rating target (MSE loss below).
    def __init__(
        self,
        embeddings,
        nhead=8,
        dim_feedforward=2048,
        num_layers=6,
        positional_dropout=0.1,
        classifier_dropout=0.1,
        activation="relu",
    ):
        super().__init__()
        vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nhead must divide evenly into d_model"
        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=positional_dropout,
            vocab_size=vocab_size,
        )
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=classifier_dropout,
            activation=activation,
            batch_first=True,  # inputs arrive as (batch, seq_len, d_model)
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        self.batch_norm = nn.BatchNorm1d(d_model)  # defined but not used in forward()
        self.classifier = nn.Linear(d_model, 1)
        self.d_model = d_model

    def forward(self, x):
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # mean-pool over the sequence dimension
        x = self.classifier(x)
        return x
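# Minimal shape sanity check (illustrative only, not part of the training pipeline):
#   model = TransformerBinaryClassifier(embedding_matrix, nhead=2, num_layers=1)
#   dummy = torch.randint(0, vocab_size, (4, 32))   # (batch_size, seq_len) of token ids
#   model(dummy).shape                              # -> torch.Size([4, 1])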
def load_preprocess_data(path_data='data/hack.csv'):
    df = pd.read_csv(path_data)
    df = df.dropna(subset=['humor_rating'])
    df['y'] = df['humor_rating']
    X = df['text']
    y = df['y']
    return X, y
X, y = load_preprocess_data()
ret_dict = data_gen.split_data(X, y)
params = {
    'equalize_classes_loss_factor': 0.15,
    'batch_size': 32,
    'epochs': 2,
    'lr': 1e-4,
    'clipping_max_norm': 0,
    'early_stopping_patience': 5,
    'lr_scheduler_factor': 0.5,
    'lr_scheduler_patience': 3,
    'nhead': 2,
    'num_layers': 3,
    'hidden_dim': 10,
    'positional_dropout': 0.5,
    'classifier_dropout': 0.5,
    'weight_decay': 1e-2
}
max_len = 280
train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)
val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)
test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)
early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)
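# NOTE: this EarlyStopping instance is never passed into train_model below, so
# 'early_stopping_patience' currently has no effect on training.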
def train_model(model, train_dataset, criterion, optimizer, epochs, batch_size):
    # NOTE: validation uses the module-level val_loader defined above.
    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model.to(device)
    # Store for plotting
    train_losses, val_losses = [], []
    train_r2_scores, val_r2_scores = [], []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_targets = [], []
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            all_preds.extend(outputs.detach().cpu().numpy())
            all_targets.extend(targets.detach().cpu().numpy())
        # Calculate R2 on this epoch's training predictions
        r2 = r2_score(all_targets, all_preds)
        train_losses.append(total_loss / len(dataloader))
        train_r2_scores.append(r2)
        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_targets = [], []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, targets.float())
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())
        # Calculate Validation R2
        val_r2 = r2_score(val_targets, val_preds)
        val_losses.append(val_loss / len(val_loader))
        val_r2_scores.append(val_r2)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}, R^2 (Train): {r2:.4f}, Val R^2: {val_r2:.4f}")
    return train_losses, val_losses, train_r2_scores, val_r2_scores
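# "Bootstrap" aggregation here does not resample with replacement: each ensemble
# member is trained on the training set minus one contiguous fold of size
# len(train_dataset) // num_models, i.e. a leave-one-fold-out bagging variant.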
def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
    models = []
    all_train_losses, all_val_losses = [], []
    all_train_r2_scores, all_val_r2_scores = [], []
    subset_size = len(train_dataset) // num_models
    for i in range(num_models):
        print(f"Training Model {i + 1}/{num_models}...")
        start_idx = i * subset_size
        end_idx = start_idx + subset_size
        subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
        subset = Subset(train_dataset, subset_indices)
        model = ModelClass()
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        train_losses, val_losses, train_r2_scores, val_r2_scores = train_model(model, subset, criterion, optimizer, epochs, batch_size)
        models.append(model)
        all_train_losses.append(train_losses)
        all_val_losses.append(val_losses)
        all_train_r2_scores.append(train_r2_scores)
        all_val_r2_scores.append(val_r2_scores)
    return models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores
# Ensemble Prediction
def ensemble_predict(models, test_dataset):
    dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    all_predictions = []
    for model in models:
        model.eval()  # disable dropout for deterministic inference
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            # average the predictions of all ensemble members
            predictions = torch.stack([model(inputs).squeeze() for model in models])
            avg_predictions = predictions.mean(dim=0)
            all_predictions.extend(avg_predictions.cpu().numpy())
    return np.array(all_predictions)
# Bootstrap Aggregating
num_models = 2
ensemble_models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores = bootstrap_aggregation(
    lambda: TransformerBinaryClassifier(
        embeddings=embedding_matrix,
        nhead=params['nhead'],
        num_layers=params['num_layers'],
        dim_feedforward=params['hidden_dim'],
        positional_dropout=params['positional_dropout'],
        classifier_dropout=params['classifier_dropout']
    ).to(device),
    train_dataset,
    num_models=num_models,
    epochs=params['epochs'],
    batch_size=params['batch_size'],
    learning_rate=params['lr']
)
# Ensemble Prediction on Testset
ensemble_predictions = ensemble_predict(ensemble_models, test_dataset)
# Plotting
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# Plot Train and Validation Losses
for i in range(num_models):
    ax1.plot(range(1, params['epochs'] + 1), all_train_losses[i], label=f"Train Model {i+1}")
    ax1.plot(range(1, params['epochs'] + 1), all_val_losses[i], label=f"Val Model {i+1}")
ax1.set_title('Train and Validation Loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()
# Plot Train and Validation R²
for i in range(num_models):
    ax2.plot(range(1, params['epochs'] + 1), all_train_r2_scores[i], label=f"Train Model {i+1}")
    ax2.plot(range(1, params['epochs'] + 1), all_val_r2_scores[i], label=f"Val Model {i+1}")
ax2.set_title('Train and Validation R²')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('R²')
ax2.legend()
plt.tight_layout()
plt.show()
# Evaluation
mse = mean_squared_error(test_dataset.labels.to_numpy(), ensemble_predictions)
mae = mean_absolute_error(test_dataset.labels.to_numpy(), ensemble_predictions)
r2 = r2_score(test_dataset.labels.to_numpy(), ensemble_predictions)
print(f"Ensemble MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")