diff --git a/transformer_bootstrap_agg.py b/transformer_bootstrap_agg.py
new file mode 100644
index 0000000..3d35d1d
--- /dev/null
+++ b/transformer_bootstrap_agg.py
@@ -0,0 +1,310 @@
+import time
+import json
+import math
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from nltk.tokenize import word_tokenize
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Subset
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+from sklearn.metrics import (accuracy_score, precision_recall_curve, f1_score,
+                             confusion_matrix, r2_score, mean_squared_error,
+                             mean_absolute_error)
+from sklearn.model_selection import KFold
+# local imports
+import ml_evaluation as ml_eval
+import ml_helper
+import ml_history
+import dataset_generator as data_gen
+# class imports
+import HumorDataset as humor_ds
+import EarlyStopping
+import BalancedCELoss
+
+
+torch.manual_seed(0)
+np.random.seed(0)
+
+
+best_model_filename = 'best_transformer_reg_model.pt'
+
+device = ml_helper.get_device(verbose=True)
+
+embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()
+
+# derive the dimensions directly from the embedding tensor so they always
+# match the pretrained weights
+vocab_size, d_model = embedding_matrix.size()
+print(f"vocab_size: {vocab_size}, d_model: {d_model}")
+
+
+class PositionalEncoding(nn.Module):
+    # `vocab_size` acts as the maximum sequence length of the positional
+    # table; it only needs to be >= the longest tokenized input.
+    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        pe = torch.zeros(vocab_size, d_model)
+        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, d_model, 2).float()
+            * (-math.log(10000.0) / d_model)
+        )
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x):
+        x = x + self.pe[:, : x.size(1), :]
+        return self.dropout(x)
+
+
+class TransformerBinaryClassifier(nn.Module):
+    # Despite the name, this model is used as a regressor here: a single
+    # linear output trained with MSE on the continuous humor_rating target.
+    def __init__(
+        self,
+        embeddings,
+        nhead=8,
+        dim_feedforward=2048,
+        num_layers=6,
+        positional_dropout=0.1,
+        classifier_dropout=0.1,
+        activation="relu",
+    ):
+        super().__init__()
+
+        vocab_size, d_model = embeddings.size()
+        assert d_model % nhead == 0, "nhead must divide evenly into d_model"
+
+        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
+
+        self.pos_encoder = PositionalEncoding(
+            d_model=d_model,
+            dropout=positional_dropout,
+            vocab_size=vocab_size,
+        )
+
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=dim_feedforward,
+            dropout=classifier_dropout,
+            activation=activation,
+            batch_first=True,  # embedded inputs are (batch, seq_len, d_model)
+        )
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer,
+            num_layers=num_layers,
+        )
+        self.batch_norm = nn.BatchNorm1d(d_model)
+        self.classifier = nn.Linear(d_model, 1)
+        self.d_model = d_model
+
+    def forward(self, x):
+        x = self.emb(x) * math.sqrt(self.d_model)
+        x = self.pos_encoder(x)
+        x = self.transformer_encoder(x)
+        x = x.mean(dim=1)  # mean-pool over the sequence dimension
+        x = self.classifier(x)
+        return x
+
+
+def load_preprocess_data(path_data='data/hack.csv'):
+    df = pd.read_csv(path_data)
+    df = df.dropna(subset=['humor_rating'])
+
+    df['y'] = df['humor_rating']
+    X = df['text']
+    y = df['y']
+    return X, y
+
+
+X, y = load_preprocess_data()
+
+ret_dict = data_gen.split_data(X, y)
+
+params = {
+    'equalize_classes_loss_factor': 0.15,
+    'batch_size': 32,
+    'epochs': 2,
+    'lr': 1e-4,
+    'clipping_max_norm': 0,
+    'early_stopping_patience': 5,
+    'lr_scheduler_factor': 0.5,
+    'lr_scheduler_patience': 3,
+    'nhead': 2,
+    'num_layers': 3,
+    'hidden_dim': 10,
+    'positional_dropout': 0.5,
+    'classifier_dropout': 0.5,
+    'weight_decay': 1e-2
+}
+
+max_len = 280
+
+train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)
+val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)
+test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)
+
+train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
+test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)
+
+early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)
+
+
+def train_model(model, train_dataset, criterion, optimizer, epochs, batch_size):
+    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    model.to(device)
+
+    # Store for plotting
+    train_losses, val_losses = [], []
+    train_r2_scores, val_r2_scores = [], []
+
+    for epoch in range(epochs):
+        model.train()
+        total_loss = 0
+        all_preds, all_targets = [], []
+
+        for inputs, targets in dataloader:
+            inputs, targets = inputs.to(device), targets.to(device)
+            optimizer.zero_grad()
+            outputs = model(inputs).squeeze()
+            loss = criterion(outputs, targets.float())
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+
+            all_preds.extend(outputs.detach().cpu().numpy())
+            all_targets.extend(targets.detach().cpu().numpy())
+
+        # Calculate R2
+        r2 = r2_score(all_targets, all_preds)
+        train_losses.append(total_loss / len(dataloader))
+        train_r2_scores.append(r2)
+
+        # Validation phase
+        model.eval()
+        val_loss = 0
+        val_preds, val_targets = [], []
+
+        with torch.no_grad():
+            for inputs, targets in val_loader:
+                inputs, targets = inputs.to(device), targets.to(device)
+                outputs = model(inputs).squeeze()
+                loss = criterion(outputs, targets.float())
+                val_loss += loss.item()
+
+                val_preds.extend(outputs.cpu().numpy())
+                val_targets.extend(targets.cpu().numpy())
+
+        # Calculate Validation R2
+        val_r2 = r2_score(val_targets, val_preds)
+        val_losses.append(val_loss / len(val_loader))
+        val_r2_scores.append(val_r2)
+
+        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}, R^2 (Train): {r2:.4f}, Val R^2: {val_r2:.4f}")
+
+    return train_losses, val_losses, train_r2_scores, val_r2_scores
+
+
+def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
+    models = []
+    all_train_losses, all_val_losses = [], []
+    all_train_r2_scores, all_val_r2_scores = [], []
+
+    subset_size = len(train_dataset) // num_models
+
+    for i in range(num_models):
+        print(f"Training Model {i + 1}/{num_models}...")
+        # Each model trains on the complement of its slice (fold-complement
+        # subsets) rather than on classic bootstrap resamples with replacement.
+        start_idx = i * subset_size
+        end_idx = start_idx + subset_size
+        subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
+        subset = Subset(train_dataset, subset_indices)
+
+        model = ModelClass()
+        criterion = nn.MSELoss()
+        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+        train_losses, val_losses, train_r2_scores, val_r2_scores = train_model(model, subset, criterion, optimizer, epochs, batch_size)
+
+        models.append(model)
+        all_train_losses.append(train_losses)
+        all_val_losses.append(val_losses)
+        all_train_r2_scores.append(train_r2_scores)
+        all_val_r2_scores.append(val_r2_scores)
+
+    return models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores
+
+
+# Ensemble Prediction
+def ensemble_predict(models, test_dataset):
+    dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
+    all_predictions = []
+
+    # disable dropout for inference
+    for model in models:
+        model.eval()
+
+    with torch.no_grad():
+        for inputs, _ in dataloader:
+            inputs = inputs.to(device)
+            predictions = torch.stack([model(inputs).squeeze() for model in models])
+            avg_predictions = predictions.mean(dim=0)
+            all_predictions.extend(avg_predictions.cpu().numpy())
+
+    return np.array(all_predictions)
+
+
+# Bootstrap Aggregating
+num_models = 2
+ensemble_models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores = bootstrap_aggregation(
+    lambda: TransformerBinaryClassifier(
+        embeddings=embedding_matrix,
+        nhead=params['nhead'],
+        num_layers=params['num_layers'],
+        dim_feedforward=params['hidden_dim'],
+        positional_dropout=params['positional_dropout'],
+        classifier_dropout=params['classifier_dropout']
+    ).to(device),
+    train_dataset,
+    num_models=num_models,
+    epochs=params['epochs'],
+    batch_size=params['batch_size'],
+    learning_rate=params['lr']
+)
+
+# Ensemble Prediction on Testset
+ensemble_predictions = ensemble_predict(ensemble_models, test_dataset)
+
+# Plotting
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
+
+# Plot Train and Validation Losses
+for i in range(num_models):
+    ax1.plot(range(1, params['epochs'] + 1), all_train_losses[i], label=f"Train Model {i+1}")
+    ax1.plot(range(1, params['epochs'] + 1), all_val_losses[i], label=f"Val Model {i+1}")
+
+ax1.set_title('Train and Validation Loss')
+ax1.set_xlabel('Epochs')
+ax1.set_ylabel('Loss')
+ax1.legend()
+
+# Plot Train and Validation R²
+for i in range(num_models):
+    ax2.plot(range(1, params['epochs'] + 1), all_train_r2_scores[i], label=f"Train Model {i+1}")
+    ax2.plot(range(1, params['epochs'] + 1), all_val_r2_scores[i], label=f"Val Model {i+1}")
+
+ax2.set_title('Train and Validation R²')
+ax2.set_xlabel('Epochs')
+ax2.set_ylabel('R²')
+ax2.legend()
+
+plt.tight_layout()
+plt.show()
+
+# Evaluation
+mse = mean_squared_error(test_dataset.labels.to_numpy(), ensemble_predictions)
+mae = mean_absolute_error(test_dataset.labels.to_numpy(), ensemble_predictions)
+r2 = r2_score(test_dataset.labels.to_numpy(), ensemble_predictions)
+
+print(f"Ensemble MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")