In [None]:
import time
import json
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix
from sklearn.model_selection import KFold
# local imports
import ml_evaluation as ml_eval
import ml_helper
import ml_history
import dataset_generator as data_gen
# class imports
import HumorDataset as humor_ds
import EarlyStopping
import BalancedCELoss


# architecture inspired:
# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/

# TODO: maybe KFold for cross validation?


In [2]:
torch.manual_seed(0)
np.random.seed(0)


best_model_filename = 'best_transformer_reg_model.pt'

device = ml_helper.get_device(verbose=True)

Using device: cuda


### Load Embeddings

In [3]:
embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()

vocab_size = len(embedding_matrix)
d_model = len(embedding_matrix[0])
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")

400002
vocab_size: 400002, d_model: 100
vocab_size: 400002, d_model: 100


### Define Model

In [None]:
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)


class TransformerBinaryClassifier(nn.Module):
    """
    Text classifier based on a pytorch TransformerEncoder.
    """

    def __init__(
        self,
        embeddings,
        nhead=8,
        dim_feedforward=2048,
        num_layers=6,
        positional_dropout=0.1,
        classifier_dropout=0.1,
        activation="relu",
    ):

        super().__init__()

        vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)

        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=positional_dropout,
            vocab_size=vocab_size,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=classifier_dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        # normalize to stabilize and stop overfitting
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.classifier = nn.Linear(d_model, 1)
        self.d_model = d_model
        #self.softmax = nn.Softmax(dim=1)
        #self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        # normalize to stabilize and stop overfitting
        #x = self.batch_norm(x)

        #NOTE: no activation function for regression
        # sigmoid would only distort the output
        x = self.classifier(x)
        
        return x


### Load data

In [20]:
def load_preprocess_data(path_data='data/hack.csv'):
    df = pd.read_csv(path_data)
    df = df.dropna(subset=['humor_rating'])

    df['y'] = df['humor_rating']
    X = df['text']
    y = df['y']
    return X, y

In [26]:
X,y = load_preprocess_data()

ret_dict = data_gen.split_data(X, y)


train 3945 3945
test 494 494
val 493 493


### Set hyper params

In [27]:
params = {
    # used for class balancing
    'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)
    # training parameters
    'batch_size': 32, # 32 (16 to 64)
    'epochs': 100, # 100
    'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)
    
    # NOTE: used for gradient clipping (needed for lstm and transformer)
    # use 0 to disable
    'clipping_max_norm': 0, # 0 (0.5 to 2.0)
    
    # patience for early stopping
    'early_stopping_patience': 5, # 5 (3 to 10)

    # learning rate scheduler
    'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)
    'lr_scheduler_patience': 3, # 3 (2 to 5)

    # model parameters
    'nhead': 2, # 5
    'num_layers': 3, # 6
    'hidden_dim': 10, # 50

    # regularization parameters
    'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)
    'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)
    'weight_decay': 1e-2 # 0.0  (1e-6 to 1e-2)
}

# Model initialization
model = TransformerBinaryClassifier(embeddings=embedding_matrix, 
                                    nhead=params['nhead'], 
                                    num_layers=params['num_layers'], 
                                    dim_feedforward=params['hidden_dim'],
                                    positional_dropout=params['positional_dropout'],
                                    classifier_dropout=params['classifier_dropout']
                                    )
model.to(device)
print('model created')

model created




### create datasets

In [8]:
# NOTE: Info comes from data explore notebook: 280 is max length,
# 139 contains 80% and 192 contains 95% of the data
max_len = 280

train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)
val_dataset =  humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)
test_dataset =  humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)

print('datasets length:', len(train_dataset), len(val_dataset))
#NOTE: overfitting test
#train_dataset.labels = train_dataset.labels[:100]
#train_dataset.texts = train_dataset.texts[:100]

train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

# NOTE: samller because of batches not all data
print(f"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}")

datasets length: 3945 493
train: 124, val: 16, test: 16


### Set training requirements

In [10]:
#TODO: change to RMSE
"""
criterion = nn.MSELoss()
loss = torch.sqrt(criterion(x, y))
loss.backward()
print(x.grad)
"""
criterion = nn.MSELoss()

optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), 
                            lr=params['lr']) #, 
                            #weight_decay=params['weight_decay'])
"""
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                        factor=params['lr_scheduler_factor'],
                                                        patience=params['lr_scheduler_patience'],
                                                        verbose=True)
"""
early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)


# Training loop

In [None]:
# Training loop

for epoch in range(params['epochs']):
    epoch_start_time = time.time()
    model.train()
    
    train_loss = 0.0
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device).float() 

        outputs = model(input_ids)
        outputs = outputs.squeeze().float()
        loss = criterion(outputs, labels)
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])
        optimizer.step()
        preds = outputs
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device).float() 
            outputs = model(input_ids)
            outputs = outputs.squeeze().float()
            loss = criterion(outputs, labels)
            preds = outputs
            
            val_loss += loss.item()

    val_loss /= len(val_loader)
    
    epoch_end_time = time.time()
    
    print(f'Epoch {epoch+1}/{params["epochs"]}, '
          f'Train Loss: {train_loss:.4f}, '
          f'Val Loss: {val_loss:.4f}, '
          f'Time: {epoch_end_time - epoch_start_time:.2f}s')

    

Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s
Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s
Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s
Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s
Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s
Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s
Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s
Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s
Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s
Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s
Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s
Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s
Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s
Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s
Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s
Epoch 16/100, Train Loss: 1.8097, Val Loss: 1.887