Merge branch 'main' of https://gitty.informatik.hs-mannheim.de/3016498/ANLP_WS24_CA2
commit 8097362c61
.gitignore
@ -1,3 +1,6 @@
+# Ignore pycache directory
+__pycache__/
+
 # Ignore virtual environment directory
 .venv/
@ -18,3 +21,6 @@ plots/
 # Ignore plot file
 *.png
 *.jpg
+
+# Ignore everything with delete_me in name
+*delete_me*
HumorDataset.py
@ -0,0 +1,42 @@
"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np
import pandas as pd


class HumorDataset(torch.utils.data.Dataset):

    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
        # Remember the original DataFrame indices before they are reset.
        self.original_indices = labels.index.to_list()

        self.data = data
        self.labels = labels.reset_index(drop=True)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim

        self.shape = self.get_shape()

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

    def get_single_shape(self, data):
        # Per-sample shape for list, tensor, ndarray or pandas Series inputs
        # (the Series branch covers the labels, which `type(...) ==` checks missed).
        if isinstance(data, list):
            return len(data[0])
        elif isinstance(data, (torch.Tensor, np.ndarray)):
            return data[0].shape
        elif isinstance(data, pd.Series):
            return np.asarray(data.iloc[0]).shape
        return None

    def get_shape(self):
        shape_data = self.get_single_shape(self.data)
        shape_labels = self.get_single_shape(self.labels)
        return shape_data, shape_labels
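A quick usage sketch for the class above (toy data; the pandas Series mirrors how the preprocessing script below passes the labels):

    import pandas as pd
    from HumorDataset import HumorDataset

    data = [[1, 2, 3, 0], [4, 5, 0, 0], [6, 0, 0, 0]]    # three padded index sequences
    labels = pd.Series([1.0, 0.0, 1.0], index=[7, 2, 5])  # non-trivial index, as after a split

    ds = HumorDataset(data, labels, vocab_size=10)
    print(len(ds))    # 3
    print(ds.shape)   # (4, ()) -- per-sample shapes of data and labels
    print(ds[0])      # {'input_ids': tensor([1., 2., 3., 0.]), 'labels': tensor(1.)}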
README.md
@ -4,6 +4,17 @@
+## TODOS
+
+data
+- maybe a buffer zone between good and bad jokes (trade-off: less data; see the sketch below this diff)
+- maybe not binary classification
+- maybe change to humor detection (more data available)
+
+- dataset shape doesn't work correctly
+- history: integrate validation loss
+
 ## Data
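A minimal sketch of the buffer-zone idea from that TODO (the 0.25 margin around the median is a hypothetical value, not part of the commit):

    import pandas as pd

    df = pd.read_csv('data/hack.csv').dropna(subset=['humor_rating'])
    median = df['humor_rating'].median()
    margin = 0.25  # hypothetical buffer width

    # Drop ratings that fall inside the buffer zone, then label against the median.
    df = df[(df['humor_rating'] - median).abs() > margin]
    df['y'] = df['humor_rating'] > median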
Binary files not shown.
@ -914,7 +914,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.12.3"
+  "version": "3.10.4"
  }
 },
 "nbformat": 4,
File diff suppressed because one or more lines are too long
@ -0,0 +1,123 @@
"""
This file contains the dataset generation and preprocessing.
"""
import copy
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch

from HumorDataset import HumorDataset


def get_embedding_idx(model, word):
    # NOTE: relies on the module-level `unk_index` defined under __main__.
    if word in model.wv:
        return model.wv.key_to_index[word]
    else:
        return unk_index


def get_embedding_vector(model, word):
    # Out-of-vocabulary words are mapped to a zero vector.
    if word in model.wv:
        return model.wv[word]
    else:
        return np.zeros(model.vector_size)


def encode_tokens(tokens, vector=False):
    # NOTE: relies on the module-level `model_embedding` defined under __main__.
    if vector:
        return [get_embedding_vector(model_embedding, token) for token in tokens]
    else:
        return [get_embedding_idx(model_embedding, token) for token in tokens]


def pad_sequences(sequences, max_len, pad_index):
    # Pad (or truncate) every token sequence to max_len. Built list-wise rather
    # than with np.pad so that a string pad token such as '<PAD>' keeps its
    # full width instead of being truncated by a narrow NumPy string dtype.
    padded = []
    for seq in sequences:
        seq = list(seq)[:max_len]
        padded.append(seq + [pad_index] * (max_len - len(seq)))
    return np.array(padded)


def split_data(X, y, test_size=0.1, val_size=0.1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
    # Split the held-out portion into test and validation sets.
    val_split_ratio = val_size / (val_size + test_size)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_split_ratio, random_state=42)

    ret_dict = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
        'val': {'X': X_val, 'y': y_val}
    }
    return ret_dict


def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
    if not os.path.exists(path):
        print('Creating directory:', path)
        os.makedirs(path)
    print('saving data into:', path)
    for key, value in data_dict.items():
        # transform to Dataset
        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
        # save dataset
        torch.save(dataset, path + prefix + key + '.pt')


if __name__ == "__main__":
    # Load the data from csv
    df = pd.read_csv('data/hack.csv')
    print(df.shape)

    df = df.dropna(subset=['humor_rating'])

    # Label against the median of humor_rating: everything above it counts as a good joke.
    median_rating = df['humor_rating'].median()
    df['y'] = df['humor_rating'] > median_rating

    X = df['text']
    y = df['y']

    # Tokenize the data with nltk
    tokens = [word_tokenize(text.lower()) for text in X]

    vocab_size = len(set(word for sentence in tokens for word in sentence))
    print('vocab size:', vocab_size)

    # Pad the sequences
    # NOTE: Info comes from the data-exploration notebook: 280 is the max length;
    # 139 covers 80% and 192 covers 95% of the data.
    max_len = 280
    padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')
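    # The 280/139/192 figures above are quoted from the data-exploration
    # notebook; an illustrative way to recompute such cut-offs from `tokens`:
    lengths = [len(t) for t in tokens]
    print('length percentiles (80/95/100):', np.percentile(lengths, [80, 95, 100]))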
    # split data into train, test, and validation
    data_dict = split_data(padded_indices, y)

    # TODO: test gloVe embeddings
    # Embed the data with word2vec
    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)

    # Add a special token for out-of-vocabulary words
    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
    unk_index = model_embedding.wv.key_to_index['<UNK>']

    # Add a padding token
    model_embedding.wv.add_vector('<PAD>', np.zeros(model_embedding.vector_size))
    pad_index = model_embedding.wv.key_to_index['<PAD>']

    # Deep copy so that encoding the index-based variant does not overwrite
    # the token lists still needed for the vector-based variant below.
    data_idx_based = copy.deepcopy(data_dict)
    vector_based = False

    # Encode the tokens as vocabulary indices
    for key in data_idx_based.keys():
        data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]

    # save the data
    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)

    vector_based = True
    # Encode the tokens as embedding vectors
    for key in data_dict.keys():
        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]

    # Save the data
    save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
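For the GloVe TODO above, one possible route is gensim's downloader; a sketch (the model name is one of gensim's pretrained sets, and note the result is a bare KeyedVectors object with no .wv attribute, unlike the Word2Vec model above):

    import gensim.downloader

    glove = gensim.downloader.load('glove-wiki-gigaword-100')  # 100-dim KeyedVectors

    def glove_vector(kv, word):
        # Zero vector for out-of-vocabulary words, mirroring get_embedding_vector above.
        return kv[word] if word in kv else np.zeros(kv.vector_size)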
gpu_check.py
@ -1,16 +0,0 @@
-import torch
-
-# Check if CUDA is available
-cuda_available = torch.cuda.is_available()
-print(f"CUDA available: {cuda_available}")
-
-if cuda_available:
-    # Print the current CUDA device
-    current_device = torch.cuda.current_device()
-    print(f"Current CUDA device: {current_device}")
-
-    # Print the name of the current CUDA device
-    device_name = torch.cuda.get_device_name(current_device)
-    print(f"CUDA device name: {device_name}")
-else:
-    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
ml_helper.py
@ -0,0 +1,89 @@
import torch
import nltk

import time
import json
import os


def get_device(verbose=False):
    """
    Get the current device (CPU or GPU) for PyTorch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if verbose:
        print('Using device:', device)
    return device


def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
    """
    Save the model and hyperparameters to disk.
    **kwargs: hyperparameters to save
    """
    # Create a timestamp
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")

    accuracy = round(accuracy, 4)

    # Make sure the output directory exists
    os.makedirs('models', exist_ok=True)

    # Save the model state dictionary
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")

    # Save the hyperparameters as a JSON file
    hyperparameters = kwargs
    hyperparameters['accuracy'] = accuracy
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    with open(hyperparameters_path, 'w') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")


def get_newest_model_path(path, name=None, extension=".pth"):
    """
    Get the newest file in a directory.
    """
    # List all files in the directory with the given extension
    files = [f for f in os.listdir(path) if f.endswith(extension)]
    # Keep only files whose name contains `name`
    if name:
        files = [f for f in files if name in f]

    # Sort files by modification time, newest first
    files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)), reverse=True)

    if files:
        return os.path.join(path, files[0])
    else:
        print("No file found in the directory")
        return None


def main():
    """
    Main function used to set up the environment.
    """
    # download nltk data
    nltk.download('punkt')
    nltk.download('punkt_tab')

    # Check if CUDA is available
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    if cuda_available:
        # Print the current CUDA device
        current_device = torch.cuda.current_device()
        print(f"Current CUDA device: {current_device}")

        # Print the name of the current CUDA device
        device_name = torch.cuda.get_device_name(current_device)
        print(f"CUDA device name: {device_name}")
    else:
        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")


if __name__ == "__main__":
    main()
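A brief usage sketch for the helpers above (the 'models' directory and 'transformer' prefix match how the training script below saves its runs):

    import ml_helper

    device = ml_helper.get_device(verbose=True)
    # Newest saved transformer weights, filtered by name substring and extension.
    latest = ml_helper.get_newest_model_path('models', name='transformer', extension='.pth')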
ml_history.py
@ -0,0 +1,48 @@
import numpy as np


class History:
    """
    Class to store the history of the training process.
    Used to store the loss and accuracy of the training and validation sets.
    """
    def __init__(self):
        self.history = {
            'loss': [],
            'train_acc': [],
            'val_acc': [],
        }
        self.batch_history = {
            'loss': [],
            'train_acc': [],
            'val_acc': [],
        }

    def update(self):
        # Average the per-batch values of the current epoch into the epoch history.
        self.history['loss'].append(np.mean(self.batch_history['loss']))
        self.history['train_acc'].append(np.mean(self.batch_history['train_acc']))
        self.history['val_acc'].append(np.mean(self.batch_history['val_acc']))

    def get_history(self):
        return self.history

    def batch_reset(self):
        self.batch_history = {
            'loss': [],
            'train_acc': [],
            'val_acc': [],
        }

    def batch_update(self, loss, train_acc, val_acc):
        self.batch_history['loss'].append(loss)
        self.batch_history['train_acc'].append(train_acc)
        self.batch_history['val_acc'].append(val_acc)

    def batch_update_train(self, loss, train_acc):
        self.batch_history['loss'].append(loss)
        self.batch_history['train_acc'].append(train_acc)

    def batch_update_val(self, val_acc):
        self.batch_history['val_acc'].append(val_acc)

    def get_batch_history(self):
        return self.batch_history
@ -42,6 +42,13 @@ import time
 import torchvision
 torchvision.disable_beta_transforms_warning()

+def get_device(verbose=False):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    if verbose:
+        print('Using device:', device)
+    return device
+
 # Test if GPU is available
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print('Using device:', DEVICE)

@ -69,7 +76,7 @@ def pad_sequences(sequences, MAX_LEN):
 class HumorDataset(torch.utils.data.Dataset):
     def __init__(self, encodings, labels):
         self.encodings = encodings
-        self.labels = labels
+        self.labels = labels.reset_index(drop=True)

     def __getitem__(self, idx):
         item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)}
@ -0,0 +1,199 @@
"""
This file contains the transformer model.
"""

# TODO refactor the code
# TODO create ml helper script
# TODO create ml evaluation script

# TODO track overfitting better
# TODO validate model in training (accuracy, loss, etc.)

# TODO set length to a constant value, i.e. the max sentence length or close to it

# TODO use gloVe embeddings

# TODO: add attention mask
# TODO: add positional encoding
# TODO: add dropout (if needed)

import time
import json

import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW is deprecated
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score

import ml_helper
import ml_history


class TransformerBinaryClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True because the inputs arrive as (batch, seq_len, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers,
                                          hidden_dim, dropout, batch_first=True)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        input_ids = input_ids.long()
        embedded = self.embedding(input_ids)
        # Encoder-decoder transformer fed the same sequence on both sides
        transformer_output = self.transformer(embedded, embedded)
        # Mean-pool over the sequence dimension
        pooled_output = transformer_output.mean(dim=1)
        logits = self.fc(pooled_output)
        return self.sigmoid(logits)
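# For the positional-encoding TODO above: a minimal sinusoidal sketch (the
# standard "Attention Is All You Need" formulation; not part of this commit).
# It would be applied to `embedded` before the self.transformer call.
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=280):
        super().__init__()
        pos = torch.arange(max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, embed_dim, 2) * (-np.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(pos * div)  # even dimensions
        pe[:, 1::2] = torch.cos(pos * div)  # odd dimensions
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, embed_dim)
        return x + self.pe[:x.size(1)]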
if __name__ == "__main__":

    # Load the data
    data_path = 'data/idx_based_padded'

    # weights_only=False: the files contain pickled HumorDataset objects, not bare tensors
    train_dataset = torch.load(data_path + '/train.pt', weights_only=False)
    test_dataset = torch.load(data_path + '/test.pt', weights_only=False)
    val_dataset = torch.load(data_path + '/val.pt', weights_only=False)

    # +2 for the padding and unk tokens
    vocab_size = train_dataset.vocab_size + 2
    embed_dim = 100  # train_dataset.emb_dim

    # NOTE: Info comes from the data-exploration notebook: 280 is the max length;
    # 139 covers 80% and 192 covers 95% of the data
    max_len = 280

    device = ml_helper.get_device(verbose=True)

    # Model hyperparameters
    num_heads = 2
    num_layers = 2
    hidden_dim = 256

    model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)

    # Training parameters
    epochs = 3
    batch_size = 8
    learning_rate = 2e-5

    # Optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # BCELoss, since the model already applies a sigmoid to its output
    criterion = nn.BCELoss()

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    ################################################################################################
    # Training
    ################################################################################################

    # Initialize the history
    history = ml_history.History()

    # Model to device
    model.to(device)

    print("Starting training...")
    start_training_time = time.time()

    # Training loop
    model.train()
    for epoch in range(epochs):
        # init batch tracking
        epoch_start_time = time.time()
        history.batch_reset()

        for batch in train_loader:
            optimizer.zero_grad()
            # prepare batch
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].unsqueeze(1).to(device)
            # forward pass
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            # backward pass
            loss.backward()
            optimizer.step()
            # calculate train accuracy
            preds = outputs.round()
            train_acc = accuracy_score(labels.cpu().detach().numpy(),
                                       preds.cpu().detach().numpy())
            # update batch history
            history.batch_update_train(loss.item(), train_acc)

        # calculate validation accuracy
        model.eval()
        with torch.no_grad():
            for val_batch in val_loader:
                val_input_ids = val_batch['input_ids'].to(device)
                val_labels_batch = val_batch['labels'].unsqueeze(1).to(device)
                val_outputs = model(val_input_ids)
                val_acc = accuracy_score(val_labels_batch.cpu().numpy(),
                                         val_outputs.round().cpu().numpy())
                history.batch_update_val(val_acc)
        model.train()

        # update epoch history
        history.update()

        epoch_end_time = time.time()

        print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}")

    end_training_time = time.time()
    print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")

    ################################################################################################
    # Evaluation
    ################################################################################################
    print("Starting evaluation...")

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].unsqueeze(1).to(device)

            outputs = model(input_ids)
            preds = outputs.round()
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")

    ################################################################################################
    # Save model and hyperparameters
    ################################################################################################
    timestamp = time.strftime("%Y%m%d-%H%M%S")

    ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp,
                                             max_len=max_len,
                                             vocab_size=vocab_size,
                                             embed_dim=embed_dim,
                                             num_heads=num_heads,
                                             num_layers=num_layers,
                                             hidden_dim=hidden_dim,
                                             epochs=epochs,
                                             batch_size=batch_size,
                                             learning_rate=learning_rate)

    # save history
    history_path = f'models/transformer_history_{timestamp}.json'
    with open(history_path, 'w') as f:
        json.dump(history.get_history(), f)
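A sketch for reloading a saved run (exact file names depend on the run's accuracy and timestamp, hence the lookup via ml_helper):

    import json
    import torch
    import ml_helper

    weights = ml_helper.get_newest_model_path('models', name='transformer_acc', extension='.pth')
    params = ml_helper.get_newest_model_path('models', name='transformer_para', extension='.json')

    with open(params) as f:
        hp = json.load(f)

    model = TransformerBinaryClassifier(hp['vocab_size'], hp['embed_dim'], hp['num_heads'],
                                        hp['num_layers'], hp['hidden_dim'])
    model.load_state_dict(torch.load(weights))
    model.eval()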
File diff suppressed because one or more lines are too long