Merge branch 'main' of https://gitty.informatik.hs-mannheim.de/3016498/ANLP_WS24_CA2

2025-01-27 13:56:02 +01:00 · 2025-01-27 13:56:02 +01:00 · 8097362c61
parent a25821f20c 8279123019
commit 8097362c61
18 changed files with 1814 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,6 @@
+# Ignore pycache directory
+__pycache__/
+
 # Ignore virtual environment directory
 .venv/

@ -18,3 +21,6 @@ plots/
 # Ignore plot file
 *.png
 *.jpg
+
+# Ignore everything with delete_me in name
+*delete_me*
--- a/HumorDataset.py
+++ b/HumorDataset.py
@ -0,0 +1,42 @@
+"""
+This file contains the HumorDataset class.
+"""
+import torch
+import numpy as np
+
+class HumorDataset(torch.utils.data.Dataset):
+    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
+        self.original_indices = labels.index.to_list()
+
+        self.data = data
+        self.labels = labels.reset_index(drop=True)
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+
+        # TODO: bug fix
+        self.shape = self.get_shape()
+            
+
+    def __getitem__(self, idx):
+        item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+    def get_single_shape(self, data):
+        shape_data = None
+        if type(data) == list:
+            shape_data = len(data[0])
+        elif type(data) == torch.Tensor:
+            shape_data = data[0].shape
+        elif type(data) == np.ndarray:
+            shape_data = data[0].shape
+        return shape_data
+
+    def get_shape(self):
+        shape_data = self.get_single_shape(self.data)
+        shape_labels = self.get_single_shape(self.labels)
+        return shape_data, shape_labels
+    
--- a/README.md
+++ b/README.md
@ -4,6 +4,17 @@



+## TODOS
+data
+- maybe buffer zone between good and bad jokes (trade off would be less data)
+- maybe not binary classification
+- maybe change to humor detection (more data available)
+
+
+- dataset shape doesn't work correctly
+
+- history: integrate validation loss
+
 ## Data


--- a/data/embedded_padded/test.pt
+++ b/data/embedded_padded/test.pt
--- a/data/embedded_padded/train.pt
+++ b/data/embedded_padded/train.pt
--- a/data/embedded_padded/val.pt
+++ b/data/embedded_padded/val.pt
--- a/data/idx_based_padded/test.pt
+++ b/data/idx_based_padded/test.pt
--- a/data/idx_based_padded/train.pt
+++ b/data/idx_based_padded/train.pt
--- a/data/idx_based_padded/val.pt
+++ b/data/idx_based_padded/val.pt
--- a/data_explore_hack.ipynb
+++ b/data_explore_hack.ipynb
@ -914,7 +914,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.10.4"
  }
 },
 "nbformat": 4,
--- a/data_explore_hack_rating.ipynb
+++ b/data_explore_hack_rating.ipynb
--- a/dataset_generator.py
+++ b/dataset_generator.py
@ -0,0 +1,123 @@
+"""
+This file contains the dataset generation and preprocessing.
+"""
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from nltk.tokenize import word_tokenize
+import gensim
+import torch
+import os
+
+from HumorDataset import HumorDataset
+
+def get_embedding_idx(model, word):
+    if word in model.wv:
+        return model.wv.key_to_index[word]
+    else:
+        return unk_index
+
+def get_embedding_vector(model, word):
+    if word in model.wv:
+        return model.wv[word]
+    else:
+        return np.zeros(model.vector_size)
+
+def encode_tokens(tokens, vector=False):
+    if vector:
+        return [get_embedding_vector(model_embedding, token) for token in tokens]
+    else:
+        return [get_embedding_idx(model_embedding, token) for token in tokens]
+
+def pad_sequences(sequences, max_len, pad_index):
+    return np.array([np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=pad_index) if len(seq) < max_len else seq[:max_len] for seq in sequences])
+
+
+def split_data(X, y, test_size=0.1, val_size=0.1):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
+    val_split_ratio = val_size / (val_size + test_size)
+    X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, test_size=val_split_ratio, random_state=42)
+
+    ret_dict = {
+        'train': {'X': X_train, 'y': y_train},
+        'test': {'X': X_test, 'y': y_test},
+        'val': {'X': X_val, 'y': y_val}
+    }
+    return ret_dict
+
+def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
+    if not os.path.exists(path):
+        print('Creating directory:', path)
+        os.makedirs(path)
+    print('saving data into:', path)
+    for key, value in data_dict.items():
+        # tansform to Dataset
+        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
+        # save dataset
+        torch.save(dataset, path + prefix + key + '.pt')
+    
+if __name__ == "__main__":
+    # Load the data from csv
+    df = pd.read_csv('data/hack.csv')
+    print(df.shape)
+
+    df = df.dropna(subset=['humor_rating'])
+
+    # find median of humor_rating
+    median_rating = df['humor_rating'].median()
+    #print('median and therefore middle of humor_rating:', median_rating)
+
+    df['y'] = df['humor_rating'] > median_rating 
+
+    # transfrom data into dataset
+    X = df['text']
+    y = df['y']
+
+    # Tokenize the data with nltk
+    tokens = [word_tokenize(text.lower()) for text in X]
+
+    vocab_size = len(set([word for sentence in tokens for word in sentence]))
+    print('vocab size:', vocab_size)
+
+    # Pad the sequences 
+    # NOTE: Info comes from data explore notebook: 280 is max length, 
+    # 139 contains 80% and 192 contains 95% of the data
+    max_len = 280
+    padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')
+
+    # split data into train, test, and validation
+    data_dict = split_data(padded_indices, y)
+
+    # TODO: test gloVe embeddings
+    # Embed the data with word2vec
+    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)
+
+    # Add a special token for out-of-vocabulary words
+    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
+    unk_index = model_embedding.wv.key_to_index['<UNK>']
+
+   # Add padding index for padding
+    model_embedding.wv.add_vector('<PAD>', np.zeros(model_embedding.vector_size))
+    pad_index = model_embedding.wv.key_to_index['<PAD>']
+
+
+    data_idx_based = data_dict.copy()
+    vector_based = False
+
+    for key in data_idx_based.keys():
+        data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        # print shape of data
+        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
+
+    # save the data
+    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
+
+    vector_based = True
+    # Encode the tokens
+    for key in data_dict.keys():
+        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        # print shape of data
+        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
+        
+    # Save the data
+    save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
--- a/gpu_check.py
+++ b/gpu_check.py
@ -1,16 +0,0 @@
-import torch
-
-# Check if CUDA is available
-cuda_available = torch.cuda.is_available()
-print(f"CUDA available: {cuda_available}")
-
-if cuda_available:
-    # Print the current CUDA device
-    current_device = torch.cuda.current_device()
-    print(f"Current CUDA device: {current_device}")
-
-    # Print the name of the current CUDA device
-    device_name = torch.cuda.get_device_name(current_device)
-    print(f"CUDA device name: {device_name}")
-else:
-    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
--- a/ml_helper.py
+++ b/ml_helper.py
@ -0,0 +1,89 @@
+import torch
+import nltk
+
+import time
+import json
+import os
+
+def get_device(verbose=False):
+    """
+    Get the current device (CPU or GPU) for PyTorch.
+    """
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    if verbose:
+        print('Using device:', device)
+    return device
+
+def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None,**kwargs):
+    """
+    Save the model and hyperparameters to disk.
+    **kwargs: hyperparameters to save
+    """
+    # Create a timestamp
+    if timestamp is None:
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+    
+    accuracy = round(accuracy, 4)
+
+    # Save the model state dictionary
+    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
+    torch.save(model.state_dict(), model_path)
+    print(f"Model saved to {model_path}.")
+    
+    # Save the hyperparameters as a JSON file
+    hyperparameters = kwargs
+    hyperparameters['accuracy'] = accuracy
+    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
+    with open(hyperparameters_path, 'w') as f:
+        json.dump(hyperparameters, f)
+    print(f"Hyperparameters saved to {hyperparameters_path}.")
+
+def get_newest_model_path(path, name=None, extension=".pth"):
+    """
+    Get the newest file in a directory.
+    """
+    # List all files in the directory
+    files = [f for f in os.listdir(path) if f.endswith(extension)]
+    # List all files with name in it
+    if name:
+        files = [f for f in files if name in f]
+
+    # Sort files by modification time
+    files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)), reverse=True)
+    
+    # Get the newest file
+    if files:
+        newest_model_path = os.path.join(path, files[0])
+        return newest_model_path
+    else:
+        print("No File found in the directory")
+        return None
+
+
+def main():
+    """
+    Main function used to set up the environment.
+    """
+    # download nltk data
+    nltk.download('punkt')
+    nltk.download('punkt_tab')
+
+
+    # Check if CUDA is available
+    cuda_available = torch.cuda.is_available()
+    print(f"CUDA available: {cuda_available}")
+
+    if cuda_available:
+        # Print the current CUDA device
+        current_device = torch.cuda.current_device()
+        print(f"Current CUDA device: {current_device}")
+
+        # Print the name of the current CUDA device
+        device_name = torch.cuda.get_device_name(current_device)
+        print(f"CUDA device name: {device_name}")
+    else:
+        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
+
+
+if __name__ == "__main__":
+    main()
--- a/ml_history.py
+++ b/ml_history.py
@ -0,0 +1,48 @@
+import numpy as np
+
+class History:
+    """
+    Class to store the history of the training process.
+    Used to store the loss and accuracy of the training and validation sets.
+    """
+    def __init__(self):
+        self.history = {
+            'loss': [],
+            'train_acc': [],
+            'val_acc': [],
+        }
+        self.batch_history = {
+            'loss': [],
+            'train_acc': [],
+            'val_acc': [],
+        }
+
+    def update(self):
+        self.history['loss'].append(np.mean(self.batch_history['loss']))
+        self.history['train_acc'].append(np.mean(self.batch_history['train_acc']))
+        self.history['val_acc'].append(np.mean(self.batch_history['val_acc']))
+
+    def get_history(self):
+        return self.history
+    
+    def batch_reset(self):
+        self.batch_history = {
+            'loss': [],
+            'train_acc': [],
+            'val_acc': [],
+        }
+    
+    def batch_update(self, loss, train_acc, val_acc):
+        self.batch_history['loss'].append(loss)
+        self.batch_history['train_acc'].append(train_acc)
+        self.batch_history['val_acc'].append(val_acc)
+
+    def batch_update_train(self, loss, train_acc):
+        self.batch_history['loss'].append(loss)
+        self.batch_history['train_acc'].append(train_acc)
+
+    def batch_update_val(self, val_acc):
+        self.batch_history['val_acc'].append(val_acc)
+
+    def get_batch_history(self):
+        return self.batch_history
--- a/transformer_1a.py
+++ b/transformer_1a.py
@ -42,6 +42,13 @@ import time
 import torchvision
 torchvision.disable_beta_transforms_warning()

+
+def get_device(verbose=False):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    if verbose:
+        print('Using device:', device)
+    return device
+
 # Test if GPU is available
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print('Using device:', DEVICE)
@ -69,7 +76,7 @@ def pad_sequences(sequences, MAX_LEN):
 class HumorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
-        self.labels = labels
+        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)}
--- a/transformer_1b.py
+++ b/transformer_1b.py
@ -0,0 +1,199 @@
+"""
+This file contains the transformer model.
+"""
+
+
+# TODO refactor the code
+# TODO create ml helper script
+# TODO create ml evaluation script
+
+# TODO track overfitting better
+# TODO validate model in training (accuracy, loss, etc)
+
+# TODO set length to a constant value which is the max length of the sentences or nearly
+
+
+# TODO use GloVe embeddings
+
+#TODO: add attention mask
+# TODO: add positional encoding
+#TODO: add dropout (if needed)
+
+import time
+import json
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from transformers import AdamW
+
+from sklearn.metrics import accuracy_score
+
+import ml_helper
+import ml_history
+
+class TransformerBinaryClassifier(nn.Module):
+    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
+        super(TransformerBinaryClassifier, self).__init__()
+        self.embedding = nn.Embedding(vocab_size, embed_dim)
+        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, hidden_dim, dropout)
+        self.fc = nn.Linear(embed_dim, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, input_ids):
+        input_ids = input_ids.long()
+        embedded = self.embedding(input_ids)
+        transformer_output = self.transformer(embedded, embedded)
+        pooled_output = transformer_output.mean(dim=1)
+        logits = self.fc(pooled_output)
+        return self.sigmoid(logits)
+    
+
+
+if __name__ == "__main__":
+
+    # Load the data
+    data_path = 'data/idx_based_padded'
+    
+    train_dataset = torch.load(data_path + '/train.pt')
+    test_dataset = torch.load(data_path + '/test.pt') 
+    val_dataset = torch.load(data_path + '/val.pt')
+
+    # +2 for padding and unk tokens
+    vocab_size = train_dataset.vocab_size + 2 
+    embed_dim = 100 #train_dataset.emb_dim
+
+    # NOTE: Info comes from data explore notebook: 280 is max length, 
+    # 139 contains 80% and 192 contains 95% of the data
+    max_len = 280
+
+    device = ml_helper.get_device(verbose=True)
+
+    # Model hyperparameters
+    num_heads = 2
+    num_layers = 2
+    hidden_dim = 256
+
+    model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)
+
+    # Training parameters
+    epochs = 3 #3
+    batch_size = 8
+    learning_rate = 2e-5
+
+    # Optimizer and loss function
+    optimizer = AdamW(model.parameters(), lr=learning_rate)
+    criterion = nn.BCEWithLogitsLoss()
+
+
+    # Data loaders
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
+
+
+    ################################################################################################
+    # Training
+    ################################################################################################
+
+    # Initialize the history
+    history = ml_history.History()
+
+    # Model to device
+    model.to(device)
+
+    print("Starting training...")
+    start_training_time = time.time()
+
+    # Training loop
+    model.train()
+    for epoch in range(epochs):
+        # init batch tracking
+        epoch_start_time = time.time()
+        history.batch_reset()
+
+        for batch in train_loader:
+            optimizer.zero_grad()
+            # prepare batch
+            input_ids = batch['input_ids'].to(device)
+            labels = batch['labels'].unsqueeze(1).to(device)
+            # forward pass
+            outputs = model(input_ids)
+            loss = criterion(outputs, labels)
+            # backward pass
+            loss.backward()
+            optimizer.step()
+            # calculate accuracy train
+            preds = outputs.round()
+            train_acc = accuracy_score(labels.cpu().detach().numpy(), 
+                                       preds.cpu().detach().numpy())
+            # update batch history
+            history.batch_update_train(loss.item(), train_acc)
+
+        # calculate accuracy val
+        model.eval()
+        with torch.no_grad():
+            for val_batch in val_loader:
+                val_input_ids = val_batch['input_ids'].to(device)
+                val_labels_batch = val_batch['labels'].unsqueeze(1).to(device)
+                val_outputs = model(val_input_ids)
+                val_acc = accuracy_score(val_outputs.round().cpu().numpy(),
+                                      val_labels_batch.cpu().numpy())
+                history.batch_update_val(val_acc)
+        model.train()
+
+        # update epoch history
+        history.update()
+
+        epoch_end_time = time.time()
+
+        print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}")
+        
+    end_training_time = time.time()
+    print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")
+
+
+    ################################################################################################
+    # Evaluation
+    ################################################################################################
+    print("Starting evaluation...")
+    
+    model.eval()
+    predictions, true_labels = [], []
+    with torch.no_grad():
+        for batch in test_loader:
+            input_ids = batch['input_ids'].to(device)
+            labels = batch['labels'].unsqueeze(1).to(device)
+            
+            outputs = model(input_ids)
+            preds = outputs.round()
+            predictions.extend(preds.cpu().numpy())
+            true_labels.extend(labels.cpu().numpy())
+
+    accuracy = accuracy_score(true_labels, predictions)
+    print(f"Accuracy: {accuracy}")
+
+
+    ################################################################################################
+    # Save model and hyperparameters
+    ################################################################################################
+    timestamp = time.strftime("%Y%m%d-%H%M%S")
+    
+    ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp, 
+                                             max_len=max_len, 
+                                             vocab_size=vocab_size, 
+                                             embed_dim=embed_dim, 
+                                             num_heads=num_heads, 
+                                             num_layers=num_layers, 
+                                             hidden_dim=hidden_dim, 
+                                             epochs=epochs, 
+                                             batch_size=batch_size, 
+                                             learning_rate=learning_rate)
+
+    #save history
+    
+    history_path = f'models/transformer_history_{timestamp}.json'
+    with open(history_path, 'w') as f:
+        json.dump(history.get_history(), f)
--- a/transformer_evaluation.ipynb
+++ b/transformer_evaluation.ipynb