klara 2025-01-27 13:56:02 +01:00
commit 8097362c61
18 changed files with 1814 additions and 19 deletions

8
.gitignore vendored
View File

@ -1,3 +1,6 @@
# Ignore pycache directory
__pycache__/
# Ignore virtual environment directory
.venv/
@ -17,4 +20,7 @@ plots/
# Ignore plot file
*.png
*.jpg
*.jpg
# Ignore everything with delete_me in name
*delete_me*

42
HumorDataset.py 100644
View File

@ -0,0 +1,42 @@
"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np
class HumorDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs encoded texts with their humor labels.

    Args:
        data: Indexable collection of encoded samples (list, ``np.ndarray``
            or ``torch.Tensor``). ``data[i]`` must be convertible to a float
            tensor.
        labels: Pandas-like object providing ``.index.to_list()`` and
            ``.reset_index(drop=True)`` (e.g. a ``pd.Series``).
        vocab_size: Vocabulary size stored alongside the data.
        emb_dim: Embedding dimension stored alongside the data, if any.
    """

    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
        # Keep the original row ids before the labels are renumbered 0..n-1,
        # so samples can still be traced back to the source frame.
        self.original_indices = labels.index.to_list()
        self.data = data
        self.labels = labels.reset_index(drop=True)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        # (shape of one sample, shape of one label)
        # NOTE(review): confirm this matches what downstream code expects.
        self.shape = self.get_shape()

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with float 'input_ids' and 'labels'."""
        item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def get_single_shape(self, data):
        """Return the shape of the first element of ``data``.

        Returns:
            ``len(data[0])`` for a list, ``data[0].shape`` for a tensor or
            ndarray, ``np.shape(data.iloc[0])`` for pandas-like objects
            (``()`` for scalar labels), and ``None`` for empty or
            unsupported containers.
        """
        if data is None or len(data) == 0:
            # Nothing to inspect; avoid indexing into an empty container.
            return None
        if isinstance(data, list):
            return len(data[0])
        if isinstance(data, (torch.Tensor, np.ndarray)):
            return data[0].shape
        if hasattr(data, 'iloc'):
            # pandas Series/DataFrame: use positional access so the result
            # does not depend on the index labels.
            return np.shape(data.iloc[0])
        return None

    def get_shape(self):
        """Return ``(sample_shape, label_shape)`` for this dataset."""
        shape_data = self.get_single_shape(self.data)
        shape_labels = self.get_single_shape(self.labels)
        return shape_data, shape_labels

View File

@ -4,6 +4,17 @@
## TODOS
data
- maybe buffer zone between good and bad jokes (trade off would be less data)
- maybe not binary classification
- maybe change to humor detection (more data available)
- dataset shape doesn't work correctly
- history: integrate validation loss
## Data

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -914,7 +914,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.10.4"
}
},
"nbformat": 4,

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,123 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch
import os
from HumorDataset import HumorDataset
def get_embedding_idx(model, word):
    """Return the embedding index of ``word``.

    Looks the word up in ``model.wv.key_to_index``; words unknown to the
    model map to the module-level ``unk_index``.
    """
    vectors = model.wv
    return vectors.key_to_index[word] if word in vectors else unk_index
def get_embedding_vector(model, word):
    """Return the embedding vector of ``word``.

    Words unknown to the model yield a zero vector of length
    ``model.vector_size``.
    """
    if word not in model.wv:
        return np.zeros(model.vector_size)
    return model.wv[word]
def encode_tokens(tokens, vector=False):
    """Encode a token sequence with the module-level ``model_embedding``.

    Returns a list of embedding indices by default, or a list of embedding
    vectors when ``vector`` is true.
    """
    if not vector:
        return [get_embedding_idx(model_embedding, tok) for tok in tokens]
    return [get_embedding_vector(model_embedding, tok) for tok in tokens]
def pad_sequences(sequences, max_len, pad_index):
    """Pad or truncate every 1-D sequence to exactly ``max_len`` elements.

    Args:
        sequences: Iterable of 1-D sequences (lists or arrays) of tokens or ids.
        max_len: Target length of every returned row.
        pad_index: Value appended to sequences shorter than ``max_len``.

    Returns:
        ``np.ndarray`` with one row of length ``max_len`` per input sequence.
    """
    padded = []
    for seq in sequences:
        row = list(seq[:max_len])
        # Pad with plain list operations: np.pad would cast the pad value to
        # the row's dtype, which silently truncates a string such as '<PAD>'
        # when the row only holds shorter tokens.
        row.extend([pad_index] * (max_len - len(row)))
        padded.append(row)
    return np.array(padded)
def split_data(X, y, test_size=0.1, val_size=0.1):
    """Split ``X``/``y`` into disjoint train, test and validation sets.

    Args:
        X: Samples.
        y: Labels aligned with ``X``.
        test_size: Fraction of all data used for the test set.
        val_size: Fraction of all data used for the validation set.

    Returns:
        Dict with keys 'train', 'test' and 'val', each mapping to
        ``{'X': ..., 'y': ...}``.
    """
    holdout_size = test_size + val_size
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_size, random_state=42)

    # Split the held-out part (not the training part) into test and val.
    # Splitting the training data here would make test and val subsets of
    # train and leak evaluation samples into training.
    val_split_ratio = val_size / holdout_size
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=val_split_ratio, random_state=42)

    return {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
        'val': {'X': X_val, 'y': y_val},
    }
def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
    """Wrap every split in a ``HumorDataset`` and save it below ``path``.

    Args:
        data_dict: Mapping of split name to ``{'X': ..., 'y': ...}``.
        path: Target directory; created when missing.
        prefix: String prepended to every file name.
        vocab_size: Forwarded to ``HumorDataset``.
        emb_dim: Forwarded to ``HumorDataset``.

    Each split is written to ``<path>/<prefix><split>.pt``.
    """
    if not os.path.exists(path):
        print('Creating directory:', path)
    # exist_ok avoids a crash if the directory appears between check and creation
    os.makedirs(path, exist_ok=True)
    print('saving data into:', path)
    for key, value in data_dict.items():
        # transform to Dataset
        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
        # os.path.join keeps the file inside `path` even when `path` has no
        # trailing separator
        torch.save(dataset, os.path.join(path, prefix + key + '.pt'))
if __name__ == "__main__":
    # Load the data from csv
    df = pd.read_csv('data/hack.csv')
    print(df.shape)

    # Rows without a humor rating cannot be labelled
    df = df.dropna(subset=['humor_rating'])

    # Binary target: a joke counts as funny when rated above the median
    median_rating = df['humor_rating'].median()
    df['y'] = df['humor_rating'] > median_rating

    # Transform data into dataset inputs
    X = df['text']
    y = df['y']

    # Tokenize the data with nltk
    tokens = [word_tokenize(text.lower()) for text in X]
    vocab_size = len({word for sentence in tokens for word in sentence})
    print('vocab size:', vocab_size)

    # Pad the token sequences
    # NOTE: Info comes from data explore notebook: 280 is max length,
    # 139 contains 80% and 192 contains 95% of the data
    max_len = 280
    padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')

    # Split data into train, test, and validation
    data_dict = split_data(padded_indices, y)

    # TODO: test GloVe embeddings
    # Embed the data with word2vec
    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)

    # Add a special token for out-of-vocabulary words
    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
    unk_index = model_embedding.wv.key_to_index['<UNK>']

    # Add a special token for padding
    model_embedding.wv.add_vector('<PAD>', np.zeros(model_embedding.vector_size))
    pad_index = model_embedding.wv.key_to_index['<PAD>']

    # Index-based encoding. Build a fresh dict per encoding: a shallow
    # data_dict.copy() shares the inner dicts, so assigning into it would
    # overwrite the token sequences needed for the vector encoding below.
    data_idx_based = {
        key: {
            'X': [encode_tokens(sequence, False) for sequence in split['X']],
            'y': split['y'],
        }
        for key, split in data_dict.items()
    }
    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)

    # Vector-based encoding, computed from the original token sequences
    data_vector_based = {
        key: {
            'X': [encode_tokens(sequence, True) for sequence in split['X']],
            'y': split['y'],
        }
        for key, split in data_dict.items()
    }
    save_data(data_vector_based, 'data/embedded_padded/', '', vocab_size,
              emb_dim=model_embedding.vector_size)

View File

@ -1,16 +0,0 @@
import torch
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if not cuda_available:
    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
else:
    # Index of the currently selected CUDA device
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")

    # Human-readable name of that device
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")

89
ml_helper.py 100644
View File

@ -0,0 +1,89 @@
import torch
import nltk
import time
import json
import os
def get_device(verbose=False):
    """
    Return the PyTorch device to run on: CUDA when available, else CPU.

    When ``verbose`` is true the chosen device is also printed.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(backend)
    if verbose:
        print('Using device:', device)
    return device
def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
    """
    Save the model weights and its hyperparameters below ``models/``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        model_prefix_name: Prefix used for both file names.
        accuracy: Accuracy of the model; rounded to 4 digits, embedded in the
            file names and stored with the hyperparameters.
        timestamp: Optional timestamp string for the file names; the current
            time (``%Y%m%d-%H%M%S``) is used when omitted.
        **kwargs: hyperparameters to save
    """
    # Create a timestamp
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")
    # Plain float keeps the value JSON-serialisable for any numeric input type
    accuracy = round(float(accuracy), 4)

    # The target directory may not exist yet on a fresh checkout
    os.makedirs('models', exist_ok=True)

    # Save the model state dictionary
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")

    # Save the hyperparameters as a JSON file
    hyperparameters = {**kwargs, 'accuracy': accuracy}
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    with open(hyperparameters_path, 'w') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")
def get_newest_model_path(path, name=None, extension=".pth"):
    """
    Return the path of the most recently modified file in ``path``.

    Only files ending in ``extension`` are considered; when ``name`` is given
    the file name must also contain it. Returns ``None`` (after printing a
    notice) when nothing matches.
    """
    candidates = [
        entry for entry in os.listdir(path)
        if entry.endswith(extension) and (not name or name in entry)
    ]
    if not candidates:
        print("No File found in the directory")
        return None

    # Newest file = largest modification time
    newest = max(candidates, key=lambda entry: os.path.getmtime(os.path.join(path, entry)))
    return os.path.join(path, newest)
def main():
    """
    Set up the environment: fetch the NLTK tokenizer data and report CUDA status.
    """
    # Tokenizer resources downloaded via nltk
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)

    # Report whether PyTorch can see a CUDA device
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    if not cuda_available:
        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
        return

    # Index and name of the currently selected CUDA device
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")


if __name__ == "__main__":
    main()

48
ml_history.py 100644
View File

@ -0,0 +1,48 @@
import numpy as np
class History:
    """
    Class to store the history of the training process.

    ``batch_history`` collects per-batch loss, training accuracy and
    validation accuracy; ``update()`` appends the mean of each collected
    list to the per-epoch ``history``.
    """

    def __init__(self):
        self.history = self._empty_record()
        self.batch_history = self._empty_record()

    @staticmethod
    def _empty_record():
        """Return a fresh record with one empty list per tracked metric."""
        return {
            'loss': [],
            'train_acc': [],
            'val_acc': [],
        }

    @staticmethod
    def _mean(values):
        """Return the mean of ``values`` as a plain float (NaN when empty)."""
        # Plain floats keep the history JSON-serialisable regardless of the
        # numeric type collected; the guard avoids numpy's "Mean of empty
        # slice" RuntimeWarning.
        if len(values) == 0:
            return float('nan')
        return float(np.mean(values))

    def update(self):
        """Append the mean of every batch metric to the epoch history."""
        for key in self.history:
            self.history[key].append(self._mean(self.batch_history[key]))

    def get_history(self):
        """Return the per-epoch history dict."""
        return self.history

    def batch_reset(self):
        """Discard all collected batch values (call at the start of an epoch)."""
        self.batch_history = self._empty_record()

    def batch_update(self, loss, train_acc, val_acc):
        """Record loss, training accuracy and validation accuracy of one batch."""
        self.batch_update_train(loss, train_acc)
        self.batch_update_val(val_acc)

    def batch_update_train(self, loss, train_acc):
        """Record loss and training accuracy of one training batch."""
        self.batch_history['loss'].append(loss)
        self.batch_history['train_acc'].append(train_acc)

    def batch_update_val(self, val_acc):
        """Record the accuracy of one validation batch."""
        self.batch_history['val_acc'].append(val_acc)

    def get_batch_history(self):
        """Return the batch history dict of the current epoch."""
        return self.batch_history

View File

@ -42,6 +42,13 @@ import time
import torchvision
torchvision.disable_beta_transforms_warning()
def get_device(verbose=False):
    """Return the CUDA device when available, else the CPU device.

    Prints the chosen device when ``verbose`` is true.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(backend)
    if verbose:
        print('Using device:', device)
    return device
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', DEVICE)
@ -69,7 +76,7 @@ def pad_sequences(sequences, MAX_LEN):
class HumorDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
self.labels = labels.reset_index(drop=True)
def __getitem__(self, idx):
item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)}

199
transformer_1b.py 100644
View File

@ -0,0 +1,199 @@
"""
This file contains the transformer model.
"""
# TODO refactor the code
# TODO create ml helper script
# TODO create ml evaluation script
# TODO track overfitting better
# TODO validate model in training (accuracy, loss, etc)
# TODO set length to a constant value which is the max length of the sentences or nearly
# TODO use GloVe embeddings
#TODO: add attention mask
# TODO: add positional encoding
#TODO: add dropout (if needed)
import time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score
import ml_helper
import ml_history
class TransformerBinaryClassifier(nn.Module):
    """Transformer-based binary classifier over sequences of token indices.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size, also used as the transformer model size.
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        hidden_dim: Size of the transformer feed-forward layers.
        dropout: Dropout probability inside the transformer.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: forward() feeds (batch, seq, embed) tensors. With
        # the default (seq, batch, embed) layout the attention would run
        # across the samples of a batch instead of across the tokens of a
        # sentence.
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        """Return one probability in [0, 1] per sample, shape (batch, 1).

        ``input_ids`` holds token indices with shape (batch, seq); any
        numeric dtype is accepted and cast to long.
        """
        input_ids = input_ids.long()
        embedded = self.embedding(input_ids)
        # The same embedded sequence is used as source and target
        transformer_output = self.transformer(embedded, embedded)
        # Mean-pool over the sequence dimension
        pooled_output = transformer_output.mean(dim=1)
        logits = self.fc(pooled_output)
        return self.sigmoid(logits)
if __name__ == "__main__":
    import os  # only the script entry point needs it (output directory creation)

    # Load the data
    # NOTE(security): the .pt files are pickled dataset objects; torch.load
    # unpickles them, so only load files from a trusted source.
    # NOTE(review): newer PyTorch releases default torch.load to
    # weights_only=True, which rejects pickled custom classes - pass
    # weights_only=False here if loading fails.
    data_path = 'data/idx_based_padded'
    train_dataset = torch.load(data_path + '/train.pt')
    test_dataset = torch.load(data_path + '/test.pt')
    val_dataset = torch.load(data_path + '/val.pt')

    # +2 for padding and unk tokens
    vocab_size = train_dataset.vocab_size + 2
    embed_dim = 100  # train_dataset.emb_dim

    # NOTE: Info comes from data explore notebook: 280 is max length,
    # 139 contains 80% and 192 contains 95% of the data
    max_len = 280

    device = ml_helper.get_device(verbose=True)

    # Model hyperparameters
    num_heads = 2
    num_layers = 2
    hidden_dim = 256

    model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)

    # Training parameters
    epochs = 3
    batch_size = 8
    learning_rate = 2e-5

    # Optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # The model already applies a sigmoid, so the loss must work on
    # probabilities; BCEWithLogitsLoss would apply a second sigmoid.
    criterion = nn.BCELoss()

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    ################################################################################################
    # Training
    ################################################################################################
    # Initialize the history
    history = ml_history.History()

    # Model to device
    model.to(device)

    print("Starting training...")
    start_training_time = time.time()

    # Training loop
    model.train()
    for epoch in range(epochs):
        # init batch tracking
        epoch_start_time = time.time()
        history.batch_reset()

        for batch in train_loader:
            optimizer.zero_grad()

            # prepare batch: labels become (batch, 1) to match the model output
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].unsqueeze(1).to(device)

            # forward pass
            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            # backward pass
            loss.backward()
            optimizer.step()

            # calculate accuracy train (probabilities rounded to 0/1)
            preds = outputs.round()
            train_acc = accuracy_score(labels.cpu().detach().numpy(),
                                       preds.cpu().detach().numpy())

            # update batch history
            history.batch_update_train(loss.item(), train_acc)

        # calculate accuracy val once per epoch, without gradients or dropout
        model.eval()
        with torch.no_grad():
            for val_batch in val_loader:
                val_input_ids = val_batch['input_ids'].to(device)
                val_labels_batch = val_batch['labels'].unsqueeze(1).to(device)
                val_outputs = model(val_input_ids)
                # accuracy_score expects (y_true, y_pred)
                val_acc = accuracy_score(val_labels_batch.cpu().numpy(),
                                         val_outputs.round().cpu().numpy())
                history.batch_update_val(val_acc)
        model.train()

        # update epoch history
        history.update()

        epoch_end_time = time.time()

        print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {history.history['loss'][-1]:.4f}, Train Acc: {history.history['train_acc'][-1]:.4f}, Val Acc: {history.history['val_acc'][-1]:.4f}")

    end_training_time = time.time()
    print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")

    ################################################################################################
    # Evaluation
    ################################################################################################
    print("Starting evaluation...")

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].unsqueeze(1).to(device)

            outputs = model(input_ids)
            preds = outputs.round()

            # Flatten (batch, 1) arrays so the metric receives 1-D label lists
            predictions.extend(preds.cpu().numpy().ravel())
            true_labels.extend(labels.cpu().numpy().ravel())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")

    ################################################################################################
    # Save model and hyperparameters
    ################################################################################################
    # Both the model files and the history file are written below models/
    os.makedirs('models', exist_ok=True)

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    ml_helper.save_model_and_hyperparameters(model, 'transformer', accuracy, timestamp,
                                             max_len=max_len,
                                             vocab_size=vocab_size,
                                             embed_dim=embed_dim,
                                             num_heads=num_heads,
                                             num_layers=num_layers,
                                             hidden_dim=hidden_dim,
                                             epochs=epochs,
                                             batch_size=batch_size,
                                             learning_rate=learning_rate)

    # save history
    history_path = f'models/transformer_history_{timestamp}.json'
    with open(history_path, 'w') as f:
        json.dump(history.get_history(), f)

File diff suppressed because one or more lines are too long