"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np


class HumorDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing input samples with float labels.

    Args:
        data: Indexable container of samples (list, ``torch.Tensor`` or
            ``numpy.ndarray``); ``data[i]`` must be convertible to a tensor.
        labels: pandas Series of labels. Its original index is kept in
            ``original_indices`` and the stored copy is re-indexed 0..n-1.
        vocab_size: Stored as-is for consumers of the dataset.
        emb_dim: Stored as-is for consumers of the dataset.

    Attributes:
        shape: ``(sample_shape, label_shape)`` as computed by ``get_shape``.
    """

    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
        super().__init__()
        # Remember where each row came from before the index is discarded.
        self.original_indices = labels.index.to_list()

        self.data = data
        self.labels = labels.reset_index(drop=True)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim

        self.shape = self.get_shape()

    @staticmethod
    def _to_float_tensor(value):
        """Return an independent float32 tensor copy of *value*."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(tensor) warns about copy construction; this is the
            # recommended equivalent and still yields a detached copy.
            return value.clone().detach().to(torch.float)
        return torch.tensor(value, dtype=torch.float)

    def __getitem__(self, idx):
        """Return ``{'input_ids': ..., 'labels': ...}`` for position *idx*.

        Raises:
            IndexError: If *idx* is out of range.
        """
        # .iloc is positional: negative indices work and an out-of-range index
        # raises IndexError (label-based lookup raised KeyError instead).
        return {
            'input_ids': self._to_float_tensor(self.data[idx]),
            'labels': self._to_float_tensor(self.labels.iloc[idx]),
        }

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def get_single_shape(self, data):
        """Return the shape of the first element of *data*.

        Returns:
            ``len(data[0])`` (an int) for a list of sized elements,
            ``data[0].shape`` for tensors and arrays, ``numpy.shape`` of the
            first element for pandas-like containers and lists of scalars,
            or ``None`` when *data* is empty, unsized or of an unknown type.
        """
        try:
            if len(data) == 0:
                return None
        except TypeError:
            # None, 0-d tensors and other unsized objects have no first element.
            return None

        if isinstance(data, list):
            first = data[0]
            try:
                return len(first)
            except TypeError:
                # Scalars have no length; report them like numpy does.
                return np.shape(first)
        if isinstance(data, (torch.Tensor, np.ndarray)):
            return data[0].shape
        if hasattr(data, 'iloc'):
            # pandas Series/DataFrame: use positional access, not labels.
            return np.shape(data.iloc[0])
        return None

    def get_shape(self):
        """Return ``(sample_shape, label_shape)`` for the stored data."""
        return (
            self.get_single_shape(self.data),
            self.get_single_shape(self.labels),
        )
# ---------------------------------------------------------------------------
# ml_helper.py
# ---------------------------------------------------------------------------
import json
import os
import time

import numpy as np
import torch


def get_device(verbose=False):
    """
    Get the current device (CPU or GPU) for PyTorch.

    Returns ``torch.device('cuda')`` when CUDA is available, otherwise
    ``torch.device('cpu')``. With ``verbose=True`` the choice is printed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if verbose:
        print('Using device:', device)
    return device


def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
    """
    Save the model and hyperparameters to disk.

    Writes ``models/<prefix>_acc_<accuracy>_<timestamp>.pth`` (the model's
    ``state_dict``) and ``models/<prefix>_para_acc_<accuracy>_<timestamp>.json``
    (the hyperparameters plus the rounded accuracy), relative to the current
    working directory.

    model: object exposing ``state_dict()``
    model_prefix_name: prefix used for both file names
    accuracy: score to embed in the file names; rounded to 4 decimals
    timestamp: optional string; defaults to the current local time
    **kwargs: hyperparameters to save (must be JSON-serializable)
    """
    # Create a timestamp
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")

    # Tensors and numpy float32 scalars cannot be rounded/serialized reliably,
    # so coerce anything that is not already a plain number.
    if not isinstance(accuracy, (int, float)):
        accuracy = float(accuracy)
    accuracy = round(accuracy, 4)

    # Save the model state dictionary
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")

    # Save the hyperparameters as a JSON file
    hyperparameters = {**kwargs, 'accuracy': accuracy}
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    with open(hyperparameters_path, 'w') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")


def get_newest_model_path(path, name=None, extension=".pth"):
    """
    Get the newest file in a directory.

    path: directory to search
    name: optional substring the file name must contain
    extension: required file-name suffix

    Returns the joined path of the most recently modified match, or ``None``
    (after printing a message) when nothing matches or the directory does
    not exist.
    """
    try:
        entries = os.listdir(path)
    except FileNotFoundError:
        # A missing directory is reported the same way as an empty one.
        entries = []

    candidates = [
        f for f in entries
        if f.endswith(extension) and (not name or name in f)
    ]
    if not candidates:
        print("No File found in the directory")
        return None

    # Only the newest entry is needed, so take the maximum instead of sorting.
    newest = max(candidates, key=lambda f: os.path.getmtime(os.path.join(path, f)))
    return os.path.join(path, newest)


def main():
    """
    Main function used to set up the environment.

    Downloads the nltk tokenizer data and prints the CUDA status.
    """
    # Imported here so the other helpers stay usable without nltk installed.
    import nltk

    # download nltk data
    nltk.download('punkt')
    nltk.download('punkt_tab')

    # Check if CUDA is available
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    if cuda_available:
        # Print the current CUDA device
        current_device = torch.cuda.current_device()
        print(f"Current CUDA device: {current_device}")

        # Print the name of the current CUDA device
        device_name = torch.cuda.get_device_name(current_device)
        print(f"CUDA device name: {device_name}")
    else:
        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")


# ---------------------------------------------------------------------------
# ml_history.py
# ---------------------------------------------------------------------------
class History:
    """
    Class to store the history of the training process.
    Used to store the loss and accuracy of the training and validation sets.

    ``batch_history`` collects per-batch values; ``update`` appends their
    means to ``history``. ``update`` does not clear the batch values, so call
    ``batch_reset`` before starting the next accumulation round.
    """
    _KEYS = ('loss', 'train_acc', 'val_acc')

    def __init__(self):
        self.history = {key: [] for key in self._KEYS}
        self.batch_reset()

    @staticmethod
    def _mean(values):
        """Return the mean of *values* as a float, or NaN if there are none."""
        # np.mean([]) also yields NaN but emits a RuntimeWarning; a metric with
        # no batch values (e.g. no validation yet) is a normal situation.
        if len(values) == 0:
            return float('nan')
        return float(np.mean(values))

    def update(self):
        """Append the mean of every batch metric to the history."""
        for key in self._KEYS:
            self.history[key].append(self._mean(self.batch_history[key]))

    def get_history(self):
        """Return the dict of aggregated metric lists."""
        return self.history

    def batch_reset(self):
        """Discard all collected per-batch values."""
        self.batch_history = {key: [] for key in self._KEYS}

    def batch_update(self, loss, train_acc, val_acc):
        """Record loss, training accuracy and validation accuracy of a batch."""
        self.batch_update_train(loss, train_acc)
        self.batch_update_val(val_acc)

    def batch_update_train(self, loss, train_acc):
        """Record loss and training accuracy of a batch."""
        self.batch_history['loss'].append(loss)
        self.batch_history['train_acc'].append(train_acc)

    def batch_update_val(self, val_acc):
        """Record the validation accuracy of a batch."""
        self.batch_history['val_acc'].append(val_acc)

    def get_batch_history(self):
        """Return the dict of per-batch metric lists."""
        return self.batch_history


if __name__ == "__main__":
    main()