added machine learning helper

2025-01-27 07:10:07 +01:00 · 2025-01-27 07:10:07 +01:00 · aff7c6170e
parent bbbf9f8c83
commit aff7c6170e
4 changed files with 179 additions and 16 deletions
--- a/HumorDataset.py
+++ b/HumorDataset.py
@ -0,0 +1,42 @@
 """
 This file contains the HumorDataset class.
 """
 import torch
 import numpy as np
 class HumorDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs feature rows with pandas-backed labels.

    Args:
        data: Indexable container of feature rows (list, ``torch.Tensor``
            or ``np.ndarray``); ``data[idx]`` is converted to a float tensor.
        labels: pandas object (it must provide ``index`` and
            ``reset_index``). Its original index values are kept in
            ``original_indices``; the stored copy is re-indexed from 0.
        vocab_size: Stored unchanged on the instance.
        emb_dim: Stored unchanged on the instance.

    Attributes:
        shape: ``(data_item_shape, label_item_shape)`` as returned by
            ``get_shape``.
    """
    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
        super().__init__()
        # Remember the caller's index values before they are dropped below.
        self.original_indices = labels.index.to_list()
        self.data = data
        # A fresh 0..n-1 index keeps labels positionally aligned with data.
        self.labels = labels.reset_index(drop=True)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        # TODO: bug fix
        self.shape = self.get_shape()
    def __getitem__(self, idx):
        """Return ``{'input_ids': ..., 'labels': ...}`` as float tensors."""
        return {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.float),
            # .iloc is positional, so negative indices work and the lookup
            # does not depend on the values of the label index.
            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.float),
        }
    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)
    def get_single_shape(self, data):
        """
        Describe the shape of the first item of ``data``.

        Returns:
            * list: ``len(data[0])`` (an int), or ``None`` when the first
              element has no length.
            * ``torch.Tensor`` / ``np.ndarray``: ``data[0].shape``.
            * pandas object (anything exposing ``iloc``):
              ``np.shape(data.iloc[0])``.
            * ``None`` for empty containers and unsupported types.
        """
        if isinstance(data, list):
            if not data:
                return None
            try:
                return len(data[0])
            except TypeError:
                # Scalar elements have no length to report.
                return None
        if isinstance(data, (torch.Tensor, np.ndarray)):
            # 0-d and empty arrays have no first item to inspect.
            if data.ndim == 0 or len(data) == 0:
                return None
            return data[0].shape
        if hasattr(data, 'iloc'):
            if len(data) == 0:
                return None
            return np.shape(data.iloc[0])
        return None
    def get_shape(self):
        """Return ``(shape of one data item, shape of one label)``."""
        return (
            self.get_single_shape(self.data),
            self.get_single_shape(self.labels),
        )
--- a/gpu_check.py
+++ b/gpu_check.py
@ -1,16 +0,0 @@
 import torch
 # Report whether PyTorch can see a CUDA device and, if so, which one.
 cuda_available = torch.cuda.is_available()
 print(f"CUDA available: {cuda_available}")
 if not cuda_available:
    # Nothing more to query without a usable CUDA runtime.
    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
 else:
    # Index of the device PyTorch currently targets.
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    # Name reported by PyTorch for that device.
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")
--- a/ml_helper.py
+++ b/ml_helper.py
@ -0,0 +1,89 @@
 import torch
 import nltk
 import time
 import json
 import os
 def get_device(verbose=False):
    """
    Return the ``torch.device`` to run on: CUDA when available, else CPU.

    When ``verbose`` is true the chosen device is also printed.
    """
    backend = 'cpu'
    if torch.cuda.is_available():
        backend = 'cuda'
    selected = torch.device(backend)
    if verbose:
        print('Using device:', selected)
    return selected
 def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
    """
    Save the model's state dict and its hyperparameters below ``models/``.

    Two files are written, both named from the prefix, the rounded accuracy
    and the timestamp: a ``.pth`` file with ``model.state_dict()`` and a
    ``.json`` file with the hyperparameters plus the accuracy.

    Args:
        model: Object exposing ``state_dict()``.
        model_prefix_name: Leading part of both file names.
        accuracy: Rounded to 4 decimals; used in the file names and stored
            in the JSON file under ``'accuracy'``.
        timestamp: File-name suffix; defaults to the current local time
            formatted as ``%Y%m%d-%H%M%S``.
        **kwargs: Hyperparameters to save; must be JSON-serializable.
    """
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")
    accuracy = round(accuracy, 4)
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    # Create the target directory up front; without it the very first save
    # in a fresh working directory fails.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # Save the model state dictionary
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")
    # Save the hyperparameters as a JSON file (built as a new dict so the
    # keyword-argument mapping itself is left untouched).
    hyperparameters = {**kwargs, 'accuracy': accuracy}
    with open(hyperparameters_path, 'w', encoding='utf-8') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")
 def get_newest_model_path(path, name=None, extension=".pth"):
    """
    Return the path of the most recently modified matching file in ``path``.

    A directory entry matches when its name ends with ``extension`` and,
    if ``name`` is given, contains ``name``. When nothing matches, a
    message is printed and ``None`` is returned.
    """
    candidates = [
        entry for entry in os.listdir(path)
        if entry.endswith(extension) and (not name or name in entry)
    ]
    if not candidates:
        print("No File found in the directory")
        return None
    def modified_at(entry):
        # Modification time of the entry inside ``path``.
        return os.path.getmtime(os.path.join(path, entry))
    newest_entry = max(candidates, key=modified_at)
    return os.path.join(path, newest_entry)
 def main():
    """
    Set up the environment: download the NLTK resources and report CUDA status.
    """
    # Download the NLTK data packages.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)
    # Report whether PyTorch can see a CUDA device.
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if not cuda_available:
        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
        return
    # Index and name of the device PyTorch currently targets.
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")
 if __name__ == "__main__":
    main()
--- a/ml_history.py
+++ b/ml_history.py
@ -0,0 +1,48 @@
 import numpy as np
 class History:
    """
    Class to store the history of the training process.

    ``batch_history`` collects per-batch loss, training accuracy and
    validation accuracy; ``update`` appends the mean of each collected
    list to ``history``. ``update`` does not clear ``batch_history``;
    call ``batch_reset`` for that.
    """
    def __init__(self):
        self.history = self._empty_metrics()
        self.batch_history = self._empty_metrics()
    @staticmethod
    def _empty_metrics():
        # Fresh, unshared lists for every tracked metric.
        return {
            'loss': [],
            'train_acc': [],
            'val_acc': [],
        }
    @staticmethod
    def _mean_or_nan(values):
        # np.mean of an empty list also gives NaN but emits a RuntimeWarning;
        # a metric that was simply not recorded should stay quiet.
        if len(values) == 0:
            return np.float64('nan')
        return np.mean(values)
    def update(self):
        """Append the mean of every batch metric to ``history`` (NaN if none were recorded)."""
        for metric in ('loss', 'train_acc', 'val_acc'):
            self.history[metric].append(self._mean_or_nan(self.batch_history[metric]))
    def get_history(self):
        """Return the dict of aggregated metric lists."""
        return self.history
    def batch_reset(self):
        """Discard all collected per-batch values."""
        self.batch_history = self._empty_metrics()
    def batch_update(self, loss, train_acc, val_acc):
        """Record loss, training accuracy and validation accuracy for one batch."""
        self.batch_update_train(loss, train_acc)
        self.batch_update_val(val_acc)
    def batch_update_train(self, loss, train_acc):
        """Record loss and training accuracy for one batch."""
        self.batch_history['loss'].append(loss)
        self.batch_history['train_acc'].append(train_acc)
    def batch_update_val(self, val_acc):
        """Record validation accuracy for one batch."""
        self.batch_history['val_acc'].append(val_acc)
    def get_batch_history(self):
        """Return the dict of per-batch metric lists."""
        return self.batch_history