added machine learning helper

2025-01-27 07:10:07 +01:00 · 2025-01-27 07:10:07 +01:00 · aff7c6170e
parent bbbf9f8c83
commit aff7c6170e
4 changed files with 179 additions and 16 deletions
--- a/HumorDataset.py
+++ b/HumorDataset.py
@ -0,0 +1,42 @@
+"""
+This file contains the HumorDataset class.
+"""
+import torch
+import numpy as np
+
+class HumorDataset(torch.utils.data.Dataset):
+    """
+    Map-style dataset that pairs samples with their labels.
+
+    ``labels`` must be a pandas object: its index is remembered in
+    ``original_indices`` before the labels are re-indexed from zero.
+    ``data`` only needs to support ``data[idx]``.
+    """
+
+    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
+        """
+        Store samples and labels and pre-compute ``self.shape``.
+
+        data: indexable container of samples (list, tensor or NumPy array).
+        labels: pandas object holding one target per sample
+            (presumably a Series -- confirm against the callers).
+        vocab_size: stored unchanged on the instance.
+        emb_dim: stored unchanged on the instance.
+        """
+        # Keep the caller's index so samples can be traced back after the reset.
+        self.original_indices = labels.index.to_list()
+
+        self.data = data
+        self.labels = labels.reset_index(drop=True)
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+
+        # TODO: bug fix
+        self.shape = self.get_shape()
+
+    def __getitem__(self, idx):
+        """
+        Return the sample at position ``idx`` as a dict of float tensors
+        with the keys 'input_ids' and 'labels'.
+        """
+        return {
+            'input_ids': torch.tensor(self.data[idx], dtype=torch.float),
+            # Positional lookup: label-based ``labels[idx]`` raises KeyError for
+            # negative positions such as ``dataset[-1]``.
+            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.float),
+        }
+
+    def __len__(self):
+        """Return the number of labels."""
+        return len(self.labels)
+
+    def get_single_shape(self, data):
+        """
+        Describe the first element of ``data``.
+
+        Returns ``len(data[0])`` for a list and ``data[0].shape`` for a tensor
+        or NumPy array (subclasses included). Returns None for every other
+        type and for containers without a first element.
+        """
+        if isinstance(data, list):
+            return len(data[0]) if data else None
+        if isinstance(data, (torch.Tensor, np.ndarray)):
+            # 0-dimensional or empty arrays have no first element to inspect.
+            if data.ndim == 0 or data.shape[0] == 0:
+                return None
+            return data[0].shape
+        return None
+
+    def get_shape(self):
+        """
+        Return ``(data_shape, label_shape)`` as reported by get_single_shape.
+
+        NOTE(review): the labels are a pandas object, which get_single_shape
+        does not recognise, so ``label_shape`` is None for them.
+        """
+        shape_data = self.get_single_shape(self.data)
+        shape_labels = self.get_single_shape(self.labels)
+        return shape_data, shape_labels
+    
--- a/gpu_check.py
+++ b/gpu_check.py
@ -1,16 +0,0 @@
-import torch
-
-# Check if CUDA is available
-cuda_available = torch.cuda.is_available()
-print(f"CUDA available: {cuda_available}")
-
-if cuda_available:
-    # Print the current CUDA device
-    current_device = torch.cuda.current_device()
-    print(f"Current CUDA device: {current_device}")
-
-    # Print the name of the current CUDA device
-    device_name = torch.cuda.get_device_name(current_device)
-    print(f"CUDA device name: {device_name}")
-else:
-    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
--- a/ml_helper.py
+++ b/ml_helper.py
@ -0,0 +1,89 @@
+import torch
+import nltk
+
+import time
+import json
+import os
+
+def get_device(verbose=False):
+    """
+    Return the torch device to compute on: CUDA when available, else CPU.
+    With verbose=True the chosen device is also printed.
+    """
+    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
+    selected = torch.device(backend)
+    if verbose:
+        print('Using device:', selected)
+    return selected
+
+def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
+    """
+    Save the model weights and its hyperparameters to the 'models' directory.
+
+    model: object whose state_dict() is written with torch.save.
+    model_prefix_name: prefix of both file names.
+    accuracy: rounded to 4 digits, embedded in the file names and stored
+        under the 'accuracy' key of the JSON file.
+    timestamp: file name suffix; defaults to the current local time.
+    **kwargs: hyperparameters to save (must be JSON serialisable).
+    """
+    # Create a timestamp
+    if timestamp is None:
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+
+    accuracy = round(accuracy, 4)
+
+    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
+    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
+
+    # Neither torch.save nor open() creates missing parent directories, so a
+    # fresh checkout without 'models/' would otherwise fail on the first save.
+    os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+    # Save the model state dictionary
+    torch.save(model.state_dict(), model_path)
+    print(f"Model saved to {model_path}.")
+
+    # Save the hyperparameters as a JSON file
+    hyperparameters = kwargs
+    hyperparameters['accuracy'] = accuracy
+    with open(hyperparameters_path, 'w') as f:
+        json.dump(hyperparameters, f)
+    print(f"Hyperparameters saved to {hyperparameters_path}.")
+
+def get_newest_model_path(path, name=None, extension=".pth"):
+    """
+    Return the path of the most recently modified file in a directory.
+
+    Only entries ending with `extension` are considered; when `name` is
+    given, the entry must also contain it. Prints a notice and returns None
+    when nothing matches.
+    """
+    candidates = [
+        entry for entry in os.listdir(path)
+        if entry.endswith(extension) and (not name or name in entry)
+    ]
+
+    if not candidates:
+        print("No File found in the directory")
+        return None
+
+    # The entry with the largest modification time is the newest one.
+    newest = max(candidates, key=lambda entry: os.path.getmtime(os.path.join(path, entry)))
+    return os.path.join(path, newest)
+
+
+def main():
+    """
+    Set up the environment: fetch the NLTK tokenizer data and report
+    whether PyTorch can see a CUDA device.
+    """
+    # download nltk data
+    for resource in ('punkt', 'punkt_tab'):
+        nltk.download(resource)
+
+    cuda_available = torch.cuda.is_available()
+    print(f"CUDA available: {cuda_available}")
+
+    if not cuda_available:
+        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
+        return
+
+    # Report the index and the name of the active CUDA device.
+    current_device = torch.cuda.current_device()
+    print(f"Current CUDA device: {current_device}")
+    device_name = torch.cuda.get_device_name(current_device)
+    print(f"CUDA device name: {device_name}")
+
+
+if __name__ == "__main__":
+    main()
--- a/ml_history.py
+++ b/ml_history.py
@ -0,0 +1,48 @@
+import numpy as np
+
+class History:
+    """
+    Class to store the history of the training process.
+
+    Per-batch values are collected in `batch_history`; `update()` appends
+    their mean to `history`. Both dictionaries use the keys 'loss',
+    'train_acc' and 'val_acc'.
+    """
+
+    # Metric names shared by both dictionaries.
+    _METRICS = ('loss', 'train_acc', 'val_acc')
+
+    def __init__(self):
+        self.history = {metric: [] for metric in self._METRICS}
+        self.batch_history = {metric: [] for metric in self._METRICS}
+
+    def update(self):
+        """
+        Append the mean of the collected batch values to the history.
+
+        A metric without batch values is recorded as NaN. The batch values
+        are not cleared; call `batch_reset()` for that.
+        """
+        for metric in self._METRICS:
+            values = self.batch_history[metric]
+            # np.mean([]) also yields NaN but emits a RuntimeWarning first,
+            # e.g. when only batch_update_train was used.
+            mean = np.mean(values) if values else np.float64('nan')
+            self.history[metric].append(mean)
+
+    def get_history(self):
+        """Return the dictionary of per-update means."""
+        return self.history
+
+    def batch_reset(self):
+        """Replace the batch history with fresh, empty lists."""
+        self.batch_history = {metric: [] for metric in self._METRICS}
+
+    def batch_update(self, loss, train_acc, val_acc):
+        """Record loss, training accuracy and validation accuracy of one batch."""
+        self.batch_update_train(loss, train_acc)
+        self.batch_update_val(val_acc)
+
+    def batch_update_train(self, loss, train_acc):
+        """Record loss and training accuracy of one batch."""
+        self.batch_history['loss'].append(loss)
+        self.batch_history['train_acc'].append(train_acc)
+
+    def batch_update_val(self, val_acc):
+        """Record the validation accuracy of one batch."""
+        self.batch_history['val_acc'].append(val_acc)
+
+    def get_batch_history(self):
+        """Return the dictionary of values collected since the last reset."""
+        return self.batch_history