added machine learning helper

main
Felix Jan Michael Mucha 2025-01-27 07:10:07 +01:00
parent bbbf9f8c83
commit aff7c6170e
4 changed files with 179 additions and 16 deletions

42
HumorDataset.py 100644
View File

@ -0,0 +1,42 @@
"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np
class HumorDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs feature rows with float labels.

    Args:
        data: Indexable collection of feature rows. Lists, ``torch.Tensor``,
            ``np.ndarray`` and pandas-like objects (anything exposing
            ``.iloc``) get a per-sample shape; other containers are accepted
            but report ``None``.
        labels: pandas-like object; it must provide ``index.to_list()`` and
            ``reset_index(drop=True)``.
        vocab_size: Stored on the instance; not used inside this class.
        emb_dim: Stored on the instance; not used inside this class.

    Attributes:
        original_indices: Index values of ``labels`` before the reset.
        shape: Tuple ``(shape of one data sample, shape of one label)``.
    """

    def __init__(self, data, labels, vocab_size=0, emb_dim=None):
        # Remember the caller's index before it is replaced by 0..n-1.
        self.original_indices = labels.index.to_list()
        self.data = data
        self.labels = labels.reset_index(drop=True)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.shape = self.get_shape()

    def __getitem__(self, idx):
        """Return ``{'input_ids': ..., 'labels': ...}`` as float tensors."""
        # NOTE(review): ``self.data`` is indexed as given; if it is a pandas
        # object with a non-default index this is a label lookup - confirm
        # against the callers.
        item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def get_single_shape(self, data):
        """
        Return the shape of the first element of ``data``.

        Returns:
            * list input: ``len(data[0])`` (an int), or ``()`` when the
              first element is a scalar without a length;
            * tensor / ndarray input: ``data[0].shape``;
            * pandas-like input: ``np.shape`` of the first row;
            * ``None`` for empty or unsupported containers.
        """
        if isinstance(data, list):
            if not data:
                return None
            first = data[0]
            # A flat list of scalars has no per-sample length.
            return len(first) if hasattr(first, '__len__') else ()
        if isinstance(data, (torch.Tensor, np.ndarray)):
            # 0-d or empty arrays have no first element to inspect.
            if data.ndim == 0 or len(data) == 0:
                return None
            return data[0].shape
        if hasattr(data, 'iloc'):
            # pandas Series / DataFrame: use positional access so the result
            # does not depend on the index labels.
            if len(data) == 0:
                return None
            return np.shape(data.iloc[0])
        return None

    def get_shape(self):
        """Return ``(data sample shape, label shape)``."""
        shape_data = self.get_single_shape(self.data)
        shape_labels = self.get_single_shape(self.labels)
        return shape_data, shape_labels

View File

@ -1,16 +0,0 @@
import torch
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if not cuda_available:
    # Nothing more to query without a usable device
    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
else:
    # Index of the currently selected CUDA device
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    # Human-readable name of that device
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")

89
ml_helper.py 100644
View File

@ -0,0 +1,89 @@
import torch
import nltk
import time
import json
import os
def get_device(verbose=False):
    """
    Return the torch device to compute on: 'cuda' when available, else 'cpu'.
    When verbose is true the chosen device is printed as well.
    """
    backend = 'cpu'
    if torch.cuda.is_available():
        backend = 'cuda'
    device = torch.device(backend)
    if verbose:
        print('Using device:', device)
    return device
def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
    """
    Save the model weights and its hyperparameters to the 'models' directory.

    Two files are written, both named after the prefix, the rounded accuracy
    and the timestamp: a '.pth' file holding model.state_dict() and a '.json'
    file holding the hyperparameters plus the accuracy.

    model: object exposing state_dict()
    model_prefix_name: prefix used for both file names
    accuracy: numeric score; converted to float and rounded to 4 decimals
    timestamp: string used in the file names; defaults to the current local
        time formatted as YYYYmmdd-HHMMSS
    **kwargs: hyperparameters to save (must be JSON serializable)
    """
    # Create a timestamp
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")
    # Convert to a plain float first: rounding a NumPy scalar such as
    # np.float32 keeps the NumPy type, which json.dump cannot serialize.
    accuracy = round(float(accuracy), 4)

    # Make sure the target directory exists before anything is written
    os.makedirs('models', exist_ok=True)

    # Save the model state dictionary
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")

    # Save the hyperparameters as a JSON file
    hyperparameters = {**kwargs, 'accuracy': accuracy}
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    with open(hyperparameters_path, 'w') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")
def get_newest_model_path(path, name=None, extension=".pth"):
    """
    Return the path of the most recently modified file in a directory.

    Only entries whose name ends with extension are considered; when name is
    given, the entry must also contain it as a substring. Prints a message
    and returns None when nothing matches.
    """
    candidates = []
    for entry in os.listdir(path):
        # Keep entries with the requested extension
        if not entry.endswith(extension):
            continue
        # Optionally keep only entries containing the given name
        if name and name not in entry:
            continue
        candidates.append(entry)

    if not candidates:
        print("No File found in the directory")
        return None

    # Pick the entry with the latest modification time
    newest = max(candidates, key=lambda entry: os.path.getmtime(os.path.join(path, entry)))
    return os.path.join(path, newest)
def main():
    """
    Set up the environment: download the NLTK resources and report CUDA status.
    """
    # Download the required NLTK resources
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)

    # Report whether PyTorch can see a CUDA device
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if not cuda_available:
        print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")
        return

    # Index of the currently selected CUDA device
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    # Human-readable name of that device
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")


if __name__ == "__main__":
    main()

48
ml_history.py 100644
View File

@ -0,0 +1,48 @@
import numpy as np
class History:
    """
    Record of a training run.
    Keeps per-batch values for loss, training accuracy and validation accuracy,
    and the per-update means computed from them.
    """

    # Metric names tracked in both dictionaries, in insertion order
    _KEYS = ('loss', 'train_acc', 'val_acc')

    def __init__(self):
        self.history = self._empty_record()
        self.batch_history = self._empty_record()

    @classmethod
    def _empty_record(cls):
        """Build a fresh dict with one new empty list per metric."""
        return {key: [] for key in cls._KEYS}

    def update(self):
        """Append the mean of each batch metric to the history."""
        for key in self._KEYS:
            batch_mean = np.mean(self.batch_history[key])
            self.history[key].append(batch_mean)

    def get_history(self):
        """Return the history dictionary."""
        return self.history

    def batch_reset(self):
        """Replace the batch history with a new, empty one."""
        self.batch_history = self._empty_record()

    def batch_update(self, loss, train_acc, val_acc):
        """Record loss, training accuracy and validation accuracy of a batch."""
        for key, value in zip(self._KEYS, (loss, train_acc, val_acc)):
            self.batch_history[key].append(value)

    def batch_update_train(self, loss, train_acc):
        """Record only the training metrics of a batch."""
        for key, value in (('loss', loss), ('train_acc', train_acc)):
            self.batch_history[key].append(value)

    def batch_update_val(self, val_acc):
        """Record only the validation accuracy of a batch."""
        self.batch_history['val_acc'].append(val_acc)

    def get_batch_history(self):
        """Return the batch history dictionary."""
        return self.batch_history