added grid search

main
Felix Jan Michael Mucha 2025-02-16 14:21:11 +01:00
parent e9e2bf1b8a
commit 3ad2d37ea2
2 changed files with 175 additions and 98 deletions

71
BERT.py
View File

@ -9,6 +9,7 @@ from transformers import BertForSequenceClassification, AutoTokenizer
import numpy as np import numpy as np
from datetime import datetime from datetime import datetime
import json import json
import itertools
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -52,18 +53,26 @@ class CustomBert(nn.Module):
if __name__ == '__main__': if __name__ == '__main__':
# Hyperparameter und Konfigurationen # Hyperparameter und Konfigurationen
params = { params = {
# Config
"max_len": 128,
# Training # Training
"epochs": 1, "epochs": [1],
"patience": 7, "patience": [7],
"batch_size": 32, "learning_rate": [1e-5, 1e-6],
"learning_rate": 1e-6, "weight_decay": [5e-4],
"weight_decay": 5e-4 ,
# Model # Model
"dropout": 0.6 "dropout": [0.6]
} }
# Generate permutations of hyperparameters
keys, values = zip(*params.items())
grid_params = [dict(zip(keys, v)) for v in itertools.product(*values)]
best_params = {}
best_params_rmse = -1
# Example usage of grid_params
# for param_set in grid_params:
# print(param_set)
print('Number of grid_params:', len(grid_params))
# Configs # Configs
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
@ -72,7 +81,11 @@ if __name__ == '__main__':
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2 MAX_LEN = 280
BATCH_SIZE = 32
N_MODELS = 1
USE_GIRD_SEARCH = True
models = [] models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
@ -91,17 +104,29 @@ if __name__ == '__main__':
print("Tokenizer Initialized") print("Tokenizer Initialized")
# Dataset und DataLoader # Dataset und DataLoader
train_dataset = Datasets.BertDataset(tokenizer, data_split['train']['X'], data_split['train']['y'], max_len=params["max_len"]) train_dataset = Datasets.BertDataset(tokenizer, data_split['train']['X'], data_split['train']['y'], max_len=MAX_LEN)
val_dataset = Datasets.BertDataset(tokenizer, data_split['val']['X'], data_split['val']['y'], max_len=params["max_len"]) val_dataset = Datasets.BertDataset(tokenizer, data_split['val']['X'], data_split['val']['y'], max_len=MAX_LEN)
test_dataset = Datasets.BertDataset(tokenizer, data_split['test']['X'], data_split['test']['y'], max_len=params["max_len"]) test_dataset = Datasets.BertDataset(tokenizer, data_split['test']['X'], data_split['test']['y'], max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
subset_size = len(train_dataset) // N_MODELS subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False) device = ml_helper.get_device(verbose=True, include_mps=False)
# assert: if N_MODELS > 1, then grid_params should be len 1
if N_MODELS > 1 and len(grid_params) > 1 or N_MODELS > 1 and USE_GIRD_SEARCH:
raise ValueError("If N_MODELS > 1, than grid_params should be len 1")
if not USE_GIRD_SEARCH:
print('Using best params')
# load best params
params_name = f'models/best_params_BERT.json'
with open(params_name, 'r') as f:
best_params = json.load(f)
grid_params = [best_params]
for i in range(N_MODELS): for i in range(N_MODELS):
model_name = f'BERT.pt' model_name = f'BERT.pt'
hist_name = f'BERT_history' hist_name = f'BERT_history'
@ -112,7 +137,12 @@ if __name__ == '__main__':
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap') subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices) train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True) train_loader = DataLoader(train_dataset_sub, batch_size=BATCH_SIZE, shuffle=True)
for para_idx, params in enumerate(grid_params):
if len(grid_params) > 1:
model_name = f'BERT_{i}_param_{para_idx}.pt'
hist_name = f'BERT_{i}_param_{para_idx}_history'
model = CustomBert(dropout=params["dropout"]) model = CustomBert(dropout=params["dropout"])
@ -153,6 +183,15 @@ if __name__ == '__main__':
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if test_rmse > best_params_rmse:
best_params_rmse = test_rmse
best_params = params
if len(grid_params) > 1:
best_params_name = f'models/best_params_BERT.json'
with open(best_params_name, 'w') as f:
json.dump(best_params, f)
if N_MODELS >1: if N_MODELS >1:
# Ensemble Prediction # Ensemble Prediction

View File

@ -9,6 +9,7 @@ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np import numpy as np
from datetime import datetime from datetime import datetime
import json import json
import itertools
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -108,21 +109,28 @@ class TransformerBinaryClassifier(nn.Module):
if __name__ == '__main__': if __name__ == '__main__':
# Hyperparameter und Konfigurationen # Hyperparameter und Konfigurationen
params = { params = {
# Config
"max_len": 280,
# Training # Training
"epochs": 1, "epochs": [1],
"patience": 7, "patience": [7],
"batch_size": 32, "learning_rate": [1e-4], # 1e-4
"learning_rate": 1e-4, # 1e-4 "weight_decay": [5e-4],
"weight_decay": 5e-4 ,
# Model # Model
'nhead': 2, # 5 'nhead': [2], # 5
"dropout": 0.2, "dropout": [0.2],
'hiden_dim': 2048, 'hiden_dim': [1024, 2048],
'num_layers': 6 'num_layers': [6]
} }
# Generate permutations of hyperparameters
keys, values = zip(*params.items())
grid_params = [dict(zip(keys, v)) for v in itertools.product(*values)]
best_params = {}
best_params_rmse = -1
# Example usage of grid_params
# for param_set in grid_params:
# print(param_set)
print('Number of grid_params:', len(grid_params))
# Configs # Configs
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
@ -130,7 +138,11 @@ if __name__ == '__main__':
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2 MAX_LEN = 280
BATCH_SIZE = 32
N_MODELS = 1
USE_GIRD_SEARCH = True
models = [] models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
@ -145,17 +157,29 @@ if __name__ == '__main__':
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE) data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Dataset und DataLoader # Dataset und DataLoader
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"]) train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=MAX_LEN)
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"]) val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=MAX_LEN)
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"]) test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
subset_size = len(train_dataset) // N_MODELS subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False) device = ml_helper.get_device(verbose=True, include_mps=False)
# assert: if N_MODELS > 1, then grid_params should be len 1
if N_MODELS > 1 and len(grid_params) > 1 or N_MODELS > 1 and USE_GIRD_SEARCH:
raise ValueError("If N_MODELS > 1, than grid_params should be len 1")
if not USE_GIRD_SEARCH:
print('Using best params')
# load best params
params_name = f'models/best_params_Transformer.json'
with open(params_name, 'r') as f:
best_params = json.load(f)
grid_params = [best_params]
for i in range(N_MODELS): for i in range(N_MODELS):
model_name = f'Transformer.pt' model_name = f'Transformer.pt'
hist_name = f'Transformer_history' hist_name = f'Transformer_history'
@ -166,7 +190,12 @@ if __name__ == '__main__':
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap') subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices) train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True) train_loader = DataLoader(train_dataset_sub, batch_size=BATCH_SIZE, shuffle=True)
for para_idx, params in enumerate(grid_params):
if len(grid_params) > 1:
model_name = f'Transformer_{i}_param_{para_idx}.pt'
hist_name = f'Transformer_{i}_param_{para_idx}_history'
# Modell initialisieren # Modell initialisieren
model = TransformerBinaryClassifier( model = TransformerBinaryClassifier(
@ -214,6 +243,15 @@ if __name__ == '__main__':
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if test_rmse > best_params_rmse:
best_params_rmse = test_rmse
best_params = params
if len(grid_params) > 1:
best_params_name = f'models/best_params_Transformer.json'
with open(best_params_name, 'w') as f:
json.dump(best_params, f)
if N_MODELS >1: if N_MODELS >1:
# Ensemble Prediction # Ensemble Prediction
ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device) ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device)