added bootstrap avg / ensemble preds

main
Felix Jan Michael Mucha 2025-02-16 03:56:50 +01:00
parent 603eab83b4
commit 4469f55889
6 changed files with 299 additions and 124 deletions

64
BERT.py
View File

@ -3,10 +3,12 @@ import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import BertForSequenceClassification, AutoTokenizer from transformers import BertForSequenceClassification, AutoTokenizer
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -53,20 +55,16 @@ if __name__ == '__main__':
# Config # Config
"max_len": 128, "max_len": 128,
# Training # Training
"epochs": 10, "epochs": 1,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 0.001, "learning_rate": 1e-6,
"weight_decay": 5e-4 , "weight_decay": 5e-4 ,
# Model # Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6 "dropout": 0.6
} }
# Configs # Configs
MODEL_NAME = 'BERT.pt'
HIST_NAME = 'BERT_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
FREEZE_BERT = False FREEZE_BERT = False
@ -74,6 +72,11 @@ if __name__ == '__main__':
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -96,15 +99,28 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Modell initialisieren subset_size = len(train_dataset) // N_MODELS
model = CustomBert(dropout=params["dropout"])
device = ml_helper.get_device(verbose=True, include_mps=False) device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'BERT.pt'
hist_name = f'BERT_history'
if N_MODELS > 1:
model_name = f'BERT_{i}_ensemble.pt'
hist_name = f'BERT_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
model = CustomBert(dropout=params["dropout"])
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -120,7 +136,8 @@ if __name__ == '__main__':
break break
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True) test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True)
@ -128,10 +145,31 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS >1:
# Ensemble Prediction
ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device, is_bert=True)
ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)
# Save ensemble predictions as json
ensemble_preds_path = f'histories/ensemble_preds_BERT_{timestamp}.json'
with open(ensemble_preds_path, 'w') as f:
json.dump(ensemble_avg_preds.tolist(), f)
# Test Evaluation
test_labels = test_dataset.labels
test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
test_r2 = r2_score(test_labels, ensemble_avg_preds)
print(f"Ensemble Test RMSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

65
CNN.py
View File

@ -3,9 +3,11 @@ import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -57,7 +59,7 @@ if __name__ == '__main__':
# Config # Config
"max_len": 280, "max_len": 280,
# Training # Training
"epochs": 25, "epochs": 5,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 0.001, "learning_rate": 0.001,
@ -69,14 +71,17 @@ if __name__ == '__main__':
} }
# Configs # Configs
MODEL_NAME = 'CNN.pt'
HIST_NAME = 'CNN_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100 EMBEDDING_DIM = 100
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 1
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -95,7 +100,21 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Modell initialisieren subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'CNN.pt'
hist_name = f'CNN_history'
if N_MODELS > 1:
model_name = f'CNN_{i}_ensemble.pt'
hist_name = f'CNN_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
model = EnhancedCNNRegressor( model = EnhancedCNNRegressor(
vocab_size=vocab_size, vocab_size=vocab_size,
embedding_dim=EMBEDDING_DIM, embedding_dim=EMBEDDING_DIM,
@ -104,14 +123,11 @@ if __name__ == '__main__':
embedding_matrix=embedding_matrix, embedding_matrix=embedding_matrix,
dropout=params["dropout"] dropout=params["dropout"]
) )
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -126,11 +142,9 @@ if __name__ == '__main__':
print("Early stopping triggered.") print("Early stopping triggered.")
break break
# save training history
hist.save_history(HIST_NAME)
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device) test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
@ -138,10 +152,31 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Model: {model_name} Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS >1:
# Ensemble Prediction
ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device)
ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)
# Save ensemble predictions as json
ensemble_preds_path = f'histories/ensemble_preds_CNN_{timestamp}.json'
with open(ensemble_preds_path, 'w') as f:
json.dump(ensemble_avg_preds.tolist(), f)
# Test Evaluation
test_labels = test_dataset.labels.to_numpy()
test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
test_r2 = r2_score(test_labels, ensemble_avg_preds)
print(f"Ensemble Test RMSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

View File

@ -1,11 +1,14 @@
import math import math
import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -14,6 +17,12 @@ import ml_helper
import ml_history import ml_history
import ml_train import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class PositionalEncoding(nn.Module): class PositionalEncoding(nn.Module):
""" """
@ -102,7 +111,7 @@ if __name__ == '__main__':
# Config # Config
"max_len": 280, "max_len": 280,
# Training # Training
"epochs": 25, "epochs": 1,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 1e-4, # 1e-4 "learning_rate": 1e-4, # 1e-4
@ -113,17 +122,19 @@ if __name__ == '__main__':
'hiden_dim': 2048, 'hiden_dim': 2048,
'num_layers': 6 'num_layers': 6
} }
# TODO set seeds
# Configs # Configs
MODEL_NAME = 'transfomrer.pt'
HIST_NAME = 'transformer_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100 EMBEDDING_DIM = 100
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -142,6 +153,21 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'Transformer.pt'
hist_name = f'Transformer_history'
if N_MODELS > 1:
model_name = f'Transformer_{i}_ensemble.pt'
hist_name = f'Transformer_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
# Modell initialisieren # Modell initialisieren
model = TransformerBinaryClassifier( model = TransformerBinaryClassifier(
embeddings=embedding_matrix, embeddings=embedding_matrix,
@ -151,13 +177,11 @@ if __name__ == '__main__':
positional_dropout=params["dropout"], positional_dropout=params["dropout"],
classifier_dropout=params["dropout"], classifier_dropout=params["dropout"],
) )
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -172,14 +196,9 @@ if __name__ == '__main__':
print("Early stopping triggered.") print("Early stopping triggered.")
break break
# save training history
hist.save_history(HIST_NAME)
# save training history
hist.save_history(HIST_NAME)
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device) test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
@ -187,10 +206,30 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS >1:
# Ensemble Prediction
ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device)
ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)
# Save ensemble predictions as json
ensemble_preds_path = f'histories/ensemble_preds_Transformer_{timestamp}.json'
with open(ensemble_preds_path, 'w') as f:
json.dump(ensemble_avg_preds.tolist(), f)
# Test Evaluation
test_labels = test_dataset.labels.to_numpy()
test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
test_r2 = r2_score(test_labels, ensemble_avg_preds)
print(f"Ensemble Test RMSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

View File

@ -8,6 +8,7 @@ import torch
import regex as re import regex as re
def load_glove_embeddings(glove_file_path, emb_len=100): def load_glove_embeddings(glove_file_path, emb_len=100):
print('Loading GloVe embeddings...')
embeddings_index = {} embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f: with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
@ -100,3 +101,38 @@ def split_data(X, y, test_size=0.1, val_size=0.1):
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y'])) print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict return ret_dict
def ensemble_data_idx(labels, n_models, cur_models_idx, methods='bootstrap'):
    """Select the training-sample indices for one member of an ensemble.

    Args:
        labels: Sequence of training labels. Only its length is used, except
            for ``'flatten_normal_dist'`` which also reads the values.
        n_models: Total number of ensemble members.
        cur_models_idx: Zero-based index of the member being prepared.
        methods: Selection strategy, one of

            * ``'bootstrap'`` - drop the ``cur_models_idx``-th contiguous slice
              of ``len(labels) // n_models`` samples and keep everything else.
              NOTE: despite the name, nothing is resampled with replacement.
              With ``n_models == 1`` the dropped slice spans all samples, so
              the result is empty.
            * ``'shuffle'`` - a random permutation of all indices.
            * ``'random'`` - all indices drawn without replacement, which is
              again a permutation of all indices.
            * ``'flatten_normal_dist'`` - randomly drop up to
              ``len(labels) // n_models`` samples whose label lies within one
              standard deviation of the mean, and keep the rest.

    Returns:
        A list of ints for ``'bootstrap'``, a NumPy integer array otherwise.

    Raises:
        ValueError: If ``methods`` is not one of the supported strategies.
    """
    n_samples = len(labels)

    if methods == 'bootstrap':
        fold_size = n_samples // n_models
        # Boundaries of the slice this member does NOT train on.
        start_idx = cur_models_idx * fold_size
        end_idx = start_idx + fold_size
        return list(range(0, start_idx)) + list(range(end_idx, n_samples))

    if methods == 'shuffle':
        return np.random.permutation(n_samples)

    if methods == 'random':
        return np.random.choice(n_samples, n_samples, replace=False)

    if methods == 'flatten_normal_dist':
        # TODO: test this and plot if it works
        values = np.asarray(labels)
        fold_size = n_samples // n_models
        std_range = 1
        mean = np.mean(values)
        std = np.std(values)
        # Positions of the samples close to the mean; these are the only
        # candidates for removal.
        in_band = (values >= mean - std_range * std) & (values <= mean + std_range * std)
        candidates = np.where(in_band)[0]
        # Never ask for more samples than the band contains, otherwise
        # sampling without replacement would fail.
        n_drop = min(fold_size, len(candidates))
        dropped = np.random.choice(candidates, size=n_drop, replace=False)
        # Remove by position. Matching the surviving label *values* back
        # against `labels` would re-select dropped samples whose value also
        # occurs among the survivors.
        return np.delete(np.arange(n_samples), dropped)

    raise ValueError(f"Unknown method: {methods}")

View File

@ -99,10 +99,11 @@ class History:
return history_to_save return history_to_save
def save_history(self, hist_name): def save_history(self, hist_name, timestamp=None):
directory = "histories" directory = "histories"
if not os.path.exists(directory): if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist os.makedirs(directory) # Create the directory if it does not exist
if timestamp is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json") filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json")

View File

@ -85,3 +85,29 @@ def test_loop(model, test_loader, device, is_bert=False):
test_labels.extend(labels.cpu().detach().numpy()) test_labels.extend(labels.cpu().detach().numpy())
return test_labels, test_preds return test_labels, test_preds
def ensemble_predict(models, test_loader, device, is_bert=False):
    """Run every model of an ensemble over the test loader.

    Args:
        models: Sequence of callable models exposing ``eval()``.
        test_loader: Iterable of batches. With ``is_bert`` each batch is
            indexed by ``'input_ids'`` and ``'attention_mask'``; otherwise each
            batch unpacks into ``(inputs, labels)``.
        device: Device the batch tensors are moved to before the forward pass.
        is_bert: Selects the BERT-style batch layout and call signature.

    Returns:
        NumPy array with the per-batch predictions of all models joined along
        axis 1; axis 0 indexes the models. Presumably ``(n_models, n_samples)``
        when each model emits one value per sample - TODO confirm against the
        model output shape.

    Raises:
        ValueError: If no predictions were produced, i.e. ``models`` is empty
            or ``test_loader`` yields no batch.
    """
    for model in models:
        model.eval()

    batch_preds = []
    with torch.no_grad():
        for batch in test_loader:
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = [model(input_ids, attention_mask=attention_mask) for model in models]
            else:
                # Labels are not needed for inference, so only the inputs are
                # moved to the device.
                X_batch, _ = batch
                X_batch = X_batch.to(device)
                outputs = [model(X_batch) for model in models]
            batch_preds.append([out.float().cpu().detach().numpy() for out in outputs])

    # Cover both failure modes: an empty loader (no batches at all) and an
    # empty model list (every batch holds an empty prediction list).
    if not batch_preds or not batch_preds[0]:
        raise ValueError("No predictions were made in ensemble prediction.")

    # Stack the models on axis 0 per batch, then join the batches on axis 1.
    return np.concatenate([np.stack(preds) for preds in batch_preds], axis=1)