NIls Rekus 2025-02-16 12:00:09 +01:00
commit 544f16d316
12 changed files with 1167 additions and 589 deletions

64
BERT.py
View File

@ -3,10 +3,12 @@ import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import BertForSequenceClassification, AutoTokenizer from transformers import BertForSequenceClassification, AutoTokenizer
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -53,20 +55,16 @@ if __name__ == '__main__':
# Config # Config
"max_len": 128, "max_len": 128,
# Training # Training
"epochs": 10, "epochs": 1,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 0.001, "learning_rate": 1e-6,
"weight_decay": 5e-4 , "weight_decay": 5e-4 ,
# Model # Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6 "dropout": 0.6
} }
# Configs # Configs
MODEL_NAME = 'BERT.pt'
HIST_NAME = 'BERT_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
FREEZE_BERT = False FREEZE_BERT = False
@ -74,6 +72,11 @@ if __name__ == '__main__':
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -96,15 +99,28 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Modell initialisieren subset_size = len(train_dataset) // N_MODELS
model = CustomBert(dropout=params["dropout"])
device = ml_helper.get_device(verbose=True, include_mps=False) device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'BERT.pt'
hist_name = f'BERT_history'
if N_MODELS > 1:
model_name = f'BERT_{i}_ensemble.pt'
hist_name = f'BERT_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
model = CustomBert(dropout=params["dropout"])
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -120,7 +136,8 @@ if __name__ == '__main__':
break break
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True) test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True)
@ -128,10 +145,31 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS > 1:
    # Ensemble prediction: collect the predictions of every trained model and
    # average them along axis 0.
    # NOTE(review): assumes ensemble_predict returns one row of predictions
    # per model (n_models, n_samples) -- confirm against ml_train.
    ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device, is_bert=True)
    ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)

    # Save the averaged ensemble predictions as JSON, tagged with the run timestamp.
    ensemble_preds_path = f'histories/ensemble_preds_BERT_{timestamp}.json'
    with open(ensemble_preds_path, 'w') as f:
        json.dump(ensemble_avg_preds.tolist(), f)

    # Test evaluation of the averaged predictions.
    test_labels = test_dataset.labels
    test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
    # The report is labelled RMSE, so take the square root of the MSE; this
    # keeps it comparable with the per-model "Test RMSE" printed earlier.
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
    test_r2 = r2_score(test_labels, ensemble_avg_preds)
    print(f"Ensemble Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

65
CNN.py
View File

@ -3,9 +3,11 @@ import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -57,7 +59,7 @@ if __name__ == '__main__':
# Config # Config
"max_len": 280, "max_len": 280,
# Training # Training
"epochs": 25, "epochs": 5,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 0.001, "learning_rate": 0.001,
@ -69,14 +71,17 @@ if __name__ == '__main__':
} }
# Configs # Configs
MODEL_NAME = 'CNN.pt'
HIST_NAME = 'CNN_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100 EMBEDDING_DIM = 100
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 1
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -95,7 +100,21 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Modell initialisieren subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'CNN.pt'
hist_name = f'CNN_history'
if N_MODELS > 1:
model_name = f'CNN_{i}_ensemble.pt'
hist_name = f'CNN_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
model = EnhancedCNNRegressor( model = EnhancedCNNRegressor(
vocab_size=vocab_size, vocab_size=vocab_size,
embedding_dim=EMBEDDING_DIM, embedding_dim=EMBEDDING_DIM,
@ -104,14 +123,11 @@ if __name__ == '__main__':
embedding_matrix=embedding_matrix, embedding_matrix=embedding_matrix,
dropout=params["dropout"] dropout=params["dropout"]
) )
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -126,11 +142,9 @@ if __name__ == '__main__':
print("Early stopping triggered.") print("Early stopping triggered.")
break break
# save training history
hist.save_history(HIST_NAME)
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device) test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
@ -138,10 +152,31 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Model: {model_name} Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS > 1:
    # Ensemble prediction: collect the predictions of every trained model and
    # average them along axis 0.
    # NOTE(review): assumes ensemble_predict returns one row of predictions
    # per model (n_models, n_samples) -- confirm against ml_train.
    ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device)
    ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)

    # Save the averaged ensemble predictions as JSON, tagged with the run timestamp.
    ensemble_preds_path = f'histories/ensemble_preds_CNN_{timestamp}.json'
    with open(ensemble_preds_path, 'w') as f:
        json.dump(ensemble_avg_preds.tolist(), f)

    # Test evaluation of the averaged predictions.
    test_labels = test_dataset.labels.to_numpy()
    test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
    # The report is labelled RMSE, so take the square root of the MSE; this
    # keeps it comparable with the per-model "Test RMSE" printed earlier.
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
    test_r2 = r2_score(test_labels, ensemble_avg_preds)
    print(f"Ensemble Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

View File

@ -1,11 +1,14 @@
import math import math
import random
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np import numpy as np
from datetime import datetime
import json
import Datasets import Datasets
import dataset_helper import dataset_helper
@ -14,6 +17,12 @@ import ml_helper
import ml_history import ml_history
import ml_train import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class PositionalEncoding(nn.Module): class PositionalEncoding(nn.Module):
""" """
@ -102,7 +111,7 @@ if __name__ == '__main__':
# Config # Config
"max_len": 280, "max_len": 280,
# Training # Training
"epochs": 25, "epochs": 1,
"patience": 7, "patience": 7,
"batch_size": 32, "batch_size": 32,
"learning_rate": 1e-4, # 1e-4 "learning_rate": 1e-4, # 1e-4
@ -113,17 +122,19 @@ if __name__ == '__main__':
'hiden_dim': 2048, 'hiden_dim': 2048,
'num_layers': 6 'num_layers': 6
} }
# TODO set seeds
# Configs # Configs
MODEL_NAME = 'transfomrer.pt'
HIST_NAME = 'transformer_history'
GLOVE_PATH = 'data/glove.6B.100d.txt' GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv' DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100 EMBEDDING_DIM = 100
TEST_SIZE = 0.1 TEST_SIZE = 0.1
VAL_SIZE = 0.1 VAL_SIZE = 0.1
N_MODELS = 2
models = []
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Daten laden und vorbereiten # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
@ -142,6 +153,21 @@ if __name__ == '__main__':
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False) val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
subset_size = len(train_dataset) // N_MODELS
device = ml_helper.get_device(verbose=True, include_mps=False)
for i in range(N_MODELS):
model_name = f'Transformer.pt'
hist_name = f'Transformer_history'
if N_MODELS > 1:
model_name = f'Transformer_{i}_ensemble.pt'
hist_name = f'Transformer_{i}_ensemble_history'
subset_indices = dataset_helper.ensemble_data_idx(train_dataset.labels, N_MODELS, i, methods='bootstrap')
train_dataset_sub = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_dataset_sub, batch_size=params["batch_size"], shuffle=True)
# Modell initialisieren # Modell initialisieren
model = TransformerBinaryClassifier( model = TransformerBinaryClassifier(
embeddings=embedding_matrix, embeddings=embedding_matrix,
@ -151,13 +177,11 @@ if __name__ == '__main__':
positional_dropout=params["dropout"], positional_dropout=params["dropout"],
classifier_dropout=params["dropout"], classifier_dropout=params["dropout"],
) )
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device) model = model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"]) optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME) early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=model_name)
hist = ml_history.History() hist = ml_history.History()
@ -172,14 +196,9 @@ if __name__ == '__main__':
print("Early stopping triggered.") print("Early stopping triggered.")
break break
# save training history
hist.save_history(HIST_NAME)
# save training history
hist.save_history(HIST_NAME)
# Load best model # Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME)) model.load_state_dict(torch.load('models/checkpoints/' + model_name, weights_only=False))
models.append(model)
# Test Evaluation # Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device) test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
@ -187,10 +206,30 @@ if __name__ == '__main__':
hist.add_test_results(test_labels, test_preds) hist.add_test_results(test_labels, test_preds)
# save training history # save training history
hist.save_history(HIST_NAME) hist.save_history(hist_name, timestamp)
# RMSE, MAE und R²-Score für das Test-Set # RMSE, MAE und R²-Score für das Test-Set
test_mae = mean_absolute_error(test_labels, test_preds) test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds)) test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds) test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
if N_MODELS > 1:
    # Ensemble prediction: collect the predictions of every trained model and
    # average them along axis 0.
    # NOTE(review): assumes ensemble_predict returns one row of predictions
    # per model (n_models, n_samples) -- confirm against ml_train.
    ensemble_test_preds = ml_train.ensemble_predict(models, test_loader, device)
    ensemble_avg_preds = np.mean(ensemble_test_preds, axis=0)

    # Save the averaged ensemble predictions as JSON, tagged with the run timestamp.
    ensemble_preds_path = f'histories/ensemble_preds_Transformer_{timestamp}.json'
    with open(ensemble_preds_path, 'w') as f:
        json.dump(ensemble_avg_preds.tolist(), f)

    # Test evaluation of the averaged predictions.
    test_labels = test_dataset.labels.to_numpy()
    test_mse = mean_squared_error(test_labels, ensemble_avg_preds)
    # The report is labelled RMSE, so take the square root of the MSE; this
    # keeps it comparable with the per-model "Test RMSE" printed earlier.
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(test_labels, ensemble_avg_preds)
    test_r2 = r2_score(test_labels, ensemble_avg_preds)
    print(f"Ensemble Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

View File

@ -1,101 +1,159 @@
import pandas as pd import random
import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tqdm import tqdm
from dataset_generator import create_embedding_matrix
from EarlyStopping import EarlyStopping
import torch.optim as optim import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset # Import Subset import matplotlib.pyplot as plt
#from utils import tokenize_and_pad, HumorDataset, evaluate_model, bootstrap_aggregation from torch.utils.data import DataLoader, Subset
def train_model(model, train_dataset, val_dataset, criterion, optimizer, epochs, batch_size): from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) import numpy as np
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
model.to(device) import Datasets
history = {'train_loss': [], 'val_loss': [], 'train_r2': [], 'val_r2': []} import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class EnhancedCNNRegressor(nn.Module):
    """Multi-width text CNN that regresses a single scalar per input sequence.

    Token ids are embedded with a trainable copy of ``embedding_matrix``. One
    convolution branch per entry in ``filter_sizes`` is applied, each branch
    is max-pooled over the whole sequence, and the concatenated features go
    through two fully connected layers.

    Args:
        vocab_size: Kept for interface compatibility; the embedding size is
            taken from ``embedding_matrix``.
        embedding_dim: Width of one embedding vector; used as the width of
            every convolution kernel.
        filter_sizes: Iterable of kernel heights (number of tokens per window).
        num_filters: Number of output channels of every convolution branch.
        embedding_matrix: Pretrained embedding weights passed to
            ``nn.Embedding.from_pretrained`` (fine-tuned, ``freeze=False``).
        dropout: Dropout probability used inside the branches and before the
            output layer.
    """

    def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Convolutional branches with batch normalisation.
        # Global max pooling is done with AdaptiveMaxPool2d((1, 1)) rather than
        # a kernel sized from the module-level ``params["max_len"]``: for
        # inputs of exactly that length the result is identical, but the class
        # no longer depends on a global and accepts other sequence lengths.
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filters, (fs, embedding_dim)),
                nn.BatchNorm2d(num_filters),  # batch normalisation
                nn.ReLU(),
                nn.AdaptiveMaxPool2d((1, 1)),
                nn.Dropout(dropout),  # dropout after every branch
            )
            for fs in filter_sizes
        ])

        # Fully connected head.
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)  # extra dense layer
        self.fc2 = nn.Linear(128, 1)  # output layer (regression)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one prediction per batch row for token-id tensor ``x``."""
        x = self.embedding(x).unsqueeze(1)  # [Batch, 1, Seq, Embedding]
        # Pooling leaves two singleton dims per branch; drop them explicitly so
        # a batch of size 1 keeps its batch dimension.
        conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]
        x = torch.cat(conv_outputs, 1)  # combine the features of all filter sizes
        x = torch.relu(self.fc1(x))  # additional dense layer
        x = self.dropout(x)
        return self.fc2(x).squeeze(1)
def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size):
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_losses, train_losses = [], []
train_r2_scores, test_r2_scores = [], []
for epoch in range(epochs): for epoch in range(epochs):
model.train() model.train()
total_loss = 0 running_loss = 0.0
all_train_preds, all_train_targets = [], [] running_r2 = 0.0
# Training
for inputs, labels in train_loader:
inputs = inputs.to(device)
labels = labels.to(device)
for inputs, targets in train_dataloader:
inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad() optimizer.zero_grad()
outputs = model(inputs).squeeze() outputs = model(inputs)
loss = criterion(outputs, targets) loss = criterion(outputs, labels)
loss.backward() loss.backward()
optimizer.step() optimizer.step()
total_loss += loss.item()
all_train_preds.extend(outputs.detach().cpu().numpy()) running_loss += loss.item()
all_train_targets.extend(targets.detach().cpu().numpy()) running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())
train_r2 = r2_score(all_train_targets, all_train_preds) train_losses.append(running_loss / len(train_loader))
train_loss = total_loss / len(train_dataloader) train_r2_scores.append(running_r2 / len(train_loader))
history['train_loss'].append(train_loss)
history['train_r2'].append(train_r2)
# Test
model.eval() # Set model to evaluation mode
test_loss = 0.0
test_r2 = 0.0
with torch.no_grad(): # No gradient calculation for testing
for inputs, labels in test_loader:
inputs = inputs.to(device)
labels = labels.to(device)
model.eval() outputs = model(inputs)
val_loss = 0 loss = criterion(outputs, labels)
all_val_preds, all_val_targets = [], []
with torch.no_grad(): test_loss += loss.item()
for inputs, targets in val_dataloader: test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs).squeeze()
loss = criterion(outputs, targets)
val_loss += loss.item()
all_val_preds.extend(outputs.cpu().numpy()) test_losses.append(test_loss / len(test_loader))
all_val_targets.extend(targets.cpu().numpy()) test_r2_scores.append(test_r2 / len(test_loader))
val_r2 = r2_score(all_val_targets, all_val_preds) print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}')
val_loss /= len(val_dataloader)
history['val_loss'].append(val_loss)
history['val_r2'].append(val_r2)
print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train R²: {train_r2:.4f}, Val R²: {val_r2:.4f}") return train_losses, test_losses, train_r2_scores, test_r2_scores
return history # Bootstrap Aggregation (Bagging) Update
def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
def bootstrap_aggregation(ModelClass, train_dataset, num_models=3, epochs=5, batch_size=32, learning_rate=0.001):
models = [] models = []
all_histories = [] all_train_losses, all_test_losses = [], []
all_train_r2_scores, all_test_r2_scores = [], []
subset_size = len(train_dataset) // num_models subset_size = len(train_dataset) // num_models
for i in range(num_models): for i in range(num_models):
print(f"Training Model {i+1}/{num_models}...") print(f"Training Model {i + 1}/{num_models}...")
start_idx = i * subset_size start_idx = i * subset_size
end_idx = start_idx + subset_size end_idx = start_idx + subset_size
subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset))) subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
subset = Subset(train_dataset, subset_indices) subset = Subset(train_dataset, subset_indices)
model = ModelClass(vocab_size, EMBEDDING_DIM, params["filter_sizes"], params["num_filters"], embedding_matrix, params["dropout"])
val_indices = list(range(start_idx, end_idx)) model.to(device)
val_subset = Subset(train_dataset, val_indices)
model = ModelClass()
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate) optimizer = optim.Adam(model.parameters(), lr=learning_rate)
history = train_model(model, subset, val_subset, criterion, optimizer, epochs, batch_size) train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size)
all_histories.append(history)
models.append(model) models.append(model)
all_train_losses.append(train_losses)
all_test_losses.append(test_losses)
all_train_r2_scores.append(train_r2_scores)
all_test_r2_scores.append(test_r2_scores)
return models, all_histories # Plot für alle Modelle
plt.figure(figsize=(12, 6))
for i in range(num_models):
plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss')
plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle = 'dashed')
plt.title("Training and Test Loss for all Models")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.figure(figsize=(12, 6))
for i in range(num_models):
plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²')
plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle = 'dashed')
plt.title("Training and Test R² for all Models")
plt.xlabel('Epochs')
plt.ylabel('')
plt.legend()
plt.show()
return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores
# Ensemble Prediction
def ensemble_predict(models, test_dataset): def ensemble_predict(models, test_dataset):
dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False) dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
all_predictions = [] all_predictions = []
@ -104,160 +162,64 @@ def ensemble_predict(models, test_dataset):
for inputs, _ in dataloader: for inputs, _ in dataloader:
inputs = inputs.to(device) inputs = inputs.to(device)
predictions = torch.stack([model(inputs).squeeze() for model in models]) predictions = torch.stack([model(inputs).squeeze() for model in models])
avg_predictions = predictions.mean(dim=0) # Mittelwert über alle Modelle avg_predictions = predictions.mean(dim=0)
all_predictions.extend(avg_predictions.cpu().numpy()) all_predictions.extend(avg_predictions.cpu().numpy())
return np.array(all_predictions) return np.array(all_predictions)
import matplotlib.pyplot as plt if __name__ == '__main__':
# Hyperparameter und Konfigurationen
params = {
# Config
"max_len": 280,
# Training
"epochs": 2,
"patience": 7,
"batch_size": 16,
"learning_rate": 0.001,
"weight_decay": 5e-4 ,
# Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6
}
def plot_training_histories(histories, num_models): # Configs
epochs = range(1, len(histories[0]['train_loss']) + 1) MODEL_NAME = 'CNN.pt'
HIST_NAME = 'CNN_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
fig, axes = plt.subplots(1, 2, figsize=(14, 5)) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
for i in range(num_models): # Aufteilen der Daten
axes[0].plot(epochs, histories[i]['train_loss'], label=f"Train Loss Model {i+1}") data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
axes[0].plot(epochs, histories[i]['val_loss'], linestyle='dashed', label=f"Val Loss Model {i+1}")
axes[0].set_title("Train & Validation Loss") # Dataset und DataLoader
axes[0].set_xlabel("Epochs") train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
axes[0].set_ylabel("Loss") val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
axes[0].legend() test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
for i in range(num_models): # Bootstrap Aggregation (Bagging) Training
axes[1].plot(epochs, histories[i]['train_r2'], label=f"Train R² Model {i+1}") models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation(
axes[1].plot(epochs, histories[i]['val_r2'], linestyle='dashed', label=f"Val R² Model {i+1}") EnhancedCNNRegressor, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"])
axes[1].set_title("Train & Validation R² Score") # Ensemble Prediction
axes[1].set_xlabel("Epochs") test_predictions = ensemble_predict(models, test_dataset)
axes[1].set_ylabel("R² Score")
axes[1].legend()
plt.show() # Test Evaluation
# Evaluate the averaged ensemble predictions against the test labels.
test_labels = test_dataset.labels.to_numpy()
test_mse = mean_squared_error(test_labels, test_predictions)
# The report is labelled RMSE, so take the square root of the MSE.
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(test_labels, test_predictions)
test_r2 = r2_score(test_labels, test_predictions)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
# 1. Detect the device automatically (prefers MPS, then CUDA, then CPU)
device = torch.device('mps' if torch.backends.mps.is_available()
else 'cuda' if torch.cuda.is_available()
else 'cpu')
print(f"Using device: {device}")
# 2. Load the data
data = pd.read_csv('data/hack.csv')
# 3. Keep only humorous texts that have a humor rating
humor_data = data[data['is_humor'] == 1].dropna(subset=['humor_rating']).copy()
# 4. Build the embedding matrix from the GloVe file
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path='data/glove.6B.100d.txt', emb_len=100
)
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
# 5. Tokenisierung und Padding
def tokenize_and_pad(texts, word_index, max_len=50):
    """Convert texts into a fixed-width tensor of token ids.

    Each text is split on whitespace and every word is looked up in
    ``word_index``; unknown words map to id 0. Sequences longer than
    ``max_len`` are truncated, shorter ones are right-padded with 0.

    Args:
        texts: Iterable of strings.
        word_index: Mapping from word to integer id.
        max_len: Length of every output row.

    Returns:
        ``torch.long`` tensor of shape ``(len(texts), max_len)``.
    """
    sequences = []
    for text in texts:
        # Truncate first, then pad whatever is still missing with id 0.
        tokens = [word_index.get(word, 0) for word in text.split()][:max_len]
        tokens += [0] * (max_len - len(tokens))
        sequences.append(tokens)
    if not sequences:
        # torch.tensor([]) would lose the second dimension; keep the
        # documented (n, max_len) shape for empty input too.
        return torch.empty((0, max_len), dtype=torch.long)
    return torch.tensor(sequences, dtype=torch.long)
# Fixed sequence length used for both the training and the test split
max_len = 50
# 80/20 train/test split with a fixed random_state
train_texts, test_texts, train_labels, test_labels = train_test_split(
humor_data['text'], humor_data['humor_rating'], test_size=0.2, random_state=42
)
train_input_ids = tokenize_and_pad(train_texts, word_index, max_len=max_len)
test_input_ids = tokenize_and_pad(test_texts, word_index, max_len=max_len)
# Convert the labels to tensors
train_labels = torch.tensor(train_labels.values, dtype=torch.float)
test_labels = torch.tensor(test_labels.values, dtype=torch.float)
# 6. Dataset und DataLoader
class HumorDataset(Dataset):
    """Dataset pairing pre-tokenised inputs with their labels.

    Args:
        input_ids: Indexable collection of inputs; its length defines the
            dataset length.
        labels: Indexable collection of targets, indexed in parallel with
            ``input_ids``.
    """

    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        """Return the number of samples (length of ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids[idx], labels[idx])`` pair."""
        return self.input_ids[idx], self.labels[idx]
dataset = HumorDataset(train_input_ids, train_labels)
# 7. CNN-Regression-Modell
def create_cnn(vocab_size, embed_dim, embedding_matrix):
    """Build a small 1-D CNN regressor with a frozen embedding layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_dim: Width of one embedding vector.
        embedding_matrix: Tensor of pretrained weights copied into the
            embedding layer; the layer is excluded from training.

    Returns:
        A freshly initialised ``CNNRegressor`` module whose forward pass
        returns values in ``[0, 5]`` with shape ``(batch, 1)``.
    """

    class CNNRegressor(nn.Module):
        def __init__(self, vocab_size, embed_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim)
            # Copy a detached clone so the caller's matrix is never shared or modified.
            self.embedding.weight.data.copy_(embedding_matrix.clone().detach())
            self.embedding.weight.requires_grad = False
            self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3)
            self.conv2 = nn.Conv1d(128, 64, kernel_size=3)
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(64, 1)

        def forward(self, x):
            # Conv1d expects channels first: (batch, embed_dim, seq).
            x = self.embedding(x).permute(0, 2, 1)
            x = torch.relu(self.conv1(x))
            x = torch.relu(self.conv2(x))
            x = self.dropout(x)
            # Global max pooling over the sequence dimension.
            x = torch.max(x, dim=2).values
            x = self.fc(x)
            # The sigmoid output is scaled by 5, bounding predictions to [0, 5].
            return torch.sigmoid(x) * 5

    return CNNRegressor(vocab_size, embed_dim, embedding_matrix)
# 8. Bootstrap aggregation with the CNN: the lambda is a factory so that
#    bootstrap_aggregation can build a fresh model per ensemble member.
# NOTE(review): bootstrap_aggregation, plot_training_histories and
# ensemble_predict are defined outside this excerpt - their exact contracts
# should be confirmed there.
models, histories = bootstrap_aggregation(
lambda: create_cnn(vocab_size, d_model, embedding_matrix),
dataset,
num_models=5,
epochs=10,
batch_size=32,
learning_rate=0.001
)
# **Plot Training & Validation Loss & R²**
plot_training_histories(histories, num_models=5)
# Predictions of the ensemble on the held-out test split
predictions = ensemble_predict(models, HumorDataset(test_input_ids, test_labels))
actuals = test_labels.numpy()
# 9. Compute the regression metrics
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
r2 = r2_score(actuals, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
# 10. Visualization
tolerance = 0.5 # a prediction within this distance of the true rating counts as correct
predictions = np.array(predictions)
actuals = np.array(actuals)
# Boolean mask per sample, mapped to a point colour for the scatter plot.
correct = np.abs(predictions - actuals) <= tolerance
colors = np.where(correct, 'green', 'red')
plt.figure(figsize=(8, 6))
plt.scatter(actuals, predictions, c=colors, alpha=0.6, edgecolor='k', s=50)
# Dashed identity line from (0, 0) to (5, 5): points on it are exact predictions.
plt.plot([0, 5], [0, 5], color='red', linestyle='--')
# Manual legend entries, since the scatter is drawn in a single call.
green_patch = mpatches.Patch(color='green', label='Correct Predictions')
red_patch = mpatches.Patch(color='red', label='Incorrect Predictions')
plt.legend(handles=[green_patch, red_patch])
plt.xlabel("True Humor Ratings")
plt.ylabel("Predicted Humor Ratings")
plt.title("True vs Predicted Humor Ratings (Correct vs Incorrect)")
plt.show()

View File

@ -8,6 +8,7 @@ import torch
import regex as re import regex as re
def load_glove_embeddings(glove_file_path, emb_len=100): def load_glove_embeddings(glove_file_path, emb_len=100):
print('Loading GloVe embeddings...')
embeddings_index = {} embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f: with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
@ -100,3 +101,38 @@ def split_data(X, y, test_size=0.1, val_size=0.1):
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y'])) print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict return ret_dict
def ensemble_data_idx(labels, n_models, cur_models_idx, methods='bootstrap'):
    """Select the sample indices one ensemble member is trained on.

    Args:
        labels: Sequence of target values; only its length matters except
            for ``'flatten_normal_dist'``, which also reads the values.
        n_models: Number of ensemble members.
        cur_models_idx: Zero-based index of the member being built (used by
            ``'bootstrap'`` only).
        methods: Selection strategy:

            * ``'bootstrap'`` - split the data into ``n_models`` contiguous
              folds of ``len(labels) // n_models`` samples and return every
              index except those of fold ``cur_models_idx``. Despite the
              name, nothing is sampled with replacement.
            * ``'shuffle'`` - a random permutation of all indices.
            * ``'random'`` - all indices drawn without replacement.
            * ``'flatten_normal_dist'`` - drop up to
              ``len(labels) // n_models`` randomly chosen samples whose
              label lies within one standard deviation of the mean and
              return the remaining indices in ascending order.

    Returns:
        A list of ints for ``'bootstrap'``, otherwise a NumPy index array.

    Raises:
        ValueError: If ``methods`` is not one of the strategies above.
    """
    n_samples = len(labels)

    if methods == 'bootstrap':
        fold_size = n_samples // n_models
        start_idx = cur_models_idx * fold_size
        end_idx = start_idx + fold_size
        # Everything before and after the left-out fold.
        return list(range(0, start_idx)) + list(range(end_idx, n_samples))

    if methods == 'shuffle':
        return np.random.permutation(n_samples)

    if methods == 'random':
        return np.random.choice(n_samples, n_samples, replace=False)

    if methods == 'flatten_normal_dist':
        values = np.asarray(labels)  # plain lists do not support the comparisons below
        subset_size = n_samples // n_models
        std_range = 1
        mean = np.mean(values) if n_samples else 0.0
        std = np.std(values) if n_samples else 0.0
        # Positions (not values) of the samples close to the mean.
        in_band = np.flatnonzero(
            (values >= mean - std_range * std) & (values <= mean + std_range * std)
        )
        # Never ask for more samples than there are candidates.
        n_drop = min(subset_size, in_band.size)
        if n_drop == 0:
            return np.arange(n_samples)
        dropped = np.random.choice(in_band, size=n_drop, replace=False)
        # Remove by position. Matching on label values would keep every
        # dropped sample whose value also occurs elsewhere.
        return np.delete(np.arange(n_samples), dropped)

    raise ValueError(f"Unknown method: {methods}")

View File

@ -4,6 +4,7 @@ import nltk
import time import time
import json import json
import os import os
import re
def get_device(verbose=False, include_mps=False): def get_device(verbose=False, include_mps=False):
""" """
@ -39,7 +40,7 @@ def save_model_and_hyperparams(model, model_prefix_name, rmse, hyperparameters,
json.dump(hyperparameters, f) json.dump(hyperparameters, f)
print(f"Hyperparameters saved to {hyperparameters_path}.") print(f"Hyperparameters saved to {hyperparameters_path}.")
def get_newest_file(path, name=None, extension=".pth"): def get_newest_file(path, name=None, extension=".pth", ensemble=False):
""" """
Get the newest file in a directory. Get the newest file in a directory.
""" """
@ -49,13 +50,35 @@ def get_newest_file(path, name=None, extension=".pth"):
if name: if name:
files = [f for f in files if name in f] files = [f for f in files if name in f]
if ensemble:
files = [f for f in files if "ensemble" in f]
# Sort files by modification time # Sort files by modification time
files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)), reverse=True) files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)), reverse=True)
# Get the newest file # Get the newest file
if files: if files:
if not ensemble:
newest_model_path = os.path.join(path, files[0]) newest_model_path = os.path.join(path, files[0])
return newest_model_path return newest_model_path
else:
# Extract timestamp from the newest file's filename
regex = r"(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})"
newest_stamp = None
ret_files = []
for file in files:
match = re.search(regex, file)
if match:
newest_timestamp = match.group(1)
if not newest_stamp or newest_timestamp > newest_stamp:
newest_stamp = newest_timestamp
if newest_stamp:
ret_files.append(os.path.join(path, file))
if ret_files:
return ret_files
else:
print("No File found in the directory")
return None
else: else:
print("No File found in the directory") print("No File found in the directory")
return None return None

View File

@ -99,10 +99,11 @@ class History:
return history_to_save return history_to_save
def save_history(self, hist_name): def save_history(self, hist_name, timestamp=None):
directory = "histories" directory = "histories"
if not os.path.exists(directory): if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist os.makedirs(directory) # Create the directory if it does not exist
if timestamp is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json") filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json")

View File

@ -1,6 +1,11 @@
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import matplotlib.cm as cm
import scipy.stats as stats
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LinearRegression
import os import os
import time import time
@ -11,23 +16,23 @@ def save_plot(plt, plot_name):
time_stamp = time.strftime('%Y%m%d-%H%M%S') time_stamp = time.strftime('%Y%m%d-%H%M%S')
plt.savefig(f'plots/{plot_name}_{time_stamp}.png') plt.savefig(f'plots/{plot_name}_{time_stamp}.png')
def plot_training_history(hist_data, title='Training History', save=True): def plot_training_history(hist_data, colors, title='Training History', save=True):
epochs = range(1, len(hist_data['train_loss']) + 1) epochs = range(1, len(hist_data['train_loss']) + 1)
fig, axs = plt.subplots(1, 2, figsize=(12, 5)) fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Plot accuracy # Plot accuracy
axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE') axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE', color=colors['blue'])
axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE') axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE', color=colors['green'])
axs[1].set_title('RMSE') axs[1].set_title('RMSE')
axs[1].set_xlabel('Epochs') axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('RMSE') axs[1].set_ylabel('RMSE')
axs[1].legend() axs[1].legend()
# Plot loss # Plot loss
axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss') axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss', color=colors['blue'])
axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss') axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss', color=colors['green'])
axs[0].set_title('Loss') axs[0].set_title('Loss')
axs[0].set_xlabel('Epochs') axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss') axs[0].set_ylabel('Loss')
@ -41,10 +46,10 @@ def plot_training_history(hist_data, title='Training History', save=True):
save_plot(plt, title) save_plot(plt, title)
return plt return plt
def plot_distribution(true_values, predicted_values, title='Distribution of Predicted and True Values', save=True): def plot_distribution(true_values, predicted_values, colors, title='Distribution of Predicted and True Values', save=True):
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
plt.hist(true_values, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values') plt.hist(true_values, bins=20, color=colors['green'], edgecolor='black', alpha=0.7, label='True Values')
plt.hist(predicted_values, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values') plt.hist(predicted_values, bins=20, color=colors['blue'], edgecolor='black', alpha=0.7, label='Predicted Values')
plt.title(title) plt.title(title)
plt.xlabel('Score') plt.xlabel('Score')
plt.ylabel('Frequency') plt.ylabel('Frequency')
@ -55,15 +60,15 @@ def plot_distribution(true_values, predicted_values, title='Distribution of Pred
save_plot(plt, title) save_plot(plt, title)
return plt return plt
def plot_predictions(true_values, predicted_values, title='True vs Predicted Values', threshold=0.3, save=True): def plot_predictions(true_values, predicted_values, colors, title='True vs Predicted Values', threshold=0.3, save=True):
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
# Difference between predicted and true values # Difference between predicted and true values
correct_indices = np.isclose(true_values, predicted_values, atol=threshold) correct_indices = np.isclose(true_values, predicted_values, atol=threshold)
incorrect_indices = ~correct_indices incorrect_indices = ~correct_indices
# Plot # Plot
plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices], color='green', label='Correctly predicted') plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices], color=colors['green'], alpha=0.5, label='Correctly predicted')
plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices], color='red', label='Incorrectly predicted') plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices], color=colors['red'], alpha=0.5, label='Incorrectly predicted')
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal Line') plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color=colors['blue'], linestyle='--', label='Ideal Line')
plt.xlabel('True Values') plt.xlabel('True Values')
plt.ylabel('Predicted Values') plt.ylabel('Predicted Values')
plt.title(title) plt.title(title)
@ -73,3 +78,86 @@ def plot_predictions(true_values, predicted_values, title='True vs Predicted Val
if save: if save:
save_plot(plt, title) save_plot(plt, title)
return plt return plt
def plot_residuals(labels, preds, colors, title='Residuals Plot', save=True):
    """Scatter the residuals against the true values with a linear trend.

    The figure has two panels: on the left the residuals
    (``preds - labels``) over the true values together with a fitted
    linear-regression trend line and a dashed zero line; on the right a
    horizontal histogram of the residuals sharing the y-axis.

    Args:
        labels: True values.
        preds: Predicted values, aligned with ``labels``.
        colors: Mapping that provides the ``'blue'`` and ``'red'`` colours.
        title: Title of the main panel; also passed to ``save_plot``.
        save: When true, the figure is handed to ``save_plot``.

    Returns:
        The ``matplotlib.pyplot`` module, as the other plot helpers do.
    """
    residuals = np.array(preds) - np.array(labels)

    plt.figure(figsize=(14, 6))
    layout = gridspec.GridSpec(1, 2, width_ratios=[4, 1])

    # Left panel: residuals over the true values.
    main_ax = plt.subplot(layout[0])
    main_ax.scatter(labels, residuals, label='Residuals', color=colors['blue'], alpha=0.5)

    # Linear fit of residual ~ true value; a non-flat line reveals a
    # systematic dependence of the error on the target.
    features = np.array(labels).reshape(-1, 1)
    regressor = LinearRegression()
    regressor.fit(features, residuals)
    fitted = regressor.predict(features)
    main_ax.plot(labels, fitted, color=colors['red'], label='Trend Line', linewidth=2)

    main_ax.set_xlabel('True Values')
    main_ax.set_ylabel('Residuals')
    main_ax.axhline(y=0, color='k', linestyle='--')
    main_ax.set_title(title)
    main_ax.legend()

    # Right panel: distribution of the residuals on the shared y-axis.
    side_ax = plt.subplot(layout[1], sharey=main_ax)
    side_ax.hist(residuals, bins=30, alpha=0.5, color=colors['blue'], orientation='horizontal')
    side_ax.set_xlabel('Frequency')
    side_ax.set_title('Distribution of residuals')
    side_ax.yaxis.tick_right()
    side_ax.yaxis.set_label_position("right")

    plt.tight_layout()

    if save:
        save_plot(plt, title)
    return plt
def plot_qq(labels, preds, colors, title='Q-Q Plot of Residuals', save=True):
    """Draw a normal Q-Q plot of the residuals ``preds - labels``.

    Args:
        labels: True values.
        preds: Predicted values, aligned with ``labels``.
        colors: Mapping that provides the ``'blue'`` and ``'red'`` colours.
        title: Figure title; also passed to ``save_plot``.
        save: When true, the figure is handed to ``save_plot``.

    Returns:
        The ``matplotlib.pyplot`` module, as the other plot helpers do.
    """
    residuals = np.array(preds) - np.array(labels)

    figure = plt.figure(figsize=(8, 6))
    axis = figure.add_subplot(111)
    # probplot draws onto the axis: sample quantiles and the fitted line.
    stats.probplot(residuals, dist="norm", plot=axis)

    # Recolour what probplot drew: first line holds the data points,
    # second the fit line.
    drawn = axis.get_lines()
    drawn[0].set_color(colors['blue'])
    drawn[1].set_color(colors['red'])

    plt.title(title)

    if save:
        save_plot(plt, title)
    return plt
def plot_val_preds(val_preds, val_labels, colors, title='Histogram of Validation Predictions', save=True):
    """Overlay per-epoch histograms of validation predictions on the labels.

    Args:
        val_preds: Mapping from epoch identifier to that epoch's
            predictions; iterated in insertion order.
        val_labels: True validation values, drawn once as reference.
        colors: Mapping that provides the ``'green'`` colour.
        title: Figure title; also passed to ``save_plot``.
        save: When true, the figure is handed to ``save_plot``.

    Returns:
        The ``matplotlib.pyplot`` module, as the other plot helpers do.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(val_labels, bins=20, alpha=0.5, label='True Values', color=colors['green'],)

    n_epochs = len(val_preds)
    if n_epochs:
        # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was
        # deprecated in Matplotlib 3.7 and removed in 3.9.
        cmap = plt.get_cmap('coolwarm', n_epochs)
        for position, (epoch, preds) in enumerate(val_preds.items()):
            # Colour by position rather than by the key itself: this gives
            # the same gradient (first epoch at the top of the colormap,
            # last at the bottom) for 1-based integer keys and also works
            # for 0-based or string keys.
            color = cmap(n_epochs - 1 - position)
            plt.hist(preds, bins=20, alpha=0.5, label=f'Epoch {epoch}', color=color)

    plt.xlabel('Predicted Values')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    if save:
        save_plot(plt, title)
    return plt
####################################################################################################
############### Comparison Plots ###################################################################
####################################################################################################

View File

@ -85,3 +85,29 @@ def test_loop(model, test_loader, device, is_bert=False):
test_labels.extend(labels.cpu().detach().numpy()) test_labels.extend(labels.cpu().detach().numpy())
return test_labels, test_preds return test_labels, test_preds
def ensemble_predict(models, test_loader, device, is_bert=False):
    """Run every model of an ensemble over a data loader.

    Args:
        models: Sequence of ``torch.nn.Module`` instances; all are switched
            to eval mode.
        test_loader: Iterable of batches. With ``is_bert`` each batch is a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors;
            otherwise each batch is an ``(inputs, labels)`` pair.
        device: Device the batch inputs are moved to before the forward
            pass. The models themselves are not moved here.
        is_bert: Selects the batch layout described above.

    Returns:
        A NumPy array holding the per-model predictions of all batches,
        concatenated along axis 1 (model axis first).
        NOTE(review): this is ``(n_models, n_samples)`` when each model
        returns one value per sample as a 1-D tensor - confirm against the
        models in use.

    Raises:
        ValueError: If no predictions were produced, i.e. the loader
            yielded no batch or ``models`` is empty.
    """
    for model in models:
        model.eval()

    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                predictions = [
                    model(input_ids, attention_mask=attention_mask).float().cpu().detach().numpy()
                    for model in models
                ]
            else:
                # Labels are not needed for prediction.
                X_batch, _ = batch
                X_batch = X_batch.to(device)
                predictions = [
                    model(X_batch).float().cpu().detach().numpy() for model in models
                ]
            test_preds.append(predictions)

    # Check for "no batches" first: indexing an empty list would otherwise
    # raise IndexError instead of the intended ValueError.
    if not test_preds or not test_preds[0]:
        raise ValueError("No predictions were made in ensemble prediction.")

    return np.concatenate(test_preds, axis=1)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,50 +1,33 @@
import time import random
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset from torch.utils.data import DataLoader, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix, r2_score import Datasets
from sklearn.model_selection import KFold import dataset_helper
# local imports import EarlyStopping
import ml_evaluation as ml_eval
import ml_helper import ml_helper
import ml_history import ml_history
import dataset_generator as data_gen import ml_train
# class imports
import HumorDataset as humor_ds
import EarlyStopping
import BalancedCELoss
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.manual_seed(0)
np.random.seed(0)
best_model_filename = 'best_transformer_reg_model.pt'
device = ml_helper.get_device(verbose=True)
embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()
vocab_size = len(embedding_matrix)
d_model = len(embedding_matrix[0])
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
class PositionalEncoding(nn.Module): class PositionalEncoding(nn.Module):
"""
https://pytorch.org/tutorials/beginner/transformer_tutorial.html
"""
def __init__(self, d_model, vocab_size=5000, dropout=0.1): def __init__(self, d_model, vocab_size=5000, dropout=0.1):
super().__init__() super().__init__()
self.dropout = nn.Dropout(p=dropout) self.dropout = nn.Dropout(p=dropout)
@ -66,6 +49,10 @@ class PositionalEncoding(nn.Module):
class TransformerBinaryClassifier(nn.Module): class TransformerBinaryClassifier(nn.Module):
"""
Text classifier based on a pytorch TransformerEncoder.
"""
def __init__( def __init__(
self, self,
embeddings, embeddings,
@ -74,8 +61,8 @@ class TransformerBinaryClassifier(nn.Module):
num_layers=6, num_layers=6,
positional_dropout=0.1, positional_dropout=0.1,
classifier_dropout=0.1, classifier_dropout=0.1,
activation="relu",
): ):
super().__init__() super().__init__()
vocab_size, d_model = embeddings.size() vocab_size, d_model = embeddings.size()
@ -99,6 +86,7 @@ class TransformerBinaryClassifier(nn.Module):
encoder_layer, encoder_layer,
num_layers=num_layers, num_layers=num_layers,
) )
# normalize to stabilize and stop overfitting
self.batch_norm = nn.BatchNorm1d(d_model) self.batch_norm = nn.BatchNorm1d(d_model)
self.classifier = nn.Linear(d_model, 1) self.classifier = nn.Linear(d_model, 1)
self.d_model = d_model self.d_model = d_model
@ -108,113 +96,70 @@ class TransformerBinaryClassifier(nn.Module):
x = self.pos_encoder(x) x = self.pos_encoder(x)
x = self.transformer_encoder(x) x = self.transformer_encoder(x)
x = x.mean(dim=1) x = x.mean(dim=1)
# normalize to stabilize and stop overfitting
#x = self.batch_norm(x)
#NOTE: no activation function for regression
x = self.classifier(x) x = self.classifier(x)
x = x.squeeze(1)
return x return x
def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size):
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
def load_preprocess_data(path_data='data/hack.csv'): test_losses, train_losses = [], []
df = pd.read_csv(path_data) train_r2_scores, test_r2_scores = [], []
df = df.dropna(subset=['humor_rating'])
df['y'] = df['humor_rating']
X = df['text']
y = df['y']
return X, y
X, y = load_preprocess_data()
ret_dict = data_gen.split_data(X, y)
params = {
'equalize_classes_loss_factor': 0.15,
'batch_size': 32,
'epochs': 2,
'lr': 1e-4,
'clipping_max_norm': 0,
'early_stopping_patience': 5,
'lr_scheduler_factor': 0.5,
'lr_scheduler_patience': 3,
'nhead': 2,
'num_layers': 3,
'hidden_dim': 10,
'positional_dropout': 0.5,
'classifier_dropout': 0.5,
'weight_decay': 1e-2
}
max_len = 280
train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)
val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)
test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)
early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)
def train_model(model, train_dataset, criterion, optimizer, epochs, batch_size):
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
model.to(device)
# Store for plotting
train_losses, val_losses = [], []
train_r2_scores, val_r2_scores = [], []
for epoch in range(epochs): for epoch in range(epochs):
model.train() model.train()
total_loss = 0 running_loss = 0.0
all_preds, all_targets = [], [] running_r2 = 0.0
# Training
for inputs, labels in train_loader:
inputs = inputs.to(device)
labels = labels.to(device)
for inputs, targets in dataloader:
inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad() optimizer.zero_grad()
outputs = model(inputs).squeeze() outputs = model(inputs)
loss = criterion(outputs, targets.float()) loss = criterion(outputs, labels)
loss.backward() loss.backward()
optimizer.step() optimizer.step()
total_loss += loss.item()
all_preds.extend(outputs.detach().cpu().numpy()) running_loss += loss.item()
all_targets.extend(targets.detach().cpu().numpy()) running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())
# Calculate R2 train_losses.append(running_loss / len(train_loader))
r2 = r2_score(all_targets, all_preds) train_r2_scores.append(running_r2 / len(train_loader))
train_losses.append(total_loss / len(dataloader))
train_r2_scores.append(r2)
# Validation phase # Test
model.eval() model.eval() # Set model to evaluation mode
val_loss = 0 test_loss = 0.0
val_preds, val_targets = [], [] test_r2 = 0.0
with torch.no_grad(): # No gradient calculation for testing
for inputs, labels in test_loader:
inputs = inputs.to(device)
labels = labels.to(device)
with torch.no_grad(): outputs = model(inputs)
for inputs, targets in val_loader: loss = criterion(outputs, labels)
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs).squeeze()
loss = criterion(outputs, targets.float())
val_loss += loss.item()
val_preds.extend(outputs.cpu().numpy()) test_loss += loss.item()
val_targets.extend(targets.cpu().numpy()) test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())
# Calculate Validation R2 test_losses.append(test_loss / len(test_loader))
val_r2 = r2_score(val_targets, val_preds) test_r2_scores.append(test_r2 / len(test_loader))
val_losses.append(val_loss / len(val_loader))
val_r2_scores.append(val_r2)
print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}, R^2 (Train): {r2:.4f}, Val R^2: {val_r2:.4f}") print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}')
return train_losses, val_losses, train_r2_scores, val_r2_scores return train_losses, test_losses, train_r2_scores, test_r2_scores
# Bootstrap Aggregation (Bagging) Update
def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001): def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
models = [] models = []
all_train_losses, all_val_losses = [], [] all_train_losses, all_test_losses = [], []
all_train_r2_scores, all_val_r2_scores = [], [] all_train_r2_scores, all_test_r2_scores = [], []
subset_size = len(train_dataset) // num_models subset_size = len(train_dataset) // num_models
@ -225,20 +170,41 @@ def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, ba
subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset))) subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
subset = Subset(train_dataset, subset_indices) subset = Subset(train_dataset, subset_indices)
model = ModelClass() model = ModelClass(vocab_size, EMBEDDING_DIM, params["filter_sizes"], params["num_filters"], embedding_matrix, params["dropout"])
model.to(device)
criterion = nn.MSELoss() criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate) optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train_losses, val_losses, train_r2_scores, val_r2_scores = train_model(model, subset, criterion, optimizer, epochs, batch_size) train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size)
models.append(model) models.append(model)
all_train_losses.append(train_losses) all_train_losses.append(train_losses)
all_val_losses.append(val_losses) all_test_losses.append(test_losses)
all_train_r2_scores.append(train_r2_scores) all_train_r2_scores.append(train_r2_scores)
all_val_r2_scores.append(val_r2_scores) all_test_r2_scores.append(test_r2_scores)
return models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores # Plot für alle Modelle
plt.figure(figsize=(12, 6))
for i in range(num_models):
plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss')
plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle = 'dashed')
plt.title("Training and Test Loss for all Models")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.figure(figsize=(12, 6))
for i in range(num_models):
plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²')
plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle = 'dashed')
plt.title("Training and Test R² for all Models")
plt.xlabel('Epochs')
plt.ylabel('')
plt.legend()
plt.show()
return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores
# Ensemble Prediction # Ensemble Prediction
def ensemble_predict(models, test_dataset): def ensemble_predict(models, test_dataset):
@ -254,57 +220,61 @@ def ensemble_predict(models, test_dataset):
return np.array(all_predictions) return np.array(all_predictions)
if __name__ == '__main__':
# Hyperparameter und Konfigurationen
params = {
# Config
"max_len": 280,
# Training
"epochs": 25,
"patience": 7,
"batch_size": 32,
"learning_rate": 1e-4, # 1e-4
"weight_decay": 5e-4 ,
# Model
'nhead': 2, # 5
"dropout": 0.2,
'hiden_dim': 2048,
'num_layers': 6
}
# TODO set seeds
# Bootstrap Aggregating # Configs
num_models = 2 MODEL_NAME = 'transfomrer.pt'
ensemble_models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores = bootstrap_aggregation( HIST_NAME = 'transformer_history'
lambda: TransformerBinaryClassifier( GLOVE_PATH = 'data/glove.6B.100d.txt'
embeddings=embedding_matrix, DATA_PATH = 'data/hack.csv'
nhead=params['nhead'], EMBEDDING_DIM = 100
num_layers=params['num_layers'], TEST_SIZE = 0.1
dim_feedforward=params['hidden_dim'], VAL_SIZE = 0.1
positional_dropout=params['positional_dropout'],
classifier_dropout=params['classifier_dropout']
).to(device),
train_dataset,
num_models=num_models,
epochs=params['epochs'],
batch_size=params['batch_size'],
learning_rate=params['lr']
)
# Ensemble Prediction on Testset device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ensemble_predictions = ensemble_predict(ensemble_models, test_dataset) # Daten laden und vorbereiten
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
# Plotting X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# Plot Train and Validation Losses # Aufteilen der Daten
for i in range(num_models): data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
ax1.plot(range(1, params['epochs'] + 1), all_train_losses[i], label=f"Train Model {i+1}")
ax1.plot(range(1, params['epochs'] + 1), all_val_losses[i], label=f"Val Model {i+1}", linestyle='dashed')
ax1.set_title('Train and Validation Loss') # Dataset und DataLoader
ax1.set_xlabel('Epochs') train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
ax1.set_ylabel('Loss') val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
ax1.legend() test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
# Plot Train and Validation R² # Bootstrap Aggregation (Bagging) Training
for i in range(num_models): models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation(
ax2.plot(range(1, params['epochs'] + 1), all_train_r2_scores[i], label=f"Train Model {i+1}") TransformerBinaryClassifier, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"])
ax2.plot(range(1, params['epochs'] + 1), all_val_r2_scores[i], label=f"Val Model {i+1}", linestyle='dashed')
ax2.set_title('Train and Validation R²') # Ensemble Prediction
ax2.set_xlabel('Epochs') test_predictions = ensemble_predict(models, test_dataset)
ax2.set_ylabel('')
ax2.legend()
plt.tight_layout() # Test Evaluation
plt.show() # test_labels = np.array([y for _, y in test_dataset])
# Evaluation test_mse = mean_squared_error(test_dataset.labels.to_numpy(), test_predictions)
mse = mean_squared_error(test_dataset.labels.to_numpy(), ensemble_predictions) test_mae = mean_absolute_error(test_dataset.labels.to_numpy(), test_predictions)
mae = mean_absolute_error(test_dataset.labels.to_numpy(), ensemble_predictions) test_r2 = r2_score(test_dataset.labels.to_numpy(), test_predictions)
r2 = r2_score(test_dataset.labels.to_numpy(), ensemble_predictions)
print(f"Ensemble MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}") print(f"Test RMSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")