From 95216088e58f91a60772d7cbc770a48761ccc0fa Mon Sep 17 00:00:00 2001 From: arman Date: Sun, 16 Feb 2025 00:42:57 +0100 Subject: [PATCH] refactored bootstrap --- cnn_bootstrap_agg.py | 394 ++++++++++++++++------------------- transformer_bootstrap_agg.py | 340 ++++++++++++++---------------- 2 files changed, 333 insertions(+), 401 deletions(-) diff --git a/cnn_bootstrap_agg.py b/cnn_bootstrap_agg.py index 36e6599..06f02ce 100644 --- a/cnn_bootstrap_agg.py +++ b/cnn_bootstrap_agg.py @@ -1,101 +1,159 @@ -import pandas as pd -import numpy as np +import random import torch import torch.nn as nn -from torch.utils.data import DataLoader, Dataset -from sklearn.model_selection import train_test_split -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -from tqdm import tqdm -from dataset_generator import create_embedding_matrix -from EarlyStopping import EarlyStopping import torch.optim as optim -from torch.utils.data import DataLoader, Dataset, Subset # Import Subset -#from utils import tokenize_and_pad, HumorDataset, evaluate_model, bootstrap_aggregation -def train_model(model, train_dataset, val_dataset, criterion, optimizer, epochs, batch_size): - train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) +import matplotlib.pyplot as plt +from torch.utils.data import DataLoader, Subset +from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +import numpy as np - model.to(device) - history = {'train_loss': [], 'val_loss': [], 'train_r2': [], 'val_r2': []} +import Datasets +import dataset_helper +import EarlyStopping +import ml_helper +import ml_history +import ml_train +SEED = 501 +random.seed(SEED) +np.random.seed(SEED) +torch.manual_seed(SEED) +torch.cuda.manual_seed_all(SEED) +torch.backends.cudnn.deterministic = True + +class EnhancedCNNRegressor(nn.Module): + def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout): + super(EnhancedCNNRegressor, self).__init__() + self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False) + + # Convolutional layers with batch normalization + self.convs = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(1, num_filters, (fs, embedding_dim)), + nn.BatchNorm2d(num_filters), # batch normalization + nn.ReLU(), + nn.MaxPool2d((params["max_len"] - fs + 1, 1)), # relies on the global params dict defined in __main__ + nn.Dropout(dropout) # dropout after each conv block + ) + for fs in filter_sizes + ]) + + # Fully connected layers + self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # additional dense layer + self.fc2 = nn.Linear(128, 1) # output layer (regression) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding] + conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling collapses the sequence dimension + x = torch.cat(conv_outputs, 1) # concatenate features from all filter sizes + x = torch.relu(self.fc1(x)) # additional dense layer + x = self.dropout(x) + return self.fc2(x).squeeze(1) + +def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size): + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + + test_losses, train_losses = [], [] + train_r2_scores, test_r2_scores = [], [] + for epoch in range(epochs):
model.train() - total_loss = 0 - all_train_preds, all_train_targets = [], [] - - for inputs, targets in train_dataloader: - inputs, targets = inputs.to(device), targets.to(device) + running_loss = 0.0 + running_r2 = 0.0 + + # Training + for inputs, labels in train_loader: + inputs = inputs.to(device) + labels = labels.to(device) + optimizer.zero_grad() - outputs = model(inputs).squeeze() - loss = criterion(outputs, targets) + outputs = model(inputs) + loss = criterion(outputs, labels) loss.backward() optimizer.step() - total_loss += loss.item() - - all_train_preds.extend(outputs.detach().cpu().numpy()) - all_train_targets.extend(targets.detach().cpu().numpy()) - - train_r2 = r2_score(all_train_targets, all_train_preds) - train_loss = total_loss / len(train_dataloader) - history['train_loss'].append(train_loss) - history['train_r2'].append(train_r2) - + + running_loss += loss.item() + running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy()) - model.eval() - val_loss = 0 - all_val_preds, all_val_targets = [], [] + train_losses.append(running_loss / len(train_loader)) + train_r2_scores.append(running_r2 / len(train_loader)) + + # Test + model.eval() # Set model to evaluation mode + test_loss = 0.0 + test_r2 = 0.0 + with torch.no_grad(): # No gradient calculation for testing + for inputs, labels in test_loader: + inputs = inputs.to(device) + labels = labels.to(device) + + outputs = model(inputs) + loss = criterion(outputs, labels) + + test_loss += loss.item() + test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy()) + + test_losses.append(test_loss / len(test_loader)) + test_r2_scores.append(test_r2 / len(test_loader)) + + print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}') + + return train_losses, test_losses, train_r2_scores, test_r2_scores - with torch.no_grad(): - for inputs, targets in val_dataloader: - inputs, targets = inputs.to(device), targets.to(device) - outputs = model(inputs).squeeze() - loss = criterion(outputs, targets) - val_loss += loss.item() - - all_val_preds.extend(outputs.cpu().numpy()) - all_val_targets.extend(targets.cpu().numpy()) - - val_r2 = r2_score(all_val_targets, all_val_preds) - val_loss /= len(val_dataloader) - history['val_loss'].append(val_loss) - history['val_r2'].append(val_r2) - - print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train R²: {train_r2:.4f}, Val R²: {val_r2:.4f}") - - return history - - -def bootstrap_aggregation(ModelClass, train_dataset, num_models=3, epochs=5, batch_size=32, learning_rate=0.001): +# Bootstrap Aggregation (Bagging) Update +def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001): models = [] - all_histories = [] - + all_train_losses, all_test_losses = [], [] + all_train_r2_scores, all_test_r2_scores = [], [] + subset_size = len(train_dataset) // num_models for i in range(num_models): - print(f"Training Model {i+1}/{num_models}...") - + print(f"Training Model {i + 1}/{num_models}...") start_idx = i * subset_size end_idx = start_idx + subset_size subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset))) - subset = Subset(train_dataset, subset_indices) - - val_indices = list(range(start_idx, end_idx)) - val_subset = Subset(train_dataset, val_indices) - - model = ModelClass() + model = ModelClass(vocab_size, EMBEDDING_DIM, 
params["filter_sizes"], params["num_filters"], embedding_matrix, params["dropout"]) + model.to(device) criterion = nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) - history = train_model(model, subset, val_subset, criterion, optimizer, epochs, batch_size) - all_histories.append(history) + train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size) + models.append(model) + all_train_losses.append(train_losses) + all_test_losses.append(test_losses) + all_train_r2_scores.append(train_r2_scores) + all_test_r2_scores.append(test_r2_scores) - return models, all_histories + # Plot für alle Modelle + plt.figure(figsize=(12, 6)) + for i in range(num_models): + plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss') + plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle = 'dashed') + plt.title("Training and Test Loss for all Models") + plt.xlabel('Epochs') + plt.ylabel('Loss') + plt.legend() + plt.show() + plt.figure(figsize=(12, 6)) + for i in range(num_models): + plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²') + plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle = 'dashed') + plt.title("Training and Test R² for all Models") + plt.xlabel('Epochs') + plt.ylabel('R²') + plt.legend() + plt.show() + + return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores + +# Ensemble Prediction def ensemble_predict(models, test_dataset): dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False) all_predictions = [] @@ -104,160 +162,64 @@ def ensemble_predict(models, test_dataset): for inputs, _ in dataloader: inputs = inputs.to(device) predictions = torch.stack([model(inputs).squeeze() for model in models]) - avg_predictions = predictions.mean(dim=0) # Mittelwert über alle Modelle + avg_predictions = predictions.mean(dim=0) all_predictions.extend(avg_predictions.cpu().numpy()) return np.array(all_predictions) -import matplotlib.pyplot as plt +if __name__ == '__main__': + # Hyperparameter und Konfigurationen + params = { + # Config + "max_len": 280, + # Training + "epochs": 2, + "patience": 7, + "batch_size": 16, + "learning_rate": 0.001, + "weight_decay": 5e-4 , + # Model + "filter_sizes": [2, 3, 4, 5], + "num_filters": 150, + "dropout": 0.6 + } -def plot_training_histories(histories, num_models): - epochs = range(1, len(histories[0]['train_loss']) + 1) + # Configs + MODEL_NAME = 'CNN.pt' + HIST_NAME = 'CNN_history' + GLOVE_PATH = 'data/glove.6B.100d.txt' + DATA_PATH = 'data/hack.csv' + EMBEDDING_DIM = 100 + TEST_SIZE = 0.1 + VAL_SIZE = 0.1 - fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # Daten laden und vorbereiten + embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( + gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) - - for i in range(num_models): - axes[0].plot(epochs, histories[i]['train_loss'], label=f"Train Loss Model {i+1}") - axes[0].plot(epochs, histories[i]['val_loss'], linestyle='dashed', label=f"Val Loss Model {i+1}") + X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True) - axes[0].set_title("Train & Validation Loss") - axes[0].set_xlabel("Epochs") - axes[0].set_ylabel("Loss") - axes[0].legend() + # Aufteilen der Daten + data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE) - for i in range(num_models): - 
axes[1].plot(epochs, histories[i]['train_r2'], label=f"Train R² Model {i+1}") - axes[1].plot(epochs, histories[i]['val_r2'], linestyle='dashed', label=f"Val R² Model {i+1}") + # Dataset und DataLoader + train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"]) + val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"]) + test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"]) - axes[1].set_title("Train & Validation R² Score") - axes[1].set_xlabel("Epochs") - axes[1].set_ylabel("R² Score") - axes[1].legend() + # Bootstrap Aggregation (Bagging) Training + models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation( + EnhancedCNNRegressor, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"]) - plt.show() - - - -# 1. Gerät automatisch erkennen -device = torch.device('mps' if torch.backends.mps.is_available() - else 'cuda' if torch.cuda.is_available() - else 'cpu') -print(f"Using device: {device}") - -# 2. Daten laden -data = pd.read_csv('data/hack.csv') - -# 3. Filtern humorvoller Texte -humor_data = data[data['is_humor'] == 1].dropna(subset=['humor_rating']).copy() - -# 4. Einbettungsmatrix erstellen -embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix( - gloVe_path='data/glove.6B.100d.txt', emb_len=100 -) -print(f"vocab_size: {vocab_size}, d_model: {d_model}") - -# 5. Tokenisierung und Padding -def tokenize_and_pad(texts, word_index, max_len=50): - sequences = [] - for text in texts: - tokens = [word_index.get(word, 0) for word in text.split()] - if len(tokens) < max_len: - tokens += [0] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - sequences.append(tokens) - return torch.tensor(sequences, dtype=torch.long) -max_len = 50 -train_texts, test_texts, train_labels, test_labels = train_test_split( - humor_data['text'], humor_data['humor_rating'], test_size=0.2, random_state=42 -) -train_input_ids = tokenize_and_pad(train_texts, word_index, max_len=max_len) -test_input_ids = tokenize_and_pad(test_texts, word_index, max_len=max_len) - -# Labels in Tensor konvertieren -train_labels = torch.tensor(train_labels.values, dtype=torch.float) -test_labels = torch.tensor(test_labels.values, dtype=torch.float) - -# 6. Dataset und DataLoader -class HumorDataset(Dataset): - def __init__(self, input_ids, labels): - self.input_ids = input_ids - self.labels = labels - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, idx): - return self.input_ids[idx], self.labels[idx] -dataset = HumorDataset(train_input_ids, train_labels) - -# 7. 
CNN-Regression-Modell -def create_cnn(vocab_size, embed_dim, embedding_matrix): - class CNNRegressor(nn.Module): - def __init__(self, vocab_size, embed_dim, embedding_matrix): - super(CNNRegressor, self).__init__() - self.embedding = nn.Embedding(vocab_size, embed_dim) - self.embedding.weight.data.copy_(embedding_matrix.clone().detach()) - self.embedding.weight.requires_grad = False - self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3) - self.conv2 = nn.Conv1d(128, 64, kernel_size=3) - self.dropout = nn.Dropout(0.5) - self.fc = nn.Linear(64, 1) - - def forward(self, x): - x = self.embedding(x).permute(0, 2, 1) - x = torch.relu(self.conv1(x)) - x = torch.relu(self.conv2(x)) - x = self.dropout(x) - x = torch.max(x, dim=2).values - x = self.fc(x) - return torch.sigmoid(x) * 5 - - return CNNRegressor(vocab_size, embed_dim, embedding_matrix) - -# 8. Bootstrap Aggregation mit CNN -models, histories = bootstrap_aggregation( - lambda: create_cnn(vocab_size, d_model, embedding_matrix), - dataset, - num_models=5, - epochs=10, - batch_size=32, - learning_rate=0.001 -) -# **Plot Training & Validation Loss & R²** -plot_training_histories(histories, num_models=5) - - -# Vorhersagen mit Ensemble -predictions = ensemble_predict(models, HumorDataset(test_input_ids, test_labels)) -actuals = test_labels.numpy() - -# 9. Metriken berechnen -mse = mean_squared_error(actuals, predictions) -mae = mean_absolute_error(actuals, predictions) -r2 = r2_score(actuals, predictions) - -print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}") - -# 10. Visualisierung -tolerance = 0.5 # Toleranz für korrekte Vorhersagen -predictions = np.array(predictions) -actuals = np.array(actuals) - -correct = np.abs(predictions - actuals) <= tolerance -colors = np.where(correct, 'green', 'red') - -plt.figure(figsize=(8, 6)) -plt.scatter(actuals, predictions, c=colors, alpha=0.6, edgecolor='k', s=50) -plt.plot([0, 5], [0, 5], color='red', linestyle='--') - -green_patch = mpatches.Patch(color='green', label='Correct Predictions') -red_patch = mpatches.Patch(color='red', label='Incorrect Predictions') -plt.legend(handles=[green_patch, red_patch]) - -plt.xlabel("True Humor Ratings") -plt.ylabel("Predicted Humor Ratings") -plt.title("True vs Predicted Humor Ratings (Correct vs Incorrect)") -plt.show() + # Ensemble Prediction + test_predictions = ensemble_predict(models, test_dataset) + # Test Evaluation + # test_labels = np.array([y for _, y in test_dataset]) + + test_mse = mean_squared_error(test_dataset.labels.to_numpy(), test_predictions) + test_mae = mean_absolute_error(test_dataset.labels.to_numpy(), test_predictions) + test_r2 = r2_score(test_dataset.labels.to_numpy(), test_predictions) + + print(f"Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}") diff --git a/transformer_bootstrap_agg.py b/transformer_bootstrap_agg.py index 5cfb764..3cca389 100644 --- a/transformer_bootstrap_agg.py +++ b/transformer_bootstrap_agg.py @@ -1,50 +1,33 @@ -import time -import json -import math - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns - -from nltk.tokenize import word_tokenize - +import random import torch import torch.nn as nn import torch.optim as optim +import matplotlib.pyplot as plt from torch.utils.data import DataLoader, Subset -from torch.optim.lr_scheduler import ReduceLROnPlateau +from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +import numpy as np -from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score,
confusion_matrix, r2_score -from sklearn.model_selection import KFold -# local imports -import ml_evaluation as ml_eval +import Datasets +import dataset_helper +import EarlyStopping import ml_helper import ml_history -import dataset_generator as data_gen -# class imports -import HumorDataset as humor_ds -import EarlyStopping -import BalancedCELoss +import ml_train +SEED = 501 +random.seed(SEED) +np.random.seed(SEED) +torch.manual_seed(SEED) +torch.cuda.manual_seed_all(SEED) +torch.backends.cudnn.deterministic = True -torch.manual_seed(0) -np.random.seed(0) - - -best_model_filename = 'best_transformer_reg_model.pt' - -device = ml_helper.get_device(verbose=True) - -embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix() - -vocab_size = len(embedding_matrix) -d_model = len(embedding_matrix[0]) -vocab_size, d_model = embedding_matrix.size() -print(f"vocab_size: {vocab_size}, d_model: {d_model}") class PositionalEncoding(nn.Module): + """ + https://pytorch.org/tutorials/beginner/transformer_tutorial.html + """ + def __init__(self, d_model, vocab_size=5000, dropout=0.1): super().__init__() self.dropout = nn.Dropout(p=dropout) @@ -66,6 +49,10 @@ class PositionalEncoding(nn.Module): class TransformerBinaryClassifier(nn.Module): + """ + Text classifier based on a pytorch TransformerEncoder. + """ + def __init__( self, embeddings, @@ -74,8 +61,8 @@ class TransformerBinaryClassifier(nn.Module): num_layers=6, positional_dropout=0.1, classifier_dropout=0.1, - activation="relu", ): + super().__init__() vocab_size, d_model = embeddings.size() @@ -99,6 +86,7 @@ class TransformerBinaryClassifier(nn.Module): encoder_layer, num_layers=num_layers, ) + # normalize to stabilize and stop overfitting self.batch_norm = nn.BatchNorm1d(d_model) self.classifier = nn.Linear(d_model, 1) self.d_model = d_model @@ -108,114 +96,71 @@ class TransformerBinaryClassifier(nn.Module): x = self.pos_encoder(x) x = self.transformer_encoder(x) x = x.mean(dim=1) + # normalize to stabilize and stop overfitting + #x = self.batch_norm(x) + + #NOTE: no activation function for regression x = self.classifier(x) + x = x.squeeze(1) return x - -def load_preprocess_data(path_data='data/hack.csv'): - df = pd.read_csv(path_data) - df = df.dropna(subset=['humor_rating']) - - df['y'] = df['humor_rating'] - X = df['text'] - y = df['y'] - return X, y - - -X, y = load_preprocess_data() - -ret_dict = data_gen.split_data(X, y) - -params = { - 'equalize_classes_loss_factor': 0.15, - 'batch_size': 32, - 'epochs': 2, - 'lr': 1e-4, - 'clipping_max_norm': 0, - 'early_stopping_patience': 5, - 'lr_scheduler_factor': 0.5, - 'lr_scheduler_patience': 3, - 'nhead': 2, - 'num_layers': 3, - 'hidden_dim': 10, - 'positional_dropout': 0.5, - 'classifier_dropout': 0.5, - 'weight_decay': 1e-2 -} - -max_len = 280 - -train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len) -val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len) -test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len) - -train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True) -val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False) -test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False) - -early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False) - - -def train_model(model, 
train_dataset, criterion, optimizer, epochs, batch_size): - dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - model.to(device) - - # Store for plotting - train_losses, val_losses = [], [] - train_r2_scores, val_r2_scores = [], [] - +def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size): + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + + test_losses, train_losses = [], [] + train_r2_scores, test_r2_scores = [], [] + for epoch in range(epochs): model.train() - total_loss = 0 - all_preds, all_targets = [], [] - - for inputs, targets in dataloader: - inputs, targets = inputs.to(device), targets.to(device) + running_loss = 0.0 + running_r2 = 0.0 + + # Training + for inputs, labels in train_loader: + inputs = inputs.to(device) + labels = labels.to(device) + optimizer.zero_grad() - outputs = model(inputs).squeeze() - loss = criterion(outputs, targets.float()) + outputs = model(inputs) + loss = criterion(outputs, labels) loss.backward() optimizer.step() - total_loss += loss.item() + + running_loss += loss.item() + running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy()) + + train_losses.append(running_loss / len(train_loader)) + train_r2_scores.append(running_r2 / len(train_loader)) + + # Test + model.eval() # Set model to evaluation mode + test_loss = 0.0 + test_r2 = 0.0 + with torch.no_grad(): # No gradient calculation for testing + for inputs, labels in test_loader: + inputs = inputs.to(device) + labels = labels.to(device) + + outputs = model(inputs) + loss = criterion(outputs, labels) + + test_loss += loss.item() + test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy()) + + test_losses.append(test_loss / len(test_loader)) + test_r2_scores.append(test_r2 / len(test_loader)) + + print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}') + + return train_losses, test_losses, train_r2_scores, test_r2_scores - all_preds.extend(outputs.detach().cpu().numpy()) - all_targets.extend(targets.detach().cpu().numpy()) - - # Calculate R2 - r2 = r2_score(all_targets, all_preds) - train_losses.append(total_loss / len(dataloader)) - train_r2_scores.append(r2) - - # Validation phase - model.eval() - val_loss = 0 - val_preds, val_targets = [], [] - - with torch.no_grad(): - for inputs, targets in val_loader: - inputs, targets = inputs.to(device), targets.to(device) - outputs = model(inputs).squeeze() - loss = criterion(outputs, targets.float()) - val_loss += loss.item() - - val_preds.extend(outputs.cpu().numpy()) - val_targets.extend(targets.cpu().numpy()) - - # Calculate Validation R2 - val_r2 = r2_score(val_targets, val_preds) - val_losses.append(val_loss / len(val_loader)) - val_r2_scores.append(val_r2) - - print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}, R^2 (Train): {r2:.4f}, Val R^2: {val_r2:.4f}") - - return train_losses, val_losses, train_r2_scores, val_r2_scores - - -def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001): +# Bootstrap Aggregation (Bagging) Update +def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001): models = [] - all_train_losses, all_val_losses = [], [] - all_train_r2_scores, all_val_r2_scores 
= [], [] - + all_train_losses, all_test_losses = [], [] + all_train_r2_scores, all_test_r2_scores = [], [] + subset_size = len(train_dataset) // num_models for i in range(num_models): @@ -225,20 +170,41 @@ def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, ba subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset))) subset = Subset(train_dataset, subset_indices) - model = ModelClass() + model = ModelClass(embeddings=embedding_matrix, nhead=params["nhead"], dim_feedforward=params["hidden_dim"], num_layers=params["num_layers"], positional_dropout=params["dropout"], classifier_dropout=params["dropout"]) + model.to(device) criterion = nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) - train_losses, val_losses, train_r2_scores, val_r2_scores = train_model(model, subset, criterion, optimizer, epochs, batch_size) + train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size) models.append(model) all_train_losses.append(train_losses) - all_val_losses.append(val_losses) + all_test_losses.append(test_losses) all_train_r2_scores.append(train_r2_scores) - all_val_r2_scores.append(val_r2_scores) + all_test_r2_scores.append(test_r2_scores) - return models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores + # Plot results for all models + plt.figure(figsize=(12, 6)) + for i in range(num_models): + plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss') + plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle='dashed') + plt.title("Training and Test Loss for all Models") + plt.xlabel('Epochs') + plt.ylabel('Loss') + plt.legend() + plt.show() + plt.figure(figsize=(12, 6)) + for i in range(num_models): + plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²') + plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle='dashed') + plt.title("Training and Test R² for all Models") + plt.xlabel('Epochs') + plt.ylabel('R²') + plt.legend() + plt.show() + + return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores # Ensemble Prediction def ensemble_predict(models, test_dataset): @@ -254,57 +220,61 @@ def ensemble_predict(models, test_dataset): return np.array(all_predictions) +if __name__ == '__main__': + # Hyperparameters and configuration + params = { + # Config + "max_len": 280, + # Training + "epochs": 25, + "patience": 7, + "batch_size": 32, + "learning_rate": 1e-4, # 1e-4 + "weight_decay": 5e-4, + # Model + 'nhead': 2, # 5 + "dropout": 0.2, + 'hidden_dim': 2048, + 'num_layers': 6 + } + # TODO set seeds -# Bootstrap Aggregating -num_models = 2 -ensemble_models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores = bootstrap_aggregation( - lambda: TransformerBinaryClassifier( - embeddings=embedding_matrix, - nhead=params['nhead'], - num_layers=params['num_layers'], - dim_feedforward=params['hidden_dim'], - positional_dropout=params['positional_dropout'], - classifier_dropout=params['classifier_dropout'] - ).to(device), - train_dataset, - num_models=num_models, - epochs=params['epochs'], - batch_size=params['batch_size'], - learning_rate=params['lr'] -) + # Configs + MODEL_NAME = 'transformer.pt' + HIST_NAME = 'transformer_history' + GLOVE_PATH = 'data/glove.6B.100d.txt' + DATA_PATH = 'data/hack.csv' + EMBEDDING_DIM = 100 + TEST_SIZE = 0.1 + VAL_SIZE = 0.1 -# Ensemble Prediction on Testset -ensemble_predictions = ensemble_predict(ensemble_models, test_dataset) + device = torch.device('cuda' if
torch.cuda.is_available() else 'cpu') + # Load and prepare the data + embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix( + gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM) -# Plotting -fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6)) + X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True) -# Plot Train and Validation Losses -for i in range(num_models): - ax1.plot(range(1, params['epochs'] + 1), all_train_losses[i], label=f"Train Model {i+1}") - ax1.plot(range(1, params['epochs'] + 1), all_val_losses[i], label=f"Val Model {i+1}", linestyle='dashed') + # Split the data + data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE) -ax1.set_title('Train and Validation Loss') -ax1.set_xlabel('Epochs') -ax1.set_ylabel('Loss') -ax1.legend() + # Dataset and DataLoader + train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"]) + val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"]) + test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"]) -# Plot Train and Validation R² -for i in range(num_models): - ax2.plot(range(1, params['epochs'] + 1), all_train_r2_scores[i], label=f"Train Model {i+1}") - ax2.plot(range(1, params['epochs'] + 1), all_val_r2_scores[i], label=f"Val Model {i+1}", linestyle='dashed') + # Bootstrap Aggregation (Bagging) Training + models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation( + TransformerBinaryClassifier, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"]) -ax2.set_title('Train and Validation R²') -ax2.set_xlabel('Epochs') -ax2.set_ylabel('R²') -ax2.legend() + # Ensemble Prediction + test_predictions = ensemble_predict(models, test_dataset) -plt.tight_layout() -plt.show() - -# Evaluation -mse = mean_squared_error(test_dataset.labels.to_numpy(), ensemble_predictions) -mae = mean_absolute_error(test_dataset.labels.to_numpy(), ensemble_predictions) -r2 = r2_score(test_dataset.labels.to_numpy(), ensemble_predictions) - -print(f"Ensemble MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}") + # Test Evaluation + # test_labels = np.array([y for _, y in test_dataset]) + + test_mse = mean_squared_error(test_dataset.labels.to_numpy(), test_predictions) + test_mae = mean_absolute_error(test_dataset.labels.to_numpy(), test_predictions) + test_r2 = r2_score(test_dataset.labels.to_numpy(), test_predictions) + + print(f"Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
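Note on the subsetting scheme used in both files: bootstrap_aggregation trains each model on all samples except its own contiguous chunk of size len(train_dataset) // num_models (leave-one-chunk-out rather than classic bootstrap sampling with replacement), and ensemble_predict averages the per-model outputs. The short standalone sketch below illustrates both mechanisms on toy data; it is not part of either script, and the Linear stand-in models are hypothetical placeholders for the trained regressors.

# Standalone sketch (assumption: toy tensors and Linear stand-ins, not the real GloveDataset / regressors).
import torch
from torch.utils.data import TensorDataset, Subset, DataLoader

toy_dataset = TensorDataset(torch.randn(12, 3), torch.randn(12))  # 12 samples, 3 features
num_models = 3
subset_size = len(toy_dataset) // num_models

subsets = []
for i in range(num_models):
    start_idx, end_idx = i * subset_size, (i + 1) * subset_size
    # Each model trains on everything EXCEPT its own contiguous chunk (leave-one-chunk-out).
    subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(toy_dataset)))
    subsets.append(Subset(toy_dataset, subset_indices))

# Ensemble prediction: stack the per-model outputs and average them per sample.
models = [torch.nn.Linear(3, 1) for _ in range(num_models)]  # stand-ins for trained regressors
loader = DataLoader(toy_dataset, batch_size=4, shuffle=False)
with torch.no_grad():
    for inputs, _ in loader:
        preds = torch.stack([m(inputs).squeeze(1) for m in models])  # shape [num_models, batch]
        print(preds.mean(dim=0))  # bagged prediction for each sample in the batch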