refactored bootstrap

main
arman 2025-02-16 00:42:57 +01:00
parent 8b655b58ca
commit 95216088e5
2 changed files with 333 additions and 401 deletions

View File

@@ -1,101 +1,159 @@
import random

import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train

SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class EnhancedCNNRegressor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
        super(EnhancedCNNRegressor, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Convolutional layers with batch normalisation
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filters, (fs, embedding_dim)),
                nn.BatchNorm2d(num_filters),  # batch normalisation
                nn.ReLU(),
                nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
                nn.Dropout(dropout)  # dropout after each layer
            )
            for fs in filter_sizes
        ])

        # Fully connected layers
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)  # wider dense layer
        self.fc2 = nn.Linear(128, 1)  # output layer (regression)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # [batch, 1, seq, embedding]
        conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]  # pooling collapses the spatial dims
        x = torch.cat(conv_outputs, 1)  # concatenate features from all filters
        x = torch.relu(self.fc1(x))  # additional dense layer
        x = self.dropout(x)
        return self.fc2(x).squeeze(1)
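# Note: forward() expects a LongTensor of token ids shaped [batch, params["max_len"]]
# (the max-pool kernel assumes exactly that sequence length) and returns a 1-D tensor
# of predicted humor ratings shaped [batch].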
def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    test_losses, train_losses = [], []
    train_r2_scores, test_r2_scores = [], []

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_r2 = 0.0

        # Training
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())

        train_losses.append(running_loss / len(train_loader))
        train_r2_scores.append(running_r2 / len(train_loader))

        # Test
        model.eval()  # Set model to evaluation mode
        test_loss = 0.0
        test_r2 = 0.0
        with torch.no_grad():  # No gradient calculation for testing
            for inputs, labels in test_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())

        test_losses.append(test_loss / len(test_loader))
        test_r2_scores.append(test_r2 / len(test_loader))

        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}')

    return train_losses, test_losses, train_r2_scores, test_r2_scores
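# Note: the "bagging" below does not resample with replacement; it splits the training set
# into num_models contiguous folds and trains model i on every index except fold i, so each
# model sees (num_models - 1) / num_models of the data. ensemble_predict() then averages the
# per-model predictions.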
# Bootstrap Aggregation (Bagging) Update
def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
    models = []
    all_train_losses, all_test_losses = [], []
    all_train_r2_scores, all_test_r2_scores = [], []
    subset_size = len(train_dataset) // num_models

    for i in range(num_models):
        print(f"Training Model {i + 1}/{num_models}...")
        start_idx = i * subset_size
        end_idx = start_idx + subset_size
        subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
        subset = Subset(train_dataset, subset_indices)

        model = ModelClass(vocab_size, EMBEDDING_DIM, params["filter_sizes"], params["num_filters"], embedding_matrix, params["dropout"])
        model.to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size)

        models.append(model)
        all_train_losses.append(train_losses)
        all_test_losses.append(test_losses)
        all_train_r2_scores.append(train_r2_scores)
        all_test_r2_scores.append(test_r2_scores)

    # Plot for all models
    plt.figure(figsize=(12, 6))
    for i in range(num_models):
        plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss')
        plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle='dashed')
    plt.title("Training and Test Loss for all Models")
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    plt.figure(figsize=(12, 6))
    for i in range(num_models):
        plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²')
        plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle='dashed')
    plt.title("Training and Test R² for all Models")
    plt.xlabel('Epochs')
    plt.ylabel('R² Score')
    plt.legend()
    plt.show()

    return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores
# Ensemble Prediction
def ensemble_predict(models, test_dataset):
    dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    all_predictions = []
@@ -104,160 +162,64 @@ def ensemble_predict(models, test_dataset):
    for inputs, _ in dataloader:
        inputs = inputs.to(device)
        predictions = torch.stack([model(inputs).squeeze() for model in models])
        avg_predictions = predictions.mean(dim=0)
        all_predictions.extend(avg_predictions.cpu().numpy())
    return np.array(all_predictions)
if __name__ == '__main__':
    # Hyperparameters and configurations
    params = {
        # Config
        "max_len": 280,
        # Training
        "epochs": 2,
        "patience": 7,
        "batch_size": 16,
        "learning_rate": 0.001,
        "weight_decay": 5e-4,
        # Model
        "filter_sizes": [2, 3, 4, 5],
        "num_filters": 150,
        "dropout": 0.6
    }

    # Configs
    MODEL_NAME = 'CNN.pt'
    HIST_NAME = 'CNN_history'
    GLOVE_PATH = 'data/glove.6B.100d.txt'
    DATA_PATH = 'data/hack.csv'
    EMBEDDING_DIM = 100
    TEST_SIZE = 0.1
    VAL_SIZE = 0.1

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare the data
    embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
        gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
    X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)

    # Split the data
    data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)

    # Dataset and DataLoader
    train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
    val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
    test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])

    # Bootstrap Aggregation (Bagging) Training
    models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation(
        EnhancedCNNRegressor, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"])

    # Ensemble Prediction
    test_predictions = ensemble_predict(models, test_dataset)
    # Test Evaluation
    # test_labels = np.array([y for _, y in test_dataset])
    test_mse = mean_squared_error(test_dataset.labels.to_numpy(), test_predictions)
    test_mae = mean_absolute_error(test_dataset.labels.to_numpy(), test_predictions)
    test_r2 = r2_score(test_dataset.labels.to_numpy(), test_predictions)
    print(f"Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")

View File

@@ -1,50 +1,33 @@
import random

import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train

SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """
    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
@@ -66,6 +49,10 @@ class PositionalEncoding(nn.Module):
class TransformerBinaryClassifier(nn.Module):
    """
    Text classifier based on a pytorch TransformerEncoder.
    """
    def __init__(
        self,
        embeddings,
@@ -74,8 +61,8 @@ class TransformerBinaryClassifier(nn.Module):
        num_layers=6,
        positional_dropout=0.1,
        classifier_dropout=0.1,
    ):
        super().__init__()
        vocab_size, d_model = embeddings.size()
@@ -99,6 +86,7 @@ class TransformerBinaryClassifier(nn.Module):
            encoder_layer,
            num_layers=num_layers,
        )
        # normalize to stabilize and stop overfitting
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.classifier = nn.Linear(d_model, 1)
        self.d_model = d_model
@@ -108,114 +96,71 @@ class TransformerBinaryClassifier(nn.Module):
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.classifier(x)
        x = x.squeeze(1)
        return x
def train_model(model, train_dataset, test_dataset, criterion, optimizer, epochs, batch_size):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    test_losses, train_losses = [], []
    train_r2_scores, test_r2_scores = [], []

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_r2 = 0.0

        # Training
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())

        train_losses.append(running_loss / len(train_loader))
        train_r2_scores.append(running_r2 / len(train_loader))

        # Test
        model.eval()  # Set model to evaluation mode
        test_loss = 0.0
        test_r2 = 0.0
        with torch.no_grad():  # No gradient calculation for testing
            for inputs, labels in test_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                test_r2 += r2_score(labels.cpu().numpy(), outputs.cpu().detach().numpy())

        test_losses.append(test_loss / len(test_loader))
        test_r2_scores.append(test_r2 / len(test_loader))

        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train R²: {train_r2_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test R²: {test_r2_scores[-1]:.4f}')

    return train_losses, test_losses, train_r2_scores, test_r2_scores
# Bootstrap Aggregation (Bagging) Update
def bootstrap_aggregation(ModelClass, train_dataset, test_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
    models = []
    all_train_losses, all_test_losses = [], []
    all_train_r2_scores, all_test_r2_scores = [], []
    subset_size = len(train_dataset) // num_models

    for i in range(num_models):
@@ -225,20 +170,41 @@ def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, ba
        subset_indices = list(range(0, start_idx)) + list(range(end_idx, len(train_dataset)))
        subset = Subset(train_dataset, subset_indices)

        model = ModelClass(vocab_size, EMBEDDING_DIM, params["filter_sizes"], params["num_filters"], embedding_matrix, params["dropout"])
        model.to(device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_losses, test_losses, train_r2_scores, test_r2_scores = train_model(model, subset, test_dataset, criterion, optimizer, epochs, batch_size)

        models.append(model)
        all_train_losses.append(train_losses)
        all_test_losses.append(test_losses)
        all_train_r2_scores.append(train_r2_scores)
        all_test_r2_scores.append(test_r2_scores)

    # Plot for all models
    plt.figure(figsize=(12, 6))
    for i in range(num_models):
        plt.plot(all_train_losses[i], label=f'Model {i + 1} Train Loss')
        plt.plot(all_test_losses[i], label=f'Model {i + 1} Test Loss', linestyle='dashed')
    plt.title("Training and Test Loss for all Models")
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    plt.figure(figsize=(12, 6))
    for i in range(num_models):
        plt.plot(all_train_r2_scores[i], label=f'Model {i + 1} Train R²')
        plt.plot(all_test_r2_scores[i], label=f'Model {i + 1} Test R²', linestyle='dashed')
    plt.title("Training and Test R² for all Models")
    plt.xlabel('Epochs')
    plt.ylabel('R² Score')
    plt.legend()
    plt.show()

    return models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores


# Ensemble Prediction
def ensemble_predict(models, test_dataset):
@@ -254,57 +220,61 @@ def ensemble_predict(models, test_dataset):
    return np.array(all_predictions)


if __name__ == '__main__':
    # Hyperparameters and configurations
    params = {
        # Config
        "max_len": 280,
        # Training
        "epochs": 25,
        "patience": 7,
        "batch_size": 32,
        "learning_rate": 1e-4,  # 1e-4
        "weight_decay": 5e-4,
        # Model
        'nhead': 2,  # 5
        "dropout": 0.2,
        'hiden_dim': 2048,
        'num_layers': 6
    }
    # TODO set seeds

    # Configs
    MODEL_NAME = 'transfomrer.pt'
    HIST_NAME = 'transformer_history'
    GLOVE_PATH = 'data/glove.6B.100d.txt'
    DATA_PATH = 'data/hack.csv'
    EMBEDDING_DIM = 100
    TEST_SIZE = 0.1
    VAL_SIZE = 0.1

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare the data
    embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
        gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
    X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)

    # Split the data
    data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)

    # Dataset and DataLoader
    train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
    val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
    test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])

    # Bootstrap Aggregation (Bagging) Training
    models, all_train_losses, all_test_losses, all_train_r2_scores, all_test_r2_scores = bootstrap_aggregation(
        TransformerBinaryClassifier, train_dataset, test_dataset, num_models=2, epochs=params["epochs"], batch_size=params["batch_size"], learning_rate=params["learning_rate"])

    # Ensemble Prediction
    test_predictions = ensemble_predict(models, test_dataset)

    # Test Evaluation
    # test_labels = np.array([y for _, y in test_dataset])
    test_mse = mean_squared_error(test_dataset.labels.to_numpy(), test_predictions)
    test_mae = mean_absolute_error(test_dataset.labels.to_numpy(), test_predictions)
    test_r2 = r2_score(test_dataset.labels.to_numpy(), test_predictions)
    print(f"Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")