!!!WARNING!!! Nuclear refactoring bomb incoming (Now 90% more confusing but 100% cleaner)

main
Felix Jan Michael Mucha 2025-02-15 17:16:34 +01:00
parent 556ed1c292
commit 2ff92b9e15
38 changed files with 15114 additions and 164515 deletions

137
BERT.py 100644

@ -0,0 +1,137 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import BertForSequenceClassification, AutoTokenizer
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class CustomBert(nn.Module):
def __init__(self,dropout):
super().__init__()
# BERT + custom layers (the BertForSequenceClassification output is a ModelOutput, not a plain tuple)
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(2,1)
# self.sm = nn.Softmax(dim=1)
def forward(self, input_ids, attention_mask):
x = self.bfsc(input_ids, attention_mask = attention_mask)
x = self.dropout(x[0])
x = self.classifier(x)
x = x.squeeze()
return x
def freeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(False)
def unfreeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(True)
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 128,
# Training
"epochs": 10,
"patience": 7,
"batch_size": 32,
"learning_rate": 0.001,
"weight_decay": 5e-4 ,
# Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6
}
# Configs
MODEL_NAME = 'BERT.pt'
HIST_NAME = 'BERT_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
FREEZE_BERT = False
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Initialize BertTokenizer from Pretrained
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
print("Tokenizer Initialized")
# Dataset and DataLoader
train_dataset = Datasets.BertDataset(tokenizer, data_split['train']['X'], data_split['train']['y'], max_len=params["max_len"])
val_dataset = Datasets.BertDataset(tokenizer, data_split['val']['X'], data_split['val']['y'], max_len=params["max_len"])
test_dataset = Datasets.BertDataset(tokenizer, data_split['test']['X'], data_split['test']['y'], max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = CustomBert(dropout=params["dropout"])
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"], bert_freeze=FREEZE_BERT, is_bert=True)
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist, is_bert=True)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,44 +0,0 @@
import torch
import torch.nn as nn
import numpy as np
class BalancedCELoss(nn.Module):
def __init__(self, alpha=0.1):
super(BalancedCELoss, self).__init__()
self.bce_loss = nn.CrossEntropyLoss()
self.alpha = alpha
def forward(self, predictions, targets):
# detect num of unique classes
num_classes = len(torch.unique(targets))
if num_classes == 1:
# If only one class is present, expand predictions into two class columns
predictions = torch.cat((1 - predictions, predictions), dim=1)
# Calculate the standard binary cross-entropy loss
bce_loss = self.bce_loss(predictions, targets)
predictions = torch.argmax(predictions, dim=1)
# Calculate the number of predictions for each class
class_0_preds_n = predictions[predictions == 0]
class_1_preds_n = predictions[predictions == 1]
# Calculate the number of labels for each class based on predictions
class_0_labels_n = targets[targets == 0]
class_1_labels_n = targets[targets == 1]
preds_ratio_0 = len(class_0_preds_n) / len(predictions)
preds_ratio_1 = len(class_1_preds_n) / len(predictions)
labels_ratio_0 = len(class_0_labels_n) / len(targets)
labels_ratio_1 = len(class_1_labels_n) / len(targets)
# Calculate the imbalance penalty
imbalance_penalty = np.abs(preds_ratio_0 - labels_ratio_0) + np.abs(preds_ratio_1 - labels_ratio_1)
# Combine the BCE loss with the imbalance penalty
total_loss = bce_loss + self.alpha * imbalance_penalty
return total_loss
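A toy calculation of the imbalance penalty that the removed BalancedCELoss adds on top of the cross-entropy term (illustrative sketch; the logits and targets are invented):

loss_fn = BalancedCELoss(alpha=0.1)
logits = torch.tensor([[2.0, 0.1], [1.5, 0.2], [1.2, 0.9], [1.1, 1.0]])  # every row argmaxes to class 0
targets = torch.tensor([0, 0, 1, 1])
# prediction ratios 1.0 / 0.0 vs. label ratios 0.5 / 0.5 -> penalty = |1.0 - 0.5| + |0.0 - 0.5| = 1.0
print(loss_fn(logits, targets))  # CrossEntropyLoss(logits, targets) + 0.1 * 1.0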

File diff suppressed because one or more lines are too long

147
CNN.py 100644

@ -0,0 +1,147 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class EnhancedCNNRegressor(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNRegressor, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
# Convolutional layers with batch normalization
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters), # batch normalization
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout) # dropout after each conv block
)
for fs in filter_sizes
])
# Fully-Connected Layer
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # wider dense layer
self.fc2 = nn.Linear(128, 1) # output layer (regression)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling collapses the sequence dimension
x = torch.cat(conv_outputs, 1) # concatenate features from all filter sizes
x = torch.relu(self.fc1(x)) # additional dense layer
x = self.dropout(x)
return self.fc2(x).squeeze(1)
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 280,
# Training
"epochs": 25,
"patience": 7,
"batch_size": 32,
"learning_rate": 0.001,
"weight_decay": 5e-4 ,
# Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6
}
# Configs
MODEL_NAME = 'CNN.pt'
HIST_NAME = 'CNN_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Dataset and DataLoader
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNRegressor(
vocab_size=vocab_size,
embedding_dim=EMBEDDING_DIM,
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# save training history
hist.save_history(HIST_NAME)
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,227 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from dataset_generator import create_embedding_matrix, split_data, load_preprocess_data
from HumorDataset import TextDataset
from BalancedCELoss import BalancedCELoss
import matplotlib.pyplot as plt
import numpy as np
# Hyperparameters and configurations
params = {
"embedding_dim": 100,
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"batch_size": 32,
"learning_rate": 0.001,
"epochs": 25,
"glove_path": 'data/glove.6B.100d.txt',
"max_len": 280,
"test_size": 0.1,
"val_size": 0.1,
"patience": 5,
"data_path": 'data/hack.csv',
"dropout": 0.6,
"weight_decay": 5e-4,
"alpha": 0.1 # Alpha für die Balance in der Loss-Funktion
}
# CNN model for binary classification
class EnhancedCNNBinaryClassifier(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNBinaryClassifier, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters),
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout)
)
for fs in filter_sizes
])
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)
self.fc2 = nn.Linear(128, 2) # 2 classes, hence 2 outputs for CrossEntropyLoss
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1)
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]
x = torch.cat(conv_outputs, 1)
x = torch.relu(self.fc1(x))
x = self.dropout(x)
return self.fc2(x) # 2 outputs; CrossEntropyLoss applies the softmax
# Visualization functions
def visualize_predictions(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
true_values = np.array(true_values)
predicted_values = np.array(predicted_values)
correct_indices = true_values == predicted_values
incorrect_indices = ~correct_indices
# Scatterplot
plt.scatter(
np.arange(len(true_values))[correct_indices],
true_values[correct_indices],
color='green',
label='Correctly predicted'
)
plt.scatter(
np.arange(len(true_values))[incorrect_indices],
true_values[incorrect_indices],
color='red',
label='Incorrectly predicted'
)
plt.axhline(0.5, linestyle='--', color='blue', label='Threshold (0.5)')
plt.ylim(-0.5, 1.5)
plt.yticks([0, 1], labels=['Class 0', 'Class 1'])
plt.xlabel('Data index')
plt.ylabel('Classification')
plt.title('Correct vs. incorrect predictions')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
def visualize_distribution(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Compute class frequencies
true_counts = np.bincount(true_values, minlength=2)
predicted_counts = np.bincount(predicted_values, minlength=2)
# Create bar plot
labels = ['Class 0', 'Class 1']
x = np.arange(len(labels))
plt.bar(x - 0.2, true_counts, width=0.4, color='skyblue', label='True values', edgecolor='black')
plt.bar(x + 0.2, predicted_counts, width=0.4, color='salmon', label='Predicted values', edgecolor='black')
plt.title('Distribution of true values and predictions')
plt.xticks(x, labels)
plt.ylabel('Frequency')
plt.xlabel('Classes')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load data
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
)
X, y = load_preprocess_data(path_data=params["data_path"])
# Split data
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNBinaryClassifier(
vocab_size=vocab_size,
embedding_dim=params["embedding_dim"],
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
model = model.to(device)
# Use BalancedCELoss
criterion = BalancedCELoss(alpha=params["alpha"])
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
# Training
history = {
"train_loss": [],
"val_loss": [],
"train_acc": [],
"val_acc": [],
}
for epoch in range(params["epochs"]):
model.train()
train_loss, correct, total = 0.0, 0, 0
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
for X_batch, y_batch in pbar:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
loss.backward()
optimizer.step()
train_loss += loss.item()
predicted = torch.argmax(outputs, dim=1)
correct += (predicted == y_batch).sum().item()
total += y_batch.size(0)
pbar.set_postfix({"Train Loss": loss.item()})
train_acc = correct / total
history["train_loss"].append(train_loss / len(train_loader))
history["train_acc"].append(train_acc)
# Validation
model.eval()
val_loss, correct, total = 0.0, 0, 0
with torch.no_grad():
for X_batch, y_batch in val_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
val_loss += loss.item()
predicted = torch.argmax(outputs, dim=1)
correct += (predicted == y_batch).sum().item()
total += y_batch.size(0)
val_acc = correct / total
history["val_loss"].append(val_loss / len(val_loader))
history["val_acc"].append(val_acc)
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
print(f"Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")
# Test and visualize
model.eval()
test_correct, test_total = 0, 0
true_labels, predicted_labels = [], []
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
outputs = model(X_batch)
predicted = torch.argmax(outputs, dim=1)
true_labels.extend(y_batch.cpu().numpy())
predicted_labels.extend(predicted.cpu().numpy())
test_correct += (predicted == y_batch).sum().item()
test_total += y_batch.size(0)
test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy:.4f}")
# Visualize predictions (scatter plot)
visualize_predictions(true_labels, predicted_labels)
# Visualize distribution (bar plot)
visualize_distribution(true_labels, predicted_labels)


@ -1,316 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm # progress bar library
from dataset_generator import create_embedding_matrix, split_data
from HumorDataset import TextRegDataset
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
# Hyperparameters and configurations
params = {
"embedding_dim": 100,
"filter_sizes": [2, 3, 4, 5], # additional filter size
"num_filters": 150, # more filters
"batch_size": 32,
"learning_rate": 0.001,
"epochs": 25,
"glove_path": 'data/glove.6B.100d.txt', # path to GloVe
"max_len": 280,
"test_size": 0.1,
"val_size": 0.1,
"patience": 5,
"data_path": 'data/hack.csv', # path to the data
"dropout": 0.6, # increased dropout
"weight_decay": 5e-4 # L2 regularization
}
# EarlyStopping class with directory check
class EarlyStopping:
def __init__(self, patience=5, verbose=False):
self.patience = patience
self.verbose = verbose
self.counter = 0
self.best_score = None
self.early_stop = False
def __call__(self, val_loss, model):
score = -val_loss
if self.best_score is None:
self.best_score = score
self.save_checkpoint(val_loss, model)
elif score < self.best_score:
self.counter += 1
if self.counter >= self.patience:
self.early_stop = True
else:
self.best_score = score
self.save_checkpoint(val_loss, model)
self.counter = 0
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
directory = "checkpoints"
if not os.path.exists(directory):
os.makedirs(directory) # create the directory if it does not exist
if self.verbose:
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), os.path.join(directory, filename))
# Plot function for training
def plot_learning_curves(history):
epochs = range(1, len(history['train_loss']) + 1)
# Loss-Plot
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, history['train_loss'], label='Train Loss')
plt.plot(epochs, history['val_loss'], label='Val Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
# RMSE-Plot
plt.subplot(1, 2, 2)
plt.plot(epochs, history['train_rmse'], label='Train RMSE')
plt.plot(epochs, history['val_rmse'], label='Val RMSE')
plt.xlabel('Epochs')
plt.ylabel('RMSE')
plt.title('Training and Validation RMSE')
plt.legend()
plt.tight_layout()
plt.show()
# Visualize the target variable (scores)
def visualize_data_distribution(y):
print("\n--- Target variable: statistics ---")
print(f"Min: {np.min(y)}, Max: {np.max(y)}")
print(f"Mean: {np.mean(y):.4f}, standard deviation: {np.std(y):.4f}")
# Plot histogram
plt.figure(figsize=(10, 6))
plt.hist(y, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of the target variable (scores)')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Function to load and preprocess the data
def load_preprocess_data(path_data='data/hack.csv'):
# Load data
df = pd.read_csv(path_data)
# Drop rows with missing values in the target column
df = df.dropna(subset=['humor_rating'])
# Extract the target variable from the 'humor_rating' column
df['y'] = df['humor_rating'].astype(float) # ensure the target variable is numeric
# Assign input texts and target variable
X = df['text']
y = df['y']
# Debug output for verification
print(f"First target values: {y.head(10)}")
print(f"Target variable dtype: {y.dtype}")
print(f"Number of examples: {len(X)}")
return X, y
# CNN model for regression with additional regularization
class EnhancedCNNRegressor(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNRegressor, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
# Convolutional layers with batch normalization
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters), # batch normalization
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout) # dropout after each conv block
)
for fs in filter_sizes
])
# Fully-Connected Layer
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # wider dense layer
self.fc2 = nn.Linear(128, 1) # output layer (regression)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling collapses the sequence dimension
x = torch.cat(conv_outputs, 1) # concatenate features from all filter sizes
x = torch.relu(self.fc1(x)) # additional dense layer
x = self.dropout(x)
return self.fc2(x).squeeze(1)
# Set device to CPU (re-assigned below if CUDA is available)
device = torch.device("cpu")
print(f"Using device: {device}")
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
)
X, y = load_preprocess_data(path_data=params["data_path"])
# Visualize the data
visualize_data_distribution(y)
# Split the data
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
# Dataset and DataLoader
train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNRegressor(
vocab_size=vocab_size,
embedding_dim=params["embedding_dim"],
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping(patience=params["patience"], verbose=True)
# Storage for training metrics
history = {
"train_loss": [],
"val_loss": [],
"train_rmse": [],
"val_rmse": [],
}
# Training and validation
for epoch in range(params["epochs"]):
model.train()
train_loss = 0.0
train_preds, train_labels = [], []
# Progress bar for training within an epoch
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
for X_batch, y_batch in pbar:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
optimizer.zero_grad()
predictions = model(X_batch).float()
loss = criterion(predictions, y_batch)
loss.backward()
optimizer.step()
train_loss += loss.item()
# Store true and predicted values for metrics
train_preds.extend(predictions.cpu().detach().numpy())
train_labels.extend(y_batch.cpu().detach().numpy())
# Update the progress display
pbar.set_postfix({"Train Loss": loss.item()})
train_rmse = np.sqrt(mean_squared_error(train_labels, train_preds)) # RMSE
history["train_loss"].append(train_loss / len(train_loader))
history["train_rmse"].append(train_rmse)
# Validation
model.eval()
val_loss = 0.0
val_preds, val_labels = [], []
with torch.no_grad():
for X_batch, y_batch in val_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
loss = criterion(predictions, y_batch)
val_loss += loss.item()
val_preds.extend(predictions.cpu().detach().numpy())
val_labels.extend(y_batch.cpu().detach().numpy())
val_rmse = np.sqrt(mean_squared_error(val_labels, val_preds)) # RMSE
history["val_loss"].append(val_loss / len(val_loader))
history["val_rmse"].append(val_rmse)
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
print(f"Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# Plot the learning curves
plot_learning_curves(history)
# Function to visualize correct and incorrect predictions
def visualize_predictions(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
correct_indices = np.isclose(true_values, predicted_values, atol=0.3) # treated as correct if the difference is <= 0.3
# Plot
plt.scatter(true_values[correct_indices], predicted_values[correct_indices], color='green', label='Correctly predicted')
plt.scatter(true_values[~correct_indices], predicted_values[~correct_indices], color='red', label='Incorrectly predicted')
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal line')
plt.xlabel('True values')
plt.ylabel('Predicted values')
plt.title('Correct vs. incorrect predictions')
plt.legend()
plt.grid(True)
plt.show()
# Test Evaluation
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
test_preds.extend(predictions.cpu().detach().numpy())
test_labels.extend(y_batch.cpu().detach().numpy())
# Convert to NumPy arrays
true_values = np.array(test_labels)
predicted_values = np.array(test_preds)
# Visualize the results
visualize_predictions(true_values, predicted_values)
# RMSE, MAE and R² score for the test set
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_mae = mean_absolute_error(test_labels, test_preds)
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
# plot distribution of predicted values and true values
plt.figure(figsize=(10, 6))
plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
plt.title('Distribution of Predicted and True Values')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

69
Datasets.py 100644

@ -0,0 +1,69 @@
"""
This file contains the dataset classes (GloveDataset and BertDataset).
"""
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset
from transformers import AutoTokenizer
class GloveDataset(Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.float)
return input_ids, label
class BertDataset(Dataset):
def __init__(self,tokenizer:AutoTokenizer, texts, labels, max_len:int=128):
super(BertDataset,self).__init__()
self.tokenizer = tokenizer
self.max_length = max_len
self.text = texts.to_numpy()
self.labels = labels.to_numpy()
def __getitem__(self,idx:int):
text = self.text[idx]
labels = self.labels[idx]
encoding = self.tokenizer(
text,
padding="max_length",
return_attention_mask = True,
max_length=self.max_length,
truncation = True,
return_tensors = 'pt'
)
input_ids = encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
return {
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
'labels':torch.tensor(labels,dtype=torch.float)
}
def __len__(self):
return len(self.labels)
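A small sketch of what the two dataset classes yield per item (illustrative; the texts, labels, and word_index below are invented, and word_tokenize needs the nltk 'punkt' data):

import pandas as pd
texts = pd.Series(["a short joke", "another joke"])
labels = pd.Series([1.2, 3.4])
word_index = {"a": 0, "short": 1, "joke": 2, "another": 3, "<UNK>": 4, "<PAD>": 5}
ids, label = GloveDataset(texts, labels, word_index, max_len=6)[0]
print(ids, label)        # tensor([0, 1, 2, 5, 5, 5]) tensor(1.2000) -- padded index tensor plus float label
bert_item = BertDataset(AutoTokenizer.from_pretrained("bert-base-uncased"), texts, labels, max_len=8)[0]
print(sorted(bert_item)) # ['attention_mask', 'input_ids', 'labels'] -- dict-style items instead of tuples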


@ -1,12 +1,14 @@
import torch
import os
class EarlyStopping:
def __init__(self, patience=5, verbose=False):
class EarlyStoppingCallback:
def __init__(self, model_name, patience=5, verbose=False):
self.patience = patience
self.verbose = verbose
self.counter = 0
self.best_score = None
self.early_stop = False
self.model_name = model_name
def __call__(self, val_loss, model):
score = -val_loss
@ -22,7 +24,10 @@ class EarlyStopping:
self.save_checkpoint(val_loss, model)
self.counter = 0
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
def save_checkpoint(self, val_loss, model):
directory = "models/checkpoints"
if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist
if self.verbose:
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), f'checkpoints/{filename}')
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), os.path.join(directory, self.model_name))
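A usage sketch for the renamed EarlyStoppingCallback (illustrative; 'demo.pt' and the RMSE sequence are invented, and every improvement writes models/checkpoints/demo.pt):

stopper = EarlyStoppingCallback(model_name='demo.pt', patience=2, verbose=True)
model = torch.nn.Linear(4, 1)                    # any module with a state_dict() works here
for val_rmse in [1.00, 0.90, 0.95, 0.93, 0.94]:
    stopper(val_rmse, model)
    if stopper.early_stop:                       # set after two epochs without improvement
        print("Early stopping triggered.")
        break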


@ -1,111 +0,0 @@
"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np
from nltk.tokenize import word_tokenize
class TextRegDataset(torch.utils.data.Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.float)
return input_ids, label
class TextDataset(torch.utils.data.Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.long)
return input_ids, label
class HumorDataset(torch.utils.data.Dataset):
def __init__(self, data, labels, vocab_size=0, emb_dim=None):
self.original_indices = labels.index.to_list()
self.data = data
self.labels = labels.reset_index(drop=True)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
# TODO: bug fix
self.shape = self.get_shape()
def __getitem__(self, idx):
item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
return item
def __len__(self):
return len(self.labels)
def get_single_shape(self, data):
shape_data = None
if type(data) == list:
shape_data = len(data[0])
elif type(data) == torch.Tensor:
shape_data = data[0].shape
elif type(data) == np.ndarray:
shape_data = data[0].shape
return shape_data
def get_shape(self):
shape_data = self.get_single_shape(self.data)
shape_labels = self.get_single_shape(self.labels)
return shape_data, shape_labels


196
Transformer.py 100644

@ -0,0 +1,196 @@
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
class PositionalEncoding(nn.Module):
"""
https://pytorch.org/tutorials/beginner/transformer_tutorial.html
"""
def __init__(self, d_model, vocab_size=5000, dropout=0.1):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(vocab_size, d_model)
position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float()
* (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer("pe", pe)
def forward(self, x):
x = x + self.pe[:, : x.size(1), :]
return self.dropout(x)
class TransformerBinaryClassifier(nn.Module):
"""
Text model based on a PyTorch TransformerEncoder (used here with a single linear output for regression).
"""
def __init__(
self,
embeddings,
nhead=8,
dim_feedforward=2048,
num_layers=6,
positional_dropout=0.1,
classifier_dropout=0.1,
):
super().__init__()
vocab_size, d_model = embeddings.size()
assert d_model % nhead == 0, "nheads must divide evenly into d_model"
self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
self.pos_encoder = PositionalEncoding(
d_model=d_model,
dropout=positional_dropout,
vocab_size=vocab_size,
)
encoder_layer = nn.TransformerEncoderLayer(
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=classifier_dropout,
)
self.transformer_encoder = nn.TransformerEncoder(
encoder_layer,
num_layers=num_layers,
)
# normalize to stabilize and stop overfitting
self.batch_norm = nn.BatchNorm1d(d_model)
self.classifier = nn.Linear(d_model, 1)
self.d_model = d_model
def forward(self, x):
x = self.emb(x) * math.sqrt(self.d_model)
x = self.pos_encoder(x)
x = self.transformer_encoder(x)
x = x.mean(dim=1)
# normalize to stabilize and stop overfitting
#x = self.batch_norm(x)
#NOTE: no activation function for regression
x = self.classifier(x)
x = x.squeeze(1)
return x
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 280,
# Training
"epochs": 25,
"patience": 7,
"batch_size": 32,
"learning_rate": 1e-4, # 1e-4
"weight_decay": 5e-4 ,
# Model
'nhead': 2, # 5
"dropout": 0.2,
'hidden_dim': 2048,
'num_layers': 6
}
# TODO set seeds
# Configs
MODEL_NAME = 'transformer.pt'
HIST_NAME = 'transformer_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Dataset and DataLoader
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = TransformerBinaryClassifier(
embeddings=embedding_matrix,
nhead=params['nhead'],
dim_feedforward=params['hidden_dim'],
num_layers=params['num_layers'],
positional_dropout=params["dropout"],
classifier_dropout=params["dropout"],
)
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# save training history
hist.save_history(HIST_NAME)
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,266 +0,0 @@
# PyTorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# scikit-learn Imports
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
# Bert imports
from transformers import BertForSequenceClassification, AutoTokenizer
#Default imports (pandas, numpy, matplotlib, etc.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Select Device
if torch.cuda.is_available():
DEVICE = torch.device("cuda")
else:
DEVICE = torch.device("cpu")
class SimpleHumorDataset(Dataset):
def __init__(self,tokenizer:AutoTokenizer,dataframe:pd.DataFrame,max_length:int=128):
super(SimpleHumorDataset,self).__init__()
self.tokenizer = tokenizer
self.max_length = max_length
self.text = dataframe['text'].to_numpy()
self.labels = dataframe['is_humor'].to_numpy()
def __getitem__(self,idx:int):
text = self.text[idx]
labels = self.labels[idx]
encoding = self.tokenizer(
text,
padding="max_length",
return_attention_mask = True,
max_length=self.max_length,
truncation = True,
return_tensors = 'pt'
)
input_ids = encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
return {
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
'labels':torch.tensor(labels,dtype=torch.long)
}
def __len__(self):
return len(self.labels)
class CustomBert(nn.Module):
def __init__(self,dropout):
super().__init__()
# BERT + custom layers (the BertForSequenceClassification output is a ModelOutput, not a plain tuple)
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(2,2)
# self.sm = nn.Softmax(dim=1)
def forward(self, input_ids, attention_mask):
x = self.bfsc(input_ids, attention_mask = attention_mask)
x = self.dropout(x[0])
x = self.classifier(x)
return x
def freeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(False)
def unfreeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(True)
def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim.AdamW,train_loader:DataLoader,freeze_bert:bool=False):
model.train()
if freeze_bert:
model.freeze_bert_params()
total_loss = 0
len_train_loader = len(train_loader)
for train_batch in train_loader:
# Set Gradient to Zero
optimizer.zero_grad()
# Unpack batch values and "push" it to GPU
input_ids, att_mask, labels = train_batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE),labels.to(DEVICE)
# Feed Model with Data
outputs = model(input_ids, attention_mask=att_mask)
# print(f"{model.bfsc.}")
# print(f"{outputs.shape}")
loss = criterion(outputs,labels)
loss.backward()
optimizer.step()
total_loss+=loss.item()
print(f"Training Loss is {(total_loss/len(train_loader)):.4f}")
return (total_loss/len(train_loader))
def eval_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,validation_loader:DataLoader):
model.eval()
total, correct = 0.0, 0.0
total_loss = 0.0
best_loss = float("Inf")
with torch.no_grad():
for val_batch in validation_loader:
input_ids, att_mask ,labels = val_batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE), labels.to(DEVICE)
outputs = model(input_ids,attention_mask=att_mask)
loss = criterion(outputs,labels)
total_loss += loss.item()
predictions = torch.argmax(outputs,1)
total += labels.size(0)
correct += (predictions == labels).sum().item()
if total_loss/len(validation_loader) < best_loss:
best_loss = total_loss/len(validation_loader)
torch.save(model,"best_bert_model.pt")
print(f"Validation Loss: {total_loss/len(validation_loader):.4f} ### Validation Accuracy {correct/total*100:.4f}%")
return total_loss/len(validation_loader)
def test_loop(model:CustomBert, test_loader:DataLoader):
for batch in test_loader:
input_ids, att_mask, labels = batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE), att_mask.to(DEVICE), labels.to(DEVICE)
with torch.no_grad():
model = torch.load("best_bert_model")
model.to(DEVICE)
output = model(input_ids,att_mask)
output = output.detach().cpu().numpy()
labels = labels.detach().cpu().numpy()
pred_flat = np.argmax(output, 1).flatten()
print(accuracy_score(labels,pred_flat))
def plot_metrics_loss_n_acc(train_loss,validation_loss,train_acc,validation_acc):
"""
Plots the loss and accuracy of the training and validation data for the given model instance.
"""
# Visualize Training Loss
# plt.plot(loss_values)
# plt.plot(eval_values)
# plt.hlines(np.mean(loss_values),xmin=0,xmax=EPOCH,colors='red',linestyles="dotted",label="Average Loss")
# plt.hlines(np.mean(eval_values),xmin=0,xmax=EPOCH,colors='green',linestyles="dashed",label="Average Val Loss")
# plt.title("Test Loss")
# plt.xlabel("Num Epochs")
# plt.ylabel("Total Loss of Epoch")
# plt.show()
pass
def plot_test_metrics(accuracy):
"""
Plot test metrics of the model (confusion matrix, accuracy).
"""
plt.plot(accuracy)
plt.hlines(np.mean(accuracy), 0, len(accuracy), 'red', 'dotted', f'Mean Accuracy {np.mean(accuracy):.2f}')
plt.title("Accuracy of Test")
plt.xlabel("Num Epochs")
plt.ylabel("Accurcy 0.0 - 1.0")
plt.grid(True)
plt.legend()
plt.show()
# def performance_metrics(true_labels,predictions):
# confusion_matrix(true_labels,predictions)
# accuracy_score(true_labels,predictions)
# f1_score(true_labels,predictions)
# pass
def create_datasets(tokenizer:AutoTokenizer,dataframe:pd.DataFrame,train_split_ratio:float,val:bool=False)->tuple[SimpleHumorDataset,SimpleHumorDataset,SimpleHumorDataset]|tuple[SimpleHumorDataset,SimpleHumorDataset]:
if train_split_ratio > 1.0:
raise AssertionError("Trainsplit sollte kleiner(-gleich) 1.0 sein")
train,test = train_test_split(dataframe,train_size=train_split_ratio,random_state=501)
if val:
test,validation = train_test_split(test,train_size=.5,random_state=501)
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test), SimpleHumorDataset(tokenizer,validation)
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test)
def create_dataloaders(datasets:tuple|list,batchsize:int,shufflelist:list):
train_loader = DataLoader(datasets[0],batchsize,shuffle=shufflelist[0])
test_loader = DataLoader(datasets[1],batchsize,shuffle=shufflelist[1])
if len(datasets) == 3:
return train_loader, test_loader, DataLoader(datasets[2],batchsize,shuffle=shufflelist[2])
return train_loader, test_loader
# if __name__ == "__main__":
# # HYPERPARAMETERS
# # Set Max Epoch Amount
# EPOCH = 10
# # DROPOUT-PROBABILITY
# DROPOUT = 0.1
# # BATCHSIZE
# BATCH_SIZE = 16
# #LEARNING RATE
# LEARNING_RATE = 1e-5
# # RANDOM SEED
# RNDM_SEED = 501
# # FREEZE Bert Layers
# FREEZE = True
# torch.manual_seed(RNDM_SEED)
# np.random.seed(RNDM_SEED)
# torch.cuda.manual_seed_all(RNDM_SEED)
# Initialize Bert Model with dropout probability and port to DEVICE
# mybert = CustomBert(DROPOUT)
# print("Bert Initialized")
# mybert.to(DEVICE)
# Read Raw Data from csv and save as DataFrame
# df = pd.read_csv("./data/hack.csv",encoding="latin1")
# print("Raw Data read")
# Initialize BertTokenizer from Pretrained
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
# print("Tokenizer Initialized")
# Split DataFrame into Train and Test Sets
# Create Custom Datasets for Train and Test
# train_data,test_data,validation_data = create_datasets(tokenizer,df,.7,True)
# print("Splitted Data in Train and Test Sets")
# print("Custom Datasets created")
# Initialize Dataloader with Train and Test Sets
# train_loader, test_loader, validation_loader = create_dataloaders([train_data,test_data,validation_data],batchsize=BATCH_SIZE,shufflelist=[True,True,False])
# print("DataLoaders created")
# Set criterion to Cross Entropy and define Adam Optimizer with model parameters and learning rate
# criterion_cross_entropy = nn.CrossEntropyLoss()
# optimizer_adamW = optim.Adam(mybert.parameters(), lr = LEARNING_RATE)
# import time
# Set Scheduler for dynamically Learning Rate adjustment
# loss_values, eval_values = np.zeros(EPOCH), np.zeros(EPOCH)
# for epoch in range(EPOCH):
# start = time.time()
# print(f"For {epoch+1} the Scores are: ")
# loss_values[epoch] = training_loop(mybert,optimizer=optimizer_adamW,criterion=criterion_cross_entropy,train_loader=train_loader,freeze_bert=FREEZE)
# eval_values[epoch] = eval_loop(mybert,criterion=criterion_cross_entropy,validation_loader=test_loader)
# end = time.time()
# print((end-start),"seconds per epoch needed")
# plot_metrics_loss_n_acc("x","x","x","x")
# for epoch in range(EPOCH):
# test_loop(mybert,validation_loader)
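A small sketch of what freeze_bert_params / unfreeze_bert_params toggle in the removed script's CustomBert (illustrative; it still downloads the bert-base-uncased checkpoint):

m = CustomBert(dropout=0.1)
m.freeze_bert_params()
print([n for n, p in m.named_parameters() if p.requires_grad])  # only the custom head, e.g. ['classifier.weight', 'classifier.bias']
m.unfreeze_bert_params()
print(all(p.requires_grad for p in m.parameters()))             # True -- the full model is trainable again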

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1,207 +0,0 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch
import os
import copy
import regex as re
import HumorDataset
# def load_glove_embeddings(glove_file_path):
# embeddings_index = {}
# with open(glove_file_path, 'r', encoding='utf-8') as f:
# for line in f:
# try:
# values = line.split()
# #print(values)
# word = values[0]
# coefs = np.asarray(values[1:], dtype='float32')
# embeddings_index[word] = coefs
# except ValueError:
# print('Error with line:', line[:100])
# return embeddings_index
def load_glove_embeddings(glove_file_path, emb_len=100):
embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
# Use regex to split the line into word and coefficients
match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
# regex explanation: Match word followed by one or more spaces and then the coefficients
if match:
word = match.group(1)
coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')
#check list length
if len(coefs) != emb_len:
print('Skip: Length mismatch with line:', line[:100])
else:
embeddings_index[word] = coefs
else:
print('Error with line:', line[:100])
except ValueError:
print('Error with line:', line[:100])
return embeddings_index
def create_embbedings_matrix(embeddings_glove, max_len=100):
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)
# Create a word index (vocabulary)
word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
# Special tokens are in the word index
word_index['<UNK>'] = len(word_index) - 2
word_index['<PAD>'] = len(word_index) - 1
# print len of word_index
print(len(word_index))
# Create an embedding matrix
embedding_dim = len(next(iter(embeddings_glove.values())))
embedding_matrix = np.zeros((len(word_index), embedding_dim))
for word, idx in word_index.items():
embedding_vector = embeddings_glove.get(word)
if embedding_vector is not None:
embedding_matrix[idx] = embedding_vector
# Convert the embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
return embedding_matrix, word_index
def create_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)
embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove)
vocab_size = len(embedding_matrix)
d_model = len(embedding_matrix[0])
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
return embedding_matrix, word_index, vocab_size, d_model
def load_preprocess_data(path_data='data/hack.csv'):
df = pd.read_csv(path_data)
df = df.dropna(subset=['humor_rating'])
# find median of humor_rating
median_rating = df['humor_rating'].median()
df['y'] = df['humor_rating'] > median_rating
X = df['text']
y = df['y']
return X, y
def encode_tokens(tokens, embedding_index, default_vector_len=100):
return [embedding_index.get(token, np.zeros(default_vector_len)) for token in tokens]
def pad_sequences(sequences, max_len, pad_index):
return np.array([np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=pad_index) if len(seq) < max_len else seq[:max_len] for seq in sequences])
def split_data(X, y, test_size=0.1, val_size=0.1):
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
val_split_ratio = val_size / (test_size + val_size)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)
ret_dict = {
'train': {'X': X_train, 'y': y_train},
'test': {'X': X_test, 'y': y_test},
'val': {'X': X_val, 'y': y_val}
}
# for each print len
for key in ret_dict.keys():
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict
def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
if not os.path.exists(path):
print('Creating directory:', path)
os.makedirs(path)
print('saving data into:', path)
for key, value in data_dict.items():
# transform to Dataset
dataset = HumorDataset.HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
# save dataset
torch.save(dataset, path + prefix + key + '.pt')
if __name__ == "__main__":
# Load the data from csv
df = pd.read_csv('data/hack.csv')
print(df.shape)
df = df.dropna(subset=['humor_rating'])
# find median of humor_rating
median_rating = df['humor_rating'].median()
#print('median and therefore middle of humor_rating:', median_rating)
df['y'] = df['humor_rating'] > median_rating
# transform data into dataset
X = df['text']
y = df['y']
# Tokenize the data with nltk
tokens = [word_tokenize(text.lower()) for text in X]
vocab_size = len(set([word for sentence in tokens for word in sentence]))
print('vocab size:', vocab_size)
# Pad the sequences
# NOTE: Info comes from data explore notebook: 280 is max length,
# 139 contains 80% and 192 contains 95% of the data
max_len = 280
padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')
# split data into train, test, and validation
data_dict = split_data(padded_indices, y)
# data_idx_based = copy.deepcopy(data_dict)
# vector_based = False
# for key in data_idx_based.keys():
# data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
# # print shape of data
# #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
# # save the data
# save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
print('loading GloVe embeddings')
# Load GloVe embeddings
glove_file_path = 'glove.6B/glove.6B.100d.txt'
#glove_file_path = 'glove.840B.300d/glove.840B.300d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)
emb_len = 100
print('starting with embedding the data')
# Encode the tokens
#for key in data_dict.keys():
#data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=emb_len) for tokens in data_dict[key]['X']]
# print shape of data
#print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
# Save the data
#save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
max_len = 100
gloVe_path = 'glove.6B/glove.6B.100d.txt'
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=max_len)
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)

102
dataset_helper.py 100644

@ -0,0 +1,102 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import regex as re
def load_glove_embeddings(glove_file_path, emb_len=100):
embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
# Use regex to split the line into word and coefficients
match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
# regex explanation: Match word followed by one or more spaces and then the coefficients
if match:
word = match.group(1)
coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')
#check list length
if len(coefs) != emb_len:
print('Skip: Length mismatch with line:', line[:100])
else:
embeddings_index[word] = coefs
else:
print('Error with line:', line[:100])
except ValueError:
print('Error with line:', line[:100])
return embeddings_index
def create_embbedings_matrix(embeddings_glove, max_len=100):
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)
# Create a word index (vocabulary)
word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
# Special tokens are in the word index
word_index['<UNK>'] = len(word_index) - 2
word_index['<PAD>'] = len(word_index) - 1
# print len of word_index
print(len(word_index))
# Create an embedding matrix
embedding_dim = len(next(iter(embeddings_glove.values())))
embedding_matrix = np.zeros((len(word_index), embedding_dim))
for word, idx in word_index.items():
embedding_vector = embeddings_glove.get(word)
if embedding_vector is not None:
embedding_matrix[idx] = embedding_vector
# Convert the embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
return embedding_matrix, word_index
def get_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)
embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove, max_len=emb_len)
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
return embedding_matrix, word_index, vocab_size, d_model
def load_preprocess_data(path_data='data/hack.csv', verbose=False):
# load the data
df = pd.read_csv(path_data)
# drop rows with missing values in the target column
df = df.dropna(subset=['humor_rating'])
# use the 'humor_rating' column as the regression target
df['y'] = df['humor_rating'].astype(float) # make sure the target is numeric
# assign input texts and target variable
X = df['text']
y = df['y']
if verbose:
print(f"First target values: {y.head(10)}")
print(f"Target dtype: {y.dtype}")
print(f"Number of examples: {len(X)}")
return X, y
def split_data(X, y, test_size=0.1, val_size=0.1):
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
val_split_ratio = val_size / (test_size + val_size)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)
ret_dict = {
'train': {'X': X_train, 'y': y_train},
'test': {'X': X_test, 'y': y_test},
'val': {'X': X_val, 'y': y_val}
}
# print the number of samples in each split
for key in ret_dict.keys():
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict
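To make the helpers above concrete, here is a small usage sketch (it assumes the GloVe file exists at the default path; the sentence and variable names are illustrative):

import torch

# embedding_matrix: (vocab_size, emb_len) float tensor, word_index: token -> row index
embedding_matrix, word_index, vocab_size, d_model = get_embedding_matrix(
    gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100)

tokens = ['the', 'cat', 'sat', 'somenonsenseword', '<PAD>']
# unknown tokens fall back to the <UNK> row
indices = [word_index.get(tok, word_index['<UNK>']) for tok in tokens]
vectors = embedding_matrix[torch.tensor(indices)]  # shape: (5, 100)
print(indices, vectors.shape)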

File diff suppressed because it is too large

View File

@ -1,129 +0,0 @@
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd
import matplotlib.patches as mpatches
def get_accuracy(outputs, labels):
correct = np.array([p == l for p, l in zip(outputs, labels)])
accuracy = correct.sum() / len(labels)
return accuracy
def get_f1_score(outputs, labels):
outputs = torch.tensor(outputs)
labels = torch.tensor(labels)
f1 = f1_score(labels, outputs)
return f1
def plot_confusion_matrix(outputs, labels, class_names=['No Humor', 'Humor'], title='Confusion Matrix'):
conf_matrix = confusion_matrix(labels, outputs)
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title(title)
return plt
def get_label_distribution(labels, preds):
# Calculate wrong predictions
wrong_preds = np.array(labels) != np.array(preds)
# Calculate the number of wrong predictions for each class
class_0_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 0)
class_1_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 1)
# Calculate the total number of wrong predictions
total_wrong_preds = np.sum(wrong_preds)
# Calculate and print the ratio of wrong predictions for each class
class_0_ratio = class_0_wrong_preds / total_wrong_preds
class_1_ratio = class_1_wrong_preds / total_wrong_preds
print(f"Class 0: {class_0_ratio:.2f}")
print(f"Class 1: {class_1_ratio:.2f}")
def plot_training_history(history, title='Training History'):
hist_data = history.get_history()
epochs = range(1, len(hist_data['train_loss']) + 1)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Plot accuracy
axs[1].plot(epochs, hist_data['train_acc'], label='Train Accuracy')
axs[1].plot(epochs, hist_data['val_acc'], label='Validation Accuracy')
axs[1].set_title('Accuracy')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Accuracy')
axs[1].legend()
# Plot loss
axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
axs[0].set_title('Loss')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].legend()
plt.tight_layout()
plt.suptitle(title)
return plt
def load_data(filepath):
"""
Load the data from a CSV file.
"""
df = pd.read_csv(filepath)
#print(df.shape)
return df
def process_data(df, test_dataset, all_preds, all_labels):
"""
Process the data to prepare it for plotting.
"""
df_test = df.iloc[test_dataset.original_indices].copy()
df_test['prediction'] = all_preds
df_test['label'] = all_labels
df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])
df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)
return df_test_sorted
def plot_rating_df_based(df_test_sorted, title='Humor Rating vs Prediction for Test Set'):
"""
Plot the results of the predictions.
"""
median_rating = df_test_sorted['humor_rating'].median()
median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]
#print(median_idx)
range_idx = range(len(df_test_sorted))
colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})
plt.figure(figsize=(12, 6))
plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)
plt.axvline(x=median_idx, color='black', linestyle='--')
green_patch = mpatches.Patch(color='g', label='Correct Prediction')
red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')
line_patch = mpatches.Patch(color='black', label='humor_rating cut off')
plt.title(title)
plt.xlabel('Index')
plt.ylabel('Humor Rating')
plt.legend(handles=[green_patch, red_patch, line_patch])
return plt
def plot_rating_preds(all_preds, all_labels,
test_dataset,
title='Humor Rating vs Prediction for Test Set',
data_path = 'data/hack.csv'):
data = load_data(data_path)
df_test_sorted = process_data(data, test_dataset, all_preds, all_labels)
plt = plot_rating_df_based(df_test_sorted, title=title)
return plt
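For reference, the removed evaluation helpers operate on flat lists of binary predictions and labels; a minimal usage sketch with synthetic values (illustrative only):

preds = [1, 0, 1, 1, 0, 1]
labels = [1, 0, 0, 1, 0, 1]

acc = get_accuracy(preds, labels)   # 5 of 6 correct -> ~0.833
f1 = get_f1_score(preds, labels)
print(f"accuracy: {acc:.3f}, f1: {f1:.3f}")

cm_plot = plot_confusion_matrix(preds, labels, title='Demo Confusion Matrix')
# cm_plot.show()  # or cm_plot.savefig('confusion_matrix.png') in a script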

View File

@ -5,40 +5,41 @@ import time
import json
import os
def get_device(verbose=False):
def get_device(verbose=False, include_mps=False):
"""
Get the current device (MPS, CPU or GPU) for PyTorch.
"""
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if verbose:
print('Using device:', device)
if include_mps:
device = torch.device("mps" if torch.backends.mps.is_available() else device)
return device
def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None,**kwargs):
def save_model_and_hyperparams(model, model_prefix_name, rmse, hyperparameters, timestamp=None):
"""
Save the model and hyperparameters to disk.
**kwargs: hyperparameters to save
hyperparameters: dictionary containing hyperparameters to save
"""
# Create a timestamp
if timestamp is None:
timestamp = time.strftime("%Y%m%d-%H%M%S")
accuracy = round(accuracy, 4)
rmse = round(rmse, 4)
# Save the model state dictionary
model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
model_path = f'models/{model_prefix_name}_acc_{rmse}_{timestamp}.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}.")
# Save the hyperparameters as a JSON file
hyperparameters = kwargs
hyperparameters['accuracy'] = accuracy
hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
hyperparameters['rmse'] = rmse
hyperparameters_path = f'models/{model_prefix_name}_para_acc_{rmse}_{timestamp}.json'
with open(hyperparameters_path, 'w') as f:
json.dump(hyperparameters, f)
print(f"Hyperparameters saved to {hyperparameters_path}.")
def get_newest_model_path(path, name=None, extension=".pth"):
def get_newest_file(path, name=None, extension=".pth"):
"""
Get the newest file in a directory.
"""

View File

@ -1,70 +1,115 @@
import numpy as np
import torch
from sklearn.metrics import mean_squared_error
from datetime import datetime
import json
import os
class History:
"""
Class to store the history of the training process.
Used to store the loss and accuracy of the training and validation sets.
Used to store the loss and rmse of the training and validation sets.
"""
def __init__(self):
self.history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
'val_labels': [],
# val_preds contains structs {epoch: [preds], ...}
'val_preds': [],
# only needed in the end not in training
'test_labels': [],
'test_preds': [],
}
self.batch_history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
}
def update(self):
self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
self.history['train_acc'].append(np.mean(self.batch_history['train_acc']))
self.history['val_acc'].append(np.mean(self.batch_history['val_acc']))
if self.batch_history['train_loss']:
self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
if self.batch_history['val_loss']:
self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
if self.batch_history['train_rmse']:
self.history['train_rmse'].append(np.mean(self.batch_history['train_rmse']))
if self.batch_history['val_rmse']:
self.history['val_rmse'].append(np.mean(self.batch_history['val_rmse']))
def get_history(self):
return self.history
def calculate_accuracy(self, outputs, labels):
preds = torch.argmax(outputs, dim=1)
correct = (preds == labels).sum().item()
accuracy = correct / len(labels)
return accuracy
def calculate_rmse(self, outputs, labels):
return np.sqrt(mean_squared_error(labels, outputs))
def batch_reset(self):
self.batch_history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
}
def batch_update(self, train_loss, val_loss, train_acc, val_acc):
def batch_update(self, train_loss, val_loss, train_rmse, val_rmse):
self.batch_history['train_loss'].append(train_loss)
self.batch_history['val_loss'].append(val_loss)
self.batch_history['train_acc'].append(train_acc)
self.batch_history['val_acc'].append(val_acc)
self.batch_history['train_rmse'].append(train_rmse)
self.batch_history['val_rmse'].append(val_rmse)
def batch_update_train(self, train_loss, preds, labels):
train_acc = self.calculate_accuracy(preds, labels)
train_rmse = self.calculate_rmse(preds, labels)
self.batch_history['train_loss'].append(train_loss)
self.batch_history['train_acc'].append(train_acc)
self.batch_history['train_rmse'].append(train_rmse)
def batch_update_val(self, val_loss, preds, labels):
val_acc = self.calculate_accuracy(preds, labels)
def batch_update_val(self, val_loss, preds, labels, epoch):
val_rmse = self.calculate_rmse(preds, labels)
self.batch_history['val_loss'].append(val_loss)
self.batch_history['val_acc'].append(val_acc)
self.batch_history['val_rmse'].append(val_rmse)
self.history['val_labels'] = labels.tolist()
self.history['val_preds'].append({epoch: preds.tolist()})
def get_batch_history(self):
return self.batch_history
def print_history(self, epoch, max_epochs, time_elapsed, verbose=True):
if verbose:
print(f'Epoch {epoch:>3}/{max_epochs} - {time_elapsed:.2f}s - loss: {self.history["train_loss"][-1]:.4f} - accuracy: {self.history["train_acc"][-1]:.4f} - val_loss: {self.history["val_loss"][-1]:.4f} - val_accuracy: {self.history["val_acc"][-1]:.4f}')
def add_test_results(self, test_labels, test_preds):
self.history['test_labels'] = test_labels
self.history['test_preds'] = test_preds
def convert_hist(self):
# Needed for saving the history to a json file:
# convert numpy arrays to lists and use float instead of numpy float
history_to_save = {}
for hist_key, hist_val in self.history.items():
if hist_key == 'val_preds':
history_to_save[hist_key] = [{k: [float(x) for x in v] for k, v in val.items()} for val in hist_val]
else:
history_to_save[hist_key] = [float(x) for x in hist_val]
return history_to_save
def save_history(self, hist_name):
directory = "histories"
if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json")
# Needed for saving the history to a json file:
# convert numpy arrays to lists and use float instead of numpy float
history_to_save = self.convert_hist()
with open(filepath, 'w') as f:
json.dump(history_to_save, f, indent=4)
print(f"History saved to {filepath}")

75
ml_plots.py 100644
View File

@ -0,0 +1,75 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
def save_plot(plt, plot_name):
if not os.path.exists('plots'):
os.makedirs('plots')
# create timestamp
time_stamp = time.strftime('%Y%m%d-%H%M%S')
plt.savefig(f'plots/{plot_name}_{time_stamp}.png')
def plot_training_history(hist_data, title='Training History', save=True):
epochs = range(1, len(hist_data['train_loss']) + 1)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Plot RMSE
axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE')
axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE')
axs[1].set_title('RMSE')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('RMSE')
axs[1].legend()
# Plot loss
axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
axs[0].set_title('Loss')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].legend()
plt.tight_layout()
plt.suptitle(title)
# save plot
if save:
save_plot(plt, title)
return plt
def plot_distribution(true_values, predicted_values, title='Distribution of Predicted and True Values', save=True):
plt.figure(figsize=(10, 6))
plt.hist(true_values, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
plt.hist(predicted_values, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
plt.title(title)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
# save plot
if save:
save_plot(plt, title)
return plt
def plot_predictions(true_values, predicted_values, title='True vs Predicted Values', threshold=0.3, save=True):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
correct_indices = np.isclose(true_values, predicted_values, atol=threshold)
incorrect_indices = ~correct_indices
# Plot
plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices], color='green', label='Correctly predicted')
plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices], color='red', label='Incorrectly predicted')
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal Line')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title(title)
plt.legend()
plt.grid(True)
# save plot
if save:
save_plot(plt, title)
return plt
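A quick sketch of how the plotting helpers above can be called on a history dict and raw predictions (synthetic values, illustrative only; save=False avoids writing files to plots/):

import numpy as np
import matplotlib.pyplot as plt

hist_data = {
    'train_loss': [1.2, 0.9, 0.7], 'val_loss': [1.3, 1.0, 0.9],
    'train_rmse': [1.1, 0.95, 0.83], 'val_rmse': [1.15, 1.0, 0.94],
}
plot_training_history(hist_data, title='Demo Training History', save=False)

true_vals = np.random.uniform(0, 5, size=50)
pred_vals = true_vals + np.random.normal(0, 0.4, size=50)
plot_distribution(true_vals, pred_vals, save=False)
plot_predictions(true_vals, pred_vals, threshold=0.3, save=False)
plt.show()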

87
ml_train.py 100644
View File

@ -0,0 +1,87 @@
from tqdm import tqdm
import torch
import numpy as np
def train_epoch(model, train_loader, criterion, optimizer, device, history, epoch, total_epochs, bert_freeze=False, is_bert=False):
model.train()
if bert_freeze and hasattr(model, 'freeze_bert_params'):
model.freeze_bert_params()
with tqdm(train_loader, desc=f"├ Epoch {epoch + 1}/{total_epochs}") as pbar:
for batch in pbar:
optimizer.zero_grad()
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
labels = y_batch
loss = criterion(predictions, labels)
loss.backward()
optimizer.step()
preds = predictions.detach().cpu().numpy()
labels = labels.detach().cpu().numpy()
history.batch_update_train(loss.item(), preds, labels)
# Update progress bar
pbar.set_postfix({"Train Loss": loss.item()})
history.update()
history.batch_reset()
def validate_epoch(model, val_loader, epoch, criterion, device, history, is_bert=False):
model.eval()
val_loss = 0.0
val_preds, val_labels = [], []
with torch.no_grad():
for batch in val_loader:
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
labels = y_batch
predictions = model(X_batch).float()
loss = criterion(predictions, labels)
val_loss += loss.item()
val_preds.extend(predictions.cpu().detach().numpy())
val_labels.extend(labels.cpu().detach().numpy())
val_rmse = history.calculate_rmse(np.array(val_preds), np.array(val_labels))
history.batch_update_val(val_loss / len(val_loader), np.array(val_preds), np.array(val_labels), epoch)
history.update()
history.batch_reset()
return val_rmse
def test_loop(model, test_loader, device, is_bert=False):
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
for batch in test_loader:
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
labels = y_batch
predictions = model(X_batch).float()
test_preds.extend(predictions.cpu().detach().numpy())
test_labels.extend(labels.cpu().detach().numpy())
return test_labels, test_preds
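A compact sketch of how these loops fit together for the non-BERT case, using a tiny synthetic regression problem (illustrative only; it assumes the History class from ml_history.py in this commit, and the real scripts additionally wire in early stopping and the dataset classes):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import ml_history

# synthetic regression data: 64 samples, 10 features
X = torch.randn(64, 10)
y = torch.randn(64)
loader = DataLoader(TensorDataset(X, y), batch_size=16, shuffle=True)

# tiny regression model whose output is flattened to shape (batch,)
model = nn.Sequential(nn.Linear(10, 8), nn.ReLU(), nn.Linear(8, 1), nn.Flatten(0))
device = torch.device('cpu')
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
history = ml_history.History()

for epoch in range(3):
    train_epoch(model, loader, criterion, optimizer, device, history, epoch, total_epochs=3)
    val_rmse = validate_epoch(model, loader, epoch, criterion, device, history)
    print(f"epoch {epoch}: val RMSE {val_rmse:.4f}")

test_labels, test_preds = test_loop(model, loader, device)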

View File

@ -0,0 +1,24 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"# TODO: compare"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,187 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the data\n",
"with open('data/pun_anno/pun_het.json') as f:\n",
" data_het = json.load(f)\n",
"\n",
"with open('data/pun_anno/pun_hom.json') as f:\n",
" data_hom = json.load(f)\n",
"\n",
"with open('data/pun_annotated.json') as f:\n",
" data_anno = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Create a DataFrame\n",
"df_anno = pd.DataFrame(data_anno)\n",
"\n",
"df_het = pd.DataFrame(data_het)\n",
"# df switch columns to rows\n",
"df_het = df_het.T\n",
"\n",
"df_hom = pd.DataFrame(data_hom)\n",
"# df switch columns to rows\n",
"df_hom = df_hom.T"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"1 het_837\n",
"2 het_635\n",
"3 hom_657\n",
"4 het_1275\n",
" ... \n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1897 het_100\n",
"1898 hom_364\n",
"Name: ID, Length: 1899, dtype: object\n",
"Index(['het_991', 'het_990', 'het_987', 'het_982', 'het_980', 'het_978',\n",
" 'het_973', 'het_958', 'het_956', 'het_955',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=1146)\n",
"Index(['hom_998', 'hom_996', 'hom_994', 'hom_993', 'hom_992', 'hom_990',\n",
" 'hom_99', 'hom_985', 'hom_984', 'hom_981',\n",
" ...\n",
" 'hom_2221', 'hom_2223', 'hom_2225', 'hom_2226', 'hom_2230', 'hom_2232',\n",
" 'hom_2234', 'hom_2243', 'hom_2246', 'hom_2247'],\n",
" dtype='object', length=1443)\n"
]
}
],
"source": [
"# print index for each df\n",
"print(df_anno['ID'])\n",
"print(df_het.index)\n",
"print(df_hom.index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(655, 8) (1146, 11) (1899, 8)\n",
"(825, 8) (1443, 11) (1899, 8)\n"
]
}
],
"source": [
"# find matches from df_anno['ID'] to df_het.index\n",
"df_het_match = df_anno[df_anno['ID'].isin(df_het.index)]\n",
"print(df_het_match.shape, df_het.shape, df_anno.shape)\n",
"\n",
"# find matches from df_anno['ID'] to df_hom.index\n",
"df_hom_match = df_anno[df_anno['ID'].isin(df_hom.index)]\n",
"print(df_hom_match.shape, df_hom.shape, df_anno.shape)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"3 hom_657\n",
"6 hom_1510\n",
"7 hom_955\n",
"8 hom_1505\n",
" ... \n",
"1893 hom_151\n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1898 hom_364\n",
"Name: ID, Length: 1244, dtype: object\n",
"Index(['het_955', 'het_907', 'het_905', 'het_786', 'het_783', 'het_777',\n",
" 'het_639', 'het_573', 'het_466', 'het_435',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=491)\n"
]
}
],
"source": [
"# print not matched IDs and index\n",
"print(df_anno[~df_anno['ID'].isin(df_het.index)]['ID'])\n",
"print(df_het.index[~df_het.index.isin(df_anno['ID'])])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# merge df_anno and df_het where ID matches with index\n",
"df_het_merge = pd.merge(df_anno, df_het, left_on='ID', right_index=True)\n",
"# score_avg \n",
"df_het_merge['score_avg'] = df_het_merge['Funniness (1-5)'].apply(lambda x: np.mean(x))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -1,584 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KuFFT6LrB6Fe"
},
"outputs": [],
"source": [
"import time\n",
"import json\n",
"import math\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import DataLoader\n",
"from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
"\n",
"from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n",
"from sklearn.model_selection import KFold\n",
"# local imports\n",
"import ml_evaluation as ml_eval\n",
"import ml_helper\n",
"import ml_history\n",
"import dataset_generator as data_gen\n",
"# class imports\n",
"import HumorDataset as humor_ds\n",
"import EarlyStopping\n",
"import BalancedCELoss\n",
"\n",
"\n",
"# architecture inspired:\n",
"# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n",
"\n",
"# TODO: maybe KFold for cross validation?\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
}
],
"source": [
"torch.manual_seed(0)\n",
"np.random.seed(0)\n",
"\n",
"\n",
"best_model_filename = 'best_transformer_reg_model.pt'\n",
"\n",
"device = ml_helper.get_device(verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"400002\n",
"vocab_size: 400002, d_model: 100\n",
"vocab_size: 400002, d_model: 100\n"
]
}
],
"source": [
"embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n",
"\n",
"vocab_size = len(embedding_matrix)\n",
"d_model = len(embedding_matrix[0])\n",
"vocab_size, d_model = embedding_matrix.size()\n",
"print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class PositionalEncoding(nn.Module):\n",
" \"\"\"\n",
" https://pytorch.org/tutorials/beginner/transformer_tutorial.html\n",
" \"\"\"\n",
"\n",
" def __init__(self, d_model, vocab_size=5000, dropout=0.1):\n",
" super().__init__()\n",
" self.dropout = nn.Dropout(p=dropout)\n",
"\n",
" pe = torch.zeros(vocab_size, d_model)\n",
" position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)\n",
" div_term = torch.exp(\n",
" torch.arange(0, d_model, 2).float()\n",
" * (-math.log(10000.0) / d_model)\n",
" )\n",
" pe[:, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 1::2] = torch.cos(position * div_term)\n",
" pe = pe.unsqueeze(0)\n",
" self.register_buffer(\"pe\", pe)\n",
"\n",
" def forward(self, x):\n",
" x = x + self.pe[:, : x.size(1), :]\n",
" return self.dropout(x)\n",
"\n",
"\n",
"class TransformerBinaryClassifier(nn.Module):\n",
" \"\"\"\n",
" Text classifier based on a pytorch TransformerEncoder.\n",
" \"\"\"\n",
"\n",
" def __init__(\n",
" self,\n",
" embeddings,\n",
" nhead=8,\n",
" dim_feedforward=2048,\n",
" num_layers=6,\n",
" positional_dropout=0.1,\n",
" classifier_dropout=0.1,\n",
" activation=\"relu\",\n",
" ):\n",
"\n",
" super().__init__()\n",
"\n",
" vocab_size, d_model = embeddings.size()\n",
" assert d_model % nhead == 0, \"nheads must divide evenly into d_model\"\n",
"\n",
" self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)\n",
"\n",
" self.pos_encoder = PositionalEncoding(\n",
" d_model=d_model,\n",
" dropout=positional_dropout,\n",
" vocab_size=vocab_size,\n",
" )\n",
"\n",
" encoder_layer = nn.TransformerEncoderLayer(\n",
" d_model=d_model,\n",
" nhead=nhead,\n",
" dim_feedforward=dim_feedforward,\n",
" dropout=classifier_dropout,\n",
" )\n",
" self.transformer_encoder = nn.TransformerEncoder(\n",
" encoder_layer,\n",
" num_layers=num_layers,\n",
" )\n",
" # normalize to stabilize and stop overfitting\n",
" self.batch_norm = nn.BatchNorm1d(d_model)\n",
" self.classifier = nn.Linear(d_model, 1)\n",
" self.d_model = d_model\n",
" #self.softmax = nn.Softmax(dim=1)\n",
" #self.sigmoid = nn.Sigmoid()\n",
"\n",
" def forward(self, x):\n",
" x = self.emb(x) * math.sqrt(self.d_model)\n",
" x = self.pos_encoder(x)\n",
" x = self.transformer_encoder(x)\n",
" x = x.mean(dim=1)\n",
" # normalize to stabilize and stop overfitting\n",
" #x = self.batch_norm(x)\n",
"\n",
" #NOTE: no activation function for regression\n",
" # sigmoid would only distort the output\n",
" x = self.classifier(x)\n",
" \n",
" return x\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def load_preprocess_data(path_data='data/hack.csv'):\n",
" df = pd.read_csv(path_data)\n",
" df = df.dropna(subset=['humor_rating'])\n",
"\n",
" df['y'] = df['humor_rating']\n",
" X = df['text']\n",
" y = df['y']\n",
" return X, y"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train 3945 3945\n",
"test 494 494\n",
"val 493 493\n"
]
}
],
"source": [
"X,y = load_preprocess_data()\n",
"\n",
"ret_dict = data_gen.split_data(X, y)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set hyper params"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model created\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
" warnings.warn(\n"
]
}
],
"source": [
"params = {\n",
" # used for class balancing\n",
" 'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)\n",
" # training parameters\n",
" 'batch_size': 32, # 32 (16 to 64)\n",
" 'epochs': 100, # 100\n",
" 'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)\n",
" \n",
" # NOTE: used for gradient clipping (needed for lstm and transformer)\n",
" # use 0 to disable\n",
" 'clipping_max_norm': 0, # 0 (0.5 to 2.0)\n",
" \n",
" # patience for early stopping\n",
" 'early_stopping_patience': 5, # 5 (3 to 10)\n",
"\n",
" # learning rate scheduler\n",
" 'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n",
" 'lr_scheduler_patience': 3, # 3 (2 to 5)\n",
"\n",
" # model parameters\n",
" 'nhead': 2, # 5\n",
" 'num_layers': 3, # 6\n",
" 'hidden_dim': 10, # 50\n",
"\n",
" # regularization parameters\n",
" 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n",
"}\n",
"\n",
"# Model initialization\n",
"model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n",
" nhead=params['nhead'], \n",
" num_layers=params['num_layers'], \n",
" dim_feedforward=params['hidden_dim'],\n",
" positional_dropout=params['positional_dropout'],\n",
" classifier_dropout=params['classifier_dropout']\n",
" )\n",
"model.to(device)\n",
"print('model created')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### create datasets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"datasets length: 3945 493\n",
"train: 124, val: 16, test: 16\n"
]
}
],
"source": [
"# NOTE: Info comes from data explore notebook: 280 is max length,\n",
"# 139 contains 80% and 192 contains 95% of the data\n",
"max_len = 280\n",
"\n",
"train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n",
"val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n",
"test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n",
"\n",
"print('datasets length:', len(train_dataset), len(val_dataset))\n",
"#NOTE: overfitting test\n",
"#train_dataset.labels = train_dataset.labels[:100]\n",
"#train_dataset.texts = train_dataset.texts[:100]\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n",
"val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"\n",
"# NOTE: samller because of batches not all data\n",
"print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set training requirements"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#TODO: change to RMSE\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"loss = torch.sqrt(criterion(x, y))\n",
"loss.backward()\n",
"print(x.grad)\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"\n",
"optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n",
" lr=params['lr']) #, \n",
" #weight_decay=params['weight_decay'])\n",
"\"\"\"\n",
"scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n",
" factor=params['lr_scheduler_factor'],\n",
" patience=params['lr_scheduler_patience'],\n",
" verbose=True)\n",
"\"\"\"\n",
"early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training loop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s\n",
"Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s\n",
"Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s\n",
"Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 16/100, Train Loss: 1.8097, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 17/100, Train Loss: 1.8081, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 18/100, Train Loss: 1.8078, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 19/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 20/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 21/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 22/100, Train Loss: 1.8103, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 23/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 24/100, Train Loss: 1.8034, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 25/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.46s\n",
"Epoch 26/100, Train Loss: 1.8084, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 27/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 28/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 29/100, Train Loss: 1.8136, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 30/100, Train Loss: 1.8051, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 31/100, Train Loss: 1.8026, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 32/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 33/100, Train Loss: 1.8121, Val Loss: 1.8873, Time: 2.13s\n",
"Epoch 34/100, Train Loss: 1.8098, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 35/100, Train Loss: 1.8036, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 36/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 37/100, Train Loss: 1.8108, Val Loss: 1.8873, Time: 2.50s\n",
"Epoch 38/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.45s\n",
"Epoch 39/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 40/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 41/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 42/100, Train Loss: 1.8088, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 43/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 44/100, Train Loss: 1.8029, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 45/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 46/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 47/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 48/100, Train Loss: 1.8069, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 49/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 50/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 51/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 52/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 53/100, Train Loss: 1.8075, Val Loss: 1.8873, Time: 2.00s\n",
"Epoch 54/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 55/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.02s\n",
"Epoch 56/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 57/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 58/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 59/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 60/100, Train Loss: 1.8100, Val Loss: 1.8873, Time: 2.05s\n",
"Epoch 61/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 62/100, Train Loss: 1.8068, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 63/100, Train Loss: 1.8012, Val Loss: 1.8873, Time: 2.32s\n",
"Epoch 64/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 65/100, Train Loss: 1.8109, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 66/100, Train Loss: 1.8030, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 67/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 68/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 69/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 70/100, Train Loss: 1.8019, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 71/100, Train Loss: 1.8025, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 72/100, Train Loss: 1.8124, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 73/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 74/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 75/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 76/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 77/100, Train Loss: 1.8141, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 78/100, Train Loss: 1.8092, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 79/100, Train Loss: 1.8106, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 80/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 81/100, Train Loss: 1.8142, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 82/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 83/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 84/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 85/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 86/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 87/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 88/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 89/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 90/100, Train Loss: 1.8047, Val Loss: 1.8873, Time: 2.42s\n",
"Epoch 91/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 92/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.37s\n",
"Epoch 93/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 94/100, Train Loss: 1.8031, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 95/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.07s\n",
"Epoch 96/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.20s\n"
]
}
],
"source": [
"# Training loop\n",
"\n",
"for epoch in range(params['epochs']):\n",
" epoch_start_time = time.time()\n",
" model.train()\n",
" \n",
" train_loss = 0.0\n",
" \n",
" for batch in train_loader:\n",
" optimizer.zero_grad()\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
"\n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])\n",
" optimizer.step()\n",
" preds = outputs\n",
" \n",
" train_loss += loss.item()\n",
"\n",
" train_loss /= len(train_loader)\n",
" \n",
" # Validation\n",
" model.eval()\n",
" val_loss = 0.0\n",
" \n",
" with torch.no_grad():\n",
" for batch in val_loader:\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" preds = outputs\n",
" \n",
" val_loss += loss.item()\n",
"\n",
" val_loss /= len(val_loader)\n",
" \n",
" epoch_end_time = time.time()\n",
" \n",
" print(f'Epoch {epoch+1}/{params[\"epochs\"]}, '\n",
" f'Train Loss: {train_loss:.4f}, '\n",
" f'Val Loss: {val_loss:.4f}, '\n",
" f'Time: {epoch_end_time - epoch_start_time:.2f}s')\n",
"\n",
" "
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}