!!!WARNING!!! Nuclear refactoring bomb incoming (Now 90% more confusing but 100% cleaner)

main
Felix Jan Michael Mucha 2025-02-15 17:16:34 +01:00
parent 556ed1c292
commit 2ff92b9e15
38 changed files with 15114 additions and 164515 deletions

137
BERT.py 100644

@ -0,0 +1,137 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import BertForSequenceClassification, AutoTokenizer
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class CustomBert(nn.Module):
def __init__(self,dropout):
super().__init__()
# BERT + custom layers (the BertForSequenceClassification output is a ModelOutput, not a plain tuple)
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(2,1)
# self.sm = nn.Softmax(dim=1)
def forward(self, input_ids, attention_mask):
x = self.bfsc(input_ids, attention_mask = attention_mask)
x = self.dropout(x[0])
x = self.classifier(x)
x = x.squeeze()
return x
def freeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(False)
def unfreeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(True)
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 128,
# Training
"epochs": 10,
"patience": 7,
"batch_size": 32,
"learning_rate": 0.001,
"weight_decay": 5e-4 ,
# Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6
}
# Configs
MODEL_NAME = 'BERT.pt'
HIST_NAME = 'BERT_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
FREEZE_BERT = False
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Initialize BertTokenizer from Pretrained
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
print("Tokenizer Initialized")
# Dataset and DataLoader
train_dataset = Datasets.BertDataset(tokenizer, data_split['train']['X'], data_split['train']['y'], max_len=params["max_len"])
val_dataset = Datasets.BertDataset(tokenizer, data_split['val']['X'], data_split['val']['y'], max_len=params["max_len"])
test_dataset = Datasets.BertDataset(tokenizer, data_split['test']['X'], data_split['test']['y'], max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = CustomBert(dropout=params["dropout"])
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"], bert_freeze=FREEZE_BERT, is_bert=True)
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist, is_bert=True)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,44 +0,0 @@
import torch
import torch.nn as nn
import numpy as np
class BalancedCELoss(nn.Module):
def __init__(self, alpha=0.1):
super(BalancedCELoss, self).__init__()
self.bce_loss = nn.CrossEntropyLoss()
self.alpha = alpha
def forward(self, predictions, targets):
# detect num of unique classes
num_classes = len(torch.unique(targets))
if num_classes == 1:
# If only one class is present, expand predictions into two class columns
predictions = torch.cat((1 - predictions, predictions), dim=1)
# Calculate the standard binary cross-entropy loss
bce_loss = self.bce_loss(predictions, targets)
predictions = torch.argmax(predictions, dim=1)
# Calculate the number of predictions for each class
class_0_preds_n = predictions[predictions == 0]
class_1_preds_n = predictions[predictions == 1]
# Calculate the number of labels for each class based on predictions
class_0_labels_n = targets[targets == 0]
class_1_labels_n = targets[targets == 1]
preds_ratio_0 = len(class_0_preds_n) / len(predictions)
preds_ratio_1 = len(class_1_preds_n) / len(predictions)
labels_ratio_0 = len(class_0_labels_n) / len(targets)
labels_ratio_1 = len(class_1_labels_n) / len(targets)
# Calculate the imbalance penalty
imbalance_penalty = np.abs(preds_ratio_0 - labels_ratio_0) + np.abs(preds_ratio_1 - labels_ratio_1)
# Combine the BCE loss with the imbalance penalty
total_loss = bce_loss + self.alpha * imbalance_penalty
return total_loss
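A toy calculation of the imbalance penalty that the removed BalancedCELoss adds on top of the cross-entropy term (illustrative sketch; the logits and targets are invented):

loss_fn = BalancedCELoss(alpha=0.1)
logits = torch.tensor([[2.0, 0.1], [1.5, 0.2], [1.2, 0.9], [1.1, 1.0]])  # every row argmaxes to class 0
targets = torch.tensor([0, 0, 1, 1])
# prediction ratios 1.0 / 0.0 vs. label ratios 0.5 / 0.5 -> penalty = |1.0 - 0.5| + |0.0 - 0.5| = 1.0
print(loss_fn(logits, targets))  # CrossEntropyLoss(logits, targets) + 0.1 * 1.0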

File diff suppressed because one or more lines are too long

147
CNN.py 100644

@ -0,0 +1,147 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
SEED = 501
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
class EnhancedCNNRegressor(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNRegressor, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
# Convolutional layers with batch normalization
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters), # batch normalization
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout) # dropout after each conv block
)
for fs in filter_sizes
])
# Fully-Connected Layer
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # wider dense layer
self.fc2 = nn.Linear(128, 1) # output layer (regression)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling collapses the sequence dimension
x = torch.cat(conv_outputs, 1) # concatenate features from all filter sizes
x = torch.relu(self.fc1(x)) # additional dense layer
x = self.dropout(x)
return self.fc2(x).squeeze(1)
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 280,
# Training
"epochs": 25,
"patience": 7,
"batch_size": 32,
"learning_rate": 0.001,
"weight_decay": 5e-4 ,
# Model
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"dropout": 0.6
}
# Configs
MODEL_NAME = 'CNN.pt'
HIST_NAME = 'CNN_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Dataset and DataLoader
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNRegressor(
vocab_size=vocab_size,
embedding_dim=EMBEDDING_DIM,
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# save training history
hist.save_history(HIST_NAME)
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,227 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from dataset_generator import create_embedding_matrix, split_data, load_preprocess_data
from HumorDataset import TextDataset
from BalancedCELoss import BalancedCELoss
import matplotlib.pyplot as plt
import numpy as np
# Hyperparameters and configurations
params = {
"embedding_dim": 100,
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"batch_size": 32,
"learning_rate": 0.001,
"epochs": 25,
"glove_path": 'data/glove.6B.100d.txt',
"max_len": 280,
"test_size": 0.1,
"val_size": 0.1,
"patience": 5,
"data_path": 'data/hack.csv',
"dropout": 0.6,
"weight_decay": 5e-4,
"alpha": 0.1 # Alpha für die Balance in der Loss-Funktion
}
# CNN model for binary classification
class EnhancedCNNBinaryClassifier(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNBinaryClassifier, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters),
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout)
)
for fs in filter_sizes
])
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)
self.fc2 = nn.Linear(128, 2) # 2 classes, hence 2 outputs for CrossEntropyLoss
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1)
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]
x = torch.cat(conv_outputs, 1)
x = torch.relu(self.fc1(x))
x = self.dropout(x)
return self.fc2(x) # 2 outputs; CrossEntropyLoss applies the softmax
# Visualization functions
def visualize_predictions(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
true_values = np.array(true_values)
predicted_values = np.array(predicted_values)
correct_indices = true_values == predicted_values
incorrect_indices = ~correct_indices
# Scatterplot
plt.scatter(
np.arange(len(true_values))[correct_indices],
true_values[correct_indices],
color='green',
label='Correctly predicted'
)
plt.scatter(
np.arange(len(true_values))[incorrect_indices],
true_values[incorrect_indices],
color='red',
label='Incorrectly predicted'
)
plt.axhline(0.5, linestyle='--', color='blue', label='Threshold (0.5)')
plt.ylim(-0.5, 1.5)
plt.yticks([0, 1], labels=['Class 0', 'Class 1'])
plt.xlabel('Data index')
plt.ylabel('Classification')
plt.title('Correct vs. incorrect predictions')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
def visualize_distribution(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Compute class frequencies
true_counts = np.bincount(true_values, minlength=2)
predicted_counts = np.bincount(predicted_values, minlength=2)
# Create bar plot
labels = ['Class 0', 'Class 1']
x = np.arange(len(labels))
plt.bar(x - 0.2, true_counts, width=0.4, color='skyblue', label='True values', edgecolor='black')
plt.bar(x + 0.2, predicted_counts, width=0.4, color='salmon', label='Predicted values', edgecolor='black')
plt.title('Distribution of true values and predictions')
plt.xticks(x, labels)
plt.ylabel('Frequency')
plt.xlabel('Classes')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load data
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
)
X, y = load_preprocess_data(path_data=params["data_path"])
# Split data
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNBinaryClassifier(
vocab_size=vocab_size,
embedding_dim=params["embedding_dim"],
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
model = model.to(device)
# Use BalancedCELoss
criterion = BalancedCELoss(alpha=params["alpha"])
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
# Training
history = {
"train_loss": [],
"val_loss": [],
"train_acc": [],
"val_acc": [],
}
for epoch in range(params["epochs"]):
model.train()
train_loss, correct, total = 0.0, 0, 0
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
for X_batch, y_batch in pbar:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
loss.backward()
optimizer.step()
train_loss += loss.item()
predicted = torch.argmax(outputs, dim=1)
correct += (predicted == y_batch).sum().item()
total += y_batch.size(0)
pbar.set_postfix({"Train Loss": loss.item()})
train_acc = correct / total
history["train_loss"].append(train_loss / len(train_loader))
history["train_acc"].append(train_acc)
# Validation
model.eval()
val_loss, correct, total = 0.0, 0, 0
with torch.no_grad():
for X_batch, y_batch in val_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
val_loss += loss.item()
predicted = torch.argmax(outputs, dim=1)
correct += (predicted == y_batch).sum().item()
total += y_batch.size(0)
val_acc = correct / total
history["val_loss"].append(val_loss / len(val_loader))
history["val_acc"].append(val_acc)
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
print(f"Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")
# Test and visualize
model.eval()
test_correct, test_total = 0, 0
true_labels, predicted_labels = [], []
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
outputs = model(X_batch)
predicted = torch.argmax(outputs, dim=1)
true_labels.extend(y_batch.cpu().numpy())
predicted_labels.extend(predicted.cpu().numpy())
test_correct += (predicted == y_batch).sum().item()
test_total += y_batch.size(0)
test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy:.4f}")
# Visualize predictions (scatter plot)
visualize_predictions(true_labels, predicted_labels)
# Visualize distribution (bar plot)
visualize_distribution(true_labels, predicted_labels)


@ -1,316 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm # progress bar library
from dataset_generator import create_embedding_matrix, split_data
from HumorDataset import TextRegDataset
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
# Hyperparameters and configurations
params = {
"embedding_dim": 100,
"filter_sizes": [2, 3, 4, 5], # additional filter size
"num_filters": 150, # more filters
"batch_size": 32,
"learning_rate": 0.001,
"epochs": 25,
"glove_path": 'data/glove.6B.100d.txt', # path to GloVe
"max_len": 280,
"test_size": 0.1,
"val_size": 0.1,
"patience": 5,
"data_path": 'data/hack.csv', # path to the data
"dropout": 0.6, # increased dropout
"weight_decay": 5e-4 # L2 regularization
}
# EarlyStopping class with directory check
class EarlyStopping:
def __init__(self, patience=5, verbose=False):
self.patience = patience
self.verbose = verbose
self.counter = 0
self.best_score = None
self.early_stop = False
def __call__(self, val_loss, model):
score = -val_loss
if self.best_score is None:
self.best_score = score
self.save_checkpoint(val_loss, model)
elif score < self.best_score:
self.counter += 1
if self.counter >= self.patience:
self.early_stop = True
else:
self.best_score = score
self.save_checkpoint(val_loss, model)
self.counter = 0
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
directory = "checkpoints"
if not os.path.exists(directory):
os.makedirs(directory) # create the directory if it does not exist
if self.verbose:
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), os.path.join(directory, filename))
# Plot function for training
def plot_learning_curves(history):
epochs = range(1, len(history['train_loss']) + 1)
# Loss-Plot
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, history['train_loss'], label='Train Loss')
plt.plot(epochs, history['val_loss'], label='Val Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
# RMSE-Plot
plt.subplot(1, 2, 2)
plt.plot(epochs, history['train_rmse'], label='Train RMSE')
plt.plot(epochs, history['val_rmse'], label='Val RMSE')
plt.xlabel('Epochs')
plt.ylabel('RMSE')
plt.title('Training and Validation RMSE')
plt.legend()
plt.tight_layout()
plt.show()
# Visualize the target variable (scores)
def visualize_data_distribution(y):
print("\n--- Target variable: statistics ---")
print(f"Min: {np.min(y)}, Max: {np.max(y)}")
print(f"Mean: {np.mean(y):.4f}, standard deviation: {np.std(y):.4f}")
# Plot histogram
plt.figure(figsize=(10, 6))
plt.hist(y, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of the target variable (scores)')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Function to load and preprocess the data
def load_preprocess_data(path_data='data/hack.csv'):
# Load data
df = pd.read_csv(path_data)
# Drop rows with missing values in the target column
df = df.dropna(subset=['humor_rating'])
# Extract the target variable from the 'humor_rating' column
df['y'] = df['humor_rating'].astype(float) # ensure the target variable is numeric
# Assign input texts and target variable
X = df['text']
y = df['y']
# Debug output for verification
print(f"First target values: {y.head(10)}")
print(f"Target variable dtype: {y.dtype}")
print(f"Number of examples: {len(X)}")
return X, y
# CNN model for regression with additional regularization
class EnhancedCNNRegressor(nn.Module):
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
super(EnhancedCNNRegressor, self).__init__()
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
# Convolutional layers with batch normalization
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
nn.BatchNorm2d(num_filters), # batch normalization
nn.ReLU(),
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
nn.Dropout(dropout) # dropout after each conv block
)
for fs in filter_sizes
])
# Fully-Connected Layer
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # wider dense layer
self.fc2 = nn.Linear(128, 1) # output layer (regression)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling collapses the sequence dimension
x = torch.cat(conv_outputs, 1) # concatenate features from all filter sizes
x = torch.relu(self.fc1(x)) # additional dense layer
x = self.dropout(x)
return self.fc2(x).squeeze(1)
# Set device to CPU (re-assigned below if CUDA is available)
device = torch.device("cpu")
print(f"Using device: {device}")
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
)
X, y = load_preprocess_data(path_data=params["data_path"])
# Visualize the data
visualize_data_distribution(y)
# Split the data
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
# Dataset and DataLoader
train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = EnhancedCNNRegressor(
vocab_size=vocab_size,
embedding_dim=params["embedding_dim"],
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
early_stopping = EarlyStopping(patience=params["patience"], verbose=True)
# Storage for training metrics
history = {
"train_loss": [],
"val_loss": [],
"train_rmse": [],
"val_rmse": [],
}
# Training and validation
for epoch in range(params["epochs"]):
model.train()
train_loss = 0.0
train_preds, train_labels = [], []
# Progress bar for training within an epoch
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
for X_batch, y_batch in pbar:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
optimizer.zero_grad()
predictions = model(X_batch).float()
loss = criterion(predictions, y_batch)
loss.backward()
optimizer.step()
train_loss += loss.item()
# Store true and predicted values for metrics
train_preds.extend(predictions.cpu().detach().numpy())
train_labels.extend(y_batch.cpu().detach().numpy())
# Update the progress display
pbar.set_postfix({"Train Loss": loss.item()})
train_rmse = np.sqrt(mean_squared_error(train_labels, train_preds)) # RMSE
history["train_loss"].append(train_loss / len(train_loader))
history["train_rmse"].append(train_rmse)
# Validation
model.eval()
val_loss = 0.0
val_preds, val_labels = [], []
with torch.no_grad():
for X_batch, y_batch in val_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
loss = criterion(predictions, y_batch)
val_loss += loss.item()
val_preds.extend(predictions.cpu().detach().numpy())
val_labels.extend(y_batch.cpu().detach().numpy())
val_rmse = np.sqrt(mean_squared_error(val_labels, val_preds)) # RMSE
history["val_loss"].append(val_loss / len(val_loader))
history["val_rmse"].append(val_rmse)
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
print(f"Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# Plot the learning curves
plot_learning_curves(history)
# Function to visualize correct and incorrect predictions
def visualize_predictions(true_values, predicted_values):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
correct_indices = np.isclose(true_values, predicted_values, atol=0.3) # treated as correct if the difference is <= 0.3
# Plot
plt.scatter(true_values[correct_indices], predicted_values[correct_indices], color='green', label='Correctly predicted')
plt.scatter(true_values[~correct_indices], predicted_values[~correct_indices], color='red', label='Incorrectly predicted')
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal line')
plt.xlabel('True values')
plt.ylabel('Predicted values')
plt.title('Correct vs. incorrect predictions')
plt.legend()
plt.grid(True)
plt.show()
# Test Evaluation
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
test_preds.extend(predictions.cpu().detach().numpy())
test_labels.extend(y_batch.cpu().detach().numpy())
# Convert to NumPy arrays
true_values = np.array(test_labels)
predicted_values = np.array(test_preds)
# Visualize the results
visualize_predictions(true_values, predicted_values)
# RMSE, MAE and R² score for the test set
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_mae = mean_absolute_error(test_labels, test_preds)
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
# plot distribution of predicted values and true values
plt.figure(figsize=(10, 6))
plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
plt.title('Distribution of Predicted and True Values')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

69
Datasets.py 100644

@ -0,0 +1,69 @@
"""
This file contains the dataset classes (GloveDataset and BertDataset).
"""
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset
from transformers import AutoTokenizer
class GloveDataset(Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.float)
return input_ids, label
class BertDataset(Dataset):
def __init__(self,tokenizer:AutoTokenizer, texts, labels, max_len:int=128):
super(BertDataset,self).__init__()
self.tokenizer = tokenizer
self.max_length = max_len
self.text = texts.to_numpy()
self.labels = labels.to_numpy()
def __getitem__(self,idx:int):
text = self.text[idx]
labels = self.labels[idx]
encoding = self.tokenizer(
text,
padding="max_length",
return_attention_mask = True,
max_length=self.max_length,
truncation = True,
return_tensors = 'pt'
)
input_ids = encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
return {
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
'labels':torch.tensor(labels,dtype=torch.float)
}
def __len__(self):
return len(self.labels)
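A small sketch of what the two dataset classes yield per item (illustrative; the texts, labels, and word_index below are invented, and word_tokenize needs the nltk 'punkt' data):

import pandas as pd
texts = pd.Series(["a short joke", "another joke"])
labels = pd.Series([1.2, 3.4])
word_index = {"a": 0, "short": 1, "joke": 2, "another": 3, "<UNK>": 4, "<PAD>": 5}
ids, label = GloveDataset(texts, labels, word_index, max_len=6)[0]
print(ids, label)        # tensor([0, 1, 2, 5, 5, 5]) tensor(1.2000) -- padded index tensor plus float label
bert_item = BertDataset(AutoTokenizer.from_pretrained("bert-base-uncased"), texts, labels, max_len=8)[0]
print(sorted(bert_item)) # ['attention_mask', 'input_ids', 'labels'] -- dict-style items instead of tuples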


@ -1,12 +1,14 @@
import torch
import os
class EarlyStopping:
def __init__(self, patience=5, verbose=False):
class EarlyStoppingCallback:
def __init__(self, model_name, patience=5, verbose=False):
self.patience = patience
self.verbose = verbose
self.counter = 0
self.best_score = None
self.early_stop = False
self.model_name = model_name
def __call__(self, val_loss, model):
score = -val_loss
@ -22,7 +24,10 @@ class EarlyStopping:
self.save_checkpoint(val_loss, model)
self.counter = 0
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
def save_checkpoint(self, val_loss, model):
directory = "models/checkpoints"
if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist
if self.verbose:
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), f'checkpoints/{filename}')
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
torch.save(model.state_dict(), os.path.join(directory, self.model_name))
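A usage sketch for the renamed EarlyStoppingCallback (illustrative; 'demo.pt' and the RMSE sequence are invented, and every improvement writes models/checkpoints/demo.pt):

stopper = EarlyStoppingCallback(model_name='demo.pt', patience=2, verbose=True)
model = torch.nn.Linear(4, 1)                    # any module with a state_dict() works here
for val_rmse in [1.00, 0.90, 0.95, 0.93, 0.94]:
    stopper(val_rmse, model)
    if stopper.early_stop:                       # set after two epochs without improvement
        print("Early stopping triggered.")
        break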


@ -1,111 +0,0 @@
"""
This file contains the HumorDataset class.
"""
import torch
import numpy as np
from nltk.tokenize import word_tokenize
class TextRegDataset(torch.utils.data.Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.float)
return input_ids, label
class TextDataset(torch.utils.data.Dataset):
def __init__(self, texts, labels, word_index, max_len=50):
self.original_indices = labels.index.to_list()
self.texts = texts.reset_index(drop=True)
self.labels = labels.reset_index(drop=True)
self.word_index = word_index
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
texts = self.texts[idx]
tokens = word_tokenize(texts.lower())
label = self.labels[idx]
# Tokenize and convert to indices
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
# Pad or truncate to max_len
if len(input_ids) < self.max_len:
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
else:
input_ids = input_ids[:self.max_len]
# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
label = torch.tensor(label, dtype=torch.long)
return input_ids, label
class HumorDataset(torch.utils.data.Dataset):
def __init__(self, data, labels, vocab_size=0, emb_dim=None):
self.original_indices = labels.index.to_list()
self.data = data
self.labels = labels.reset_index(drop=True)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
# TODO: bug fix
self.shape = self.get_shape()
def __getitem__(self, idx):
item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
return item
def __len__(self):
return len(self.labels)
def get_single_shape(self, data):
shape_data = None
if type(data) == list:
shape_data = len(data[0])
elif type(data) == torch.Tensor:
shape_data = data[0].shape
elif type(data) == np.ndarray:
shape_data = data[0].shape
return shape_data
def get_shape(self):
shape_data = self.get_single_shape(self.data)
shape_labels = self.get_single_shape(self.labels)
return shape_data, shape_labels


196
Transformer.py 100644

@ -0,0 +1,196 @@
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import Datasets
import dataset_helper
import EarlyStopping
import ml_helper
import ml_history
import ml_train
class PositionalEncoding(nn.Module):
"""
https://pytorch.org/tutorials/beginner/transformer_tutorial.html
"""
def __init__(self, d_model, vocab_size=5000, dropout=0.1):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(vocab_size, d_model)
position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float()
* (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer("pe", pe)
def forward(self, x):
x = x + self.pe[:, : x.size(1), :]
return self.dropout(x)
class TransformerBinaryClassifier(nn.Module):
"""
Text model based on a PyTorch TransformerEncoder (used here with a single linear output for regression).
"""
def __init__(
self,
embeddings,
nhead=8,
dim_feedforward=2048,
num_layers=6,
positional_dropout=0.1,
classifier_dropout=0.1,
):
super().__init__()
vocab_size, d_model = embeddings.size()
assert d_model % nhead == 0, "nheads must divide evenly into d_model"
self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
self.pos_encoder = PositionalEncoding(
d_model=d_model,
dropout=positional_dropout,
vocab_size=vocab_size,
)
encoder_layer = nn.TransformerEncoderLayer(
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=classifier_dropout,
)
self.transformer_encoder = nn.TransformerEncoder(
encoder_layer,
num_layers=num_layers,
)
# normalize to stabilize and stop overfitting
self.batch_norm = nn.BatchNorm1d(d_model)
self.classifier = nn.Linear(d_model, 1)
self.d_model = d_model
def forward(self, x):
x = self.emb(x) * math.sqrt(self.d_model)
x = self.pos_encoder(x)
x = self.transformer_encoder(x)
x = x.mean(dim=1)
# normalize to stabilize and stop overfitting
#x = self.batch_norm(x)
#NOTE: no activation function for regression
x = self.classifier(x)
x = x.squeeze(1)
return x
if __name__ == '__main__':
# Hyperparameters and configurations
params = {
# Config
"max_len": 280,
# Training
"epochs": 25,
"patience": 7,
"batch_size": 32,
"learning_rate": 1e-4, # 1e-4
"weight_decay": 5e-4 ,
# Model
'nhead': 2, # 5
"dropout": 0.2,
'hidden_dim': 2048,
'num_layers': 6
}
# TODO set seeds
# Configs
MODEL_NAME = 'transformer.pt'
HIST_NAME = 'transformer_history'
GLOVE_PATH = 'data/glove.6B.100d.txt'
DATA_PATH = 'data/hack.csv'
EMBEDDING_DIM = 100
TEST_SIZE = 0.1
VAL_SIZE = 0.1
# Load and prepare data
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
# Split the data
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
# Dataset and DataLoader
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Initialize model
model = TransformerBinaryClassifier(
embeddings=embedding_matrix,
nhead=params['nhead'],
dim_feedforward=params['hidden_dim'],
num_layers=params['num_layers'],
positional_dropout=params["dropout"],
classifier_dropout=params["dropout"],
)
device = ml_helper.get_device(verbose=True, include_mps=False)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"])
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
hist = ml_history.History()
# Training and validation
for epoch in range(params["epochs"]):
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
early_stopping(val_rmse, model)
if early_stopping.early_stop:
print("Early stopping triggered.")
break
# save training history
hist.save_history(HIST_NAME)
# Load best model
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
# Test Evaluation
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
hist.add_test_results(test_labels, test_preds)
# save training history
hist.save_history(HIST_NAME)
# RMSE, MAE and R² score for the test set
test_mae = mean_absolute_error(test_labels, test_preds)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


@ -1,266 +0,0 @@
# PyTorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# scikit-learn Imports
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
# Bert imports
from transformers import BertForSequenceClassification, AutoTokenizer
#Default imports (pandas, numpy, matplotlib, etc.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Select Device
if torch.cuda.is_available():
DEVICE = torch.device("cuda")
else:
DEVICE = torch.device("cpu")
class SimpleHumorDataset(Dataset):
def __init__(self,tokenizer:AutoTokenizer,dataframe:pd.DataFrame,max_length:int=128):
super(SimpleHumorDataset,self).__init__()
self.tokenizer = tokenizer
self.max_length = max_length
self.text = dataframe['text'].to_numpy()
self.labels = dataframe['is_humor'].to_numpy()
def __getitem__(self,idx:int):
text = self.text[idx]
labels = self.labels[idx]
encoding = self.tokenizer(
text,
padding="max_length",
return_attention_mask = True,
max_length=self.max_length,
truncation = True,
return_tensors = 'pt'
)
input_ids = encoding['input_ids'].flatten()
attention_mask = encoding['attention_mask'].flatten()
return {
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
'labels':torch.tensor(labels,dtype=torch.long)
}
def __len__(self):
return len(self.labels)
class CustomBert(nn.Module):
def __init__(self,dropout):
super().__init__()
# BERT + custom layers (the BertForSequenceClassification output is a ModelOutput, not a plain tuple)
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(2,2)
# self.sm = nn.Softmax(dim=1)
def forward(self, input_ids, attention_mask):
x = self.bfsc(input_ids, attention_mask = attention_mask)
x = self.dropout(x[0])
x = self.classifier(x)
return x
def freeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(False)
def unfreeze_bert_params(self):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(True)
def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim.AdamW,train_loader:DataLoader,freeze_bert:bool=False):
model.train()
if freeze_bert:
model.freeze_bert_params()
total_loss = 0
len_train_loader = len(train_loader)
for train_batch in train_loader:
# Set Gradient to Zero
optimizer.zero_grad()
# Unpack batch values and "push" it to GPU
input_ids, att_mask, labels = train_batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE),labels.to(DEVICE)
# Feed Model with Data
outputs = model(input_ids, attention_mask=att_mask)
# print(f"{model.bfsc.}")
# print(f"{outputs.shape}")
loss = criterion(outputs,labels)
loss.backward()
optimizer.step()
total_loss+=loss.item()
print(f"Training Loss is {(total_loss/len(train_loader)):.4f}")
return (total_loss/len(train_loader))
def eval_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,validation_loader:DataLoader):
model.eval()
total, correct = 0.0, 0.0
total_loss = 0.0
best_loss = float("Inf")
with torch.no_grad():
for val_batch in validation_loader:
input_ids, att_mask ,labels = val_batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE), labels.to(DEVICE)
outputs = model(input_ids,attention_mask=att_mask)
loss = criterion(outputs,labels)
total_loss += loss.item()
predictions = torch.argmax(outputs,1)
total += labels.size(0)
correct += (predictions == labels).sum().item()
if total_loss/len(validation_loader) < best_loss:
best_loss = total_loss/len(validation_loader)
torch.save(model,"best_bert_model.pt")
print(f"Validation Loss: {total_loss/len(validation_loader):.4f} ### Validation Accuracy {correct/total*100:.4f}%")
return total_loss/len(validation_loader)
def test_loop(model:CustomBert, test_loader:DataLoader):
for batch in test_loader:
input_ids, att_mask, labels = batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE), att_mask.to(DEVICE), labels.to(DEVICE)
with torch.no_grad():
model = torch.load("best_bert_model")
model.to(DEVICE)
output = model(input_ids,att_mask)
output = output.detach().cpu().numpy()
labels = labels.detach().cpu().numpy()
pred_flat = np.argmax(output, 1).flatten()
print(accuracy_score(labels,pred_flat))
def plot_metrics_loss_n_acc(train_loss,validation_loss,train_acc,validation_acc):
"""
Plots the loss and accuracy of the training and validation data for the given model instance.
"""
# Visualize Training Loss
# plt.plot(loss_values)
# plt.plot(eval_values)
# plt.hlines(np.mean(loss_values),xmin=0,xmax=EPOCH,colors='red',linestyles="dotted",label="Average Loss")
# plt.hlines(np.mean(eval_values),xmin=0,xmax=EPOCH,colors='green',linestyles="dashed",label="Average Val Loss")
# plt.title("Test Loss")
# plt.xlabel("Num Epochs")
# plt.ylabel("Total Loss of Epoch")
# plt.show()
pass
def plot_test_metrics(accuracy):
"""
Plot test metrics of the model (confusion matrix, accuracy).
"""
plt.plot(accuracy)
plt.hlines(np.mean(accuracy), 0, len(accuracy), 'red', 'dotted', f'Mean Accuracy {np.mean(accuracy):.2f}')
plt.title("Accuracy of Test")
plt.xlabel("Num Epochs")
plt.ylabel("Accurcy 0.0 - 1.0")
plt.grid(True)
plt.legend()
plt.show()
# def performance_metrics(true_labels,predictions):
# confusion_matrix(true_labels,predictions)
# accuracy_score(true_labels,predictions)
# f1_score(true_labels,predictions)
# pass
def create_datasets(tokenizer:AutoTokenizer,dataframe:pd.DataFrame,train_split_ratio:float,val:bool=False)->tuple[SimpleHumorDataset,SimpleHumorDataset,SimpleHumorDataset]|tuple[SimpleHumorDataset,SimpleHumorDataset]:
if train_split_ratio > 1.0:
raise AssertionError("Trainsplit sollte kleiner(-gleich) 1.0 sein")
train,test = train_test_split(dataframe,train_size=train_split_ratio,random_state=501)
if val:
test,validation = train_test_split(test,train_size=.5,random_state=501)
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test), SimpleHumorDataset(tokenizer,validation)
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test)
def create_dataloaders(datasets:tuple|list,batchsize:int,shufflelist:list):
train_loader = DataLoader(datasets[0],batchsize,shuffle=shufflelist[0])
test_loader = DataLoader(datasets[1],batchsize,shuffle=shufflelist[1])
if len(datasets) == 3:
return train_loader, test_loader, DataLoader(datasets[2],batchsize,shuffle=shufflelist[2])
return train_loader, test_loader
# if __name__ == "__main__":
# # HYPERPARAMETERS
# # Set Max Epoch Amount
# EPOCH = 10
# # DROPOUT-PROBABILITY
# DROPOUT = 0.1
# # BATCHSIZE
# BATCH_SIZE = 16
# #LEARNING RATE
# LEARNING_RATE = 1e-5
# # RANDOM SEED
# RNDM_SEED = 501
# # FREEZE Bert Layers
# FREEZE = True
# torch.manual_seed(RNDM_SEED)
# np.random.seed(RNDM_SEED)
# torch.cuda.manual_seed_all(RNDM_SEED)
# Initialize Bert Model with dropout probability and port to DEVICE
# mybert = CustomBert(DROPOUT)
# print("Bert Initialized")
# mybert.to(DEVICE)
# Read Raw Data from csv and save as DataFrame
# df = pd.read_csv("./data/hack.csv",encoding="latin1")
# print("Raw Data read")
# Initialize BertTokenizer from Pretrained
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
# print("Tokenizer Initialized")
# Split DataFrame into Train and Test Sets
# Create Custom Datasets for Train and Test
# train_data,test_data,validation_data = create_datasets(tokenizer,df,.7,True)
# print("Splitted Data in Train and Test Sets")
# print("Custom Datasets created")
# Initialize Dataloader with Train and Test Sets
# train_loader, test_loader, validation_loader = create_dataloaders([train_data,test_data,validation_data],batchsize=BATCH_SIZE,shufflelist=[True,True,False])
# print("DataLoaders created")
# Set criterion to Cross Entropy and define Adam Optimizer with model parameters and learning rate
# criterion_cross_entropy = nn.CrossEntropyLoss()
# optimizer_adamW = optim.Adam(mybert.parameters(), lr = LEARNING_RATE)
# import time
# Set Scheduler for dynamically Learning Rate adjustment
# loss_values, eval_values = np.zeros(EPOCH), np.zeros(EPOCH)
# for epoch in range(EPOCH):
# start = time.time()
# print(f"For {epoch+1} the Scores are: ")
# loss_values[epoch] = training_loop(mybert,optimizer=optimizer_adamW,criterion=criterion_cross_entropy,train_loader=train_loader,freeze_bert=FREEZE)
# eval_values[epoch] = eval_loop(mybert,criterion=criterion_cross_entropy,validation_loader=test_loader)
# end = time.time()
# print((end-start),"seconds per epoch needed")
# plot_metrics_loss_n_acc("x","x","x","x")
# for epoch in range(EPOCH):
# test_loop(mybert,validation_loader)
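A small sketch of what freeze_bert_params / unfreeze_bert_params toggle in the removed script's CustomBert (illustrative; it still downloads the bert-base-uncased checkpoint):

m = CustomBert(dropout=0.1)
m.freeze_bert_params()
print([n for n, p in m.named_parameters() if p.requires_grad])  # only the custom head, e.g. ['classifier.weight', 'classifier.bias']
m.unfreeze_bert_params()
print(all(p.requires_grad for p in m.parameters()))             # True -- the full model is trainable again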

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1,207 +0,0 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch
import os
import copy
import regex as re
import HumorDataset
# def load_glove_embeddings(glove_file_path):
# embeddings_index = {}
# with open(glove_file_path, 'r', encoding='utf-8') as f:
# for line in f:
# try:
# values = line.split()
# #print(values)
# word = values[0]
# coefs = np.asarray(values[1:], dtype='float32')
# embeddings_index[word] = coefs
# except ValueError:
# print('Error with line:', line[:100])
# return embeddings_index
def load_glove_embeddings(glove_file_path, emb_len=100):
embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
# Use regex to split the line into word and coefficients
match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
# regex explanation: Match word followed by one or more spaces and then the coefficients
if match:
word = match.group(1)
coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')
#check list length
if len(coefs) != emb_len:
print('Skip: Length mismatch with line:', line[:100])
else:
embeddings_index[word] = coefs
else:
print('Error with line:', line[:100])
except ValueError:
print('Error with line:', line[:100])
return embeddings_index
def create_embbedings_matrix(embeddings_glove, max_len=100):
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)
# Create a word index (vocabulary)
word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
# Special tokens are in the word index
word_index['<UNK>'] = len(word_index) - 2
word_index['<PAD>'] = len(word_index) - 1
# print len of word_index
print(len(word_index))
# Create an embedding matrix
embedding_dim = len(next(iter(embeddings_glove.values())))
embedding_matrix = np.zeros((len(word_index), embedding_dim))
for word, idx in word_index.items():
embedding_vector = embeddings_glove.get(word)
if embedding_vector is not None:
embedding_matrix[idx] = embedding_vector
# Convert the embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
return embedding_matrix, word_index
def create_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)
embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove)
vocab_size = len(embedding_matrix)
d_model = len(embedding_matrix[0])
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
return embedding_matrix, word_index, vocab_size, d_model
def load_preprocess_data(path_data='data/hack.csv'):
df = pd.read_csv(path_data)
df = df.dropna(subset=['humor_rating'])
# find median of humor_rating
median_rating = df['humor_rating'].median()
df['y'] = df['humor_rating'] > median_rating
X = df['text']
y = df['y']
return X, y
def encode_tokens(tokens, embedding_index, default_vector_len=100):
return [embedding_index.get(token, np.zeros(default_vector_len)) for token in tokens]
def pad_sequences(sequences, max_len, pad_index):
return np.array([np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=pad_index) if len(seq) < max_len else seq[:max_len] for seq in sequences])
def split_data(X, y, test_size=0.1, val_size=0.1):
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
val_split_ratio = val_size / (test_size + val_size)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)
ret_dict = {
'train': {'X': X_train, 'y': y_train},
'test': {'X': X_test, 'y': y_test},
'val': {'X': X_val, 'y': y_val}
}
# for each print len
for key in ret_dict.keys():
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict
def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
if not os.path.exists(path):
print('Creating directory:', path)
os.makedirs(path)
print('saving data into:', path)
for key, value in data_dict.items():
# transform to Dataset
dataset = HumorDataset.HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
# save dataset
torch.save(dataset, path + prefix + key + '.pt')
if __name__ == "__main__":
# Load the data from csv
df = pd.read_csv('data/hack.csv')
print(df.shape)
df = df.dropna(subset=['humor_rating'])
# find median of humor_rating
median_rating = df['humor_rating'].median()
#print('median and therefore middle of humor_rating:', median_rating)
df['y'] = df['humor_rating'] > median_rating
# transform data into dataset
X = df['text']
y = df['y']
# Tokenize the data with nltk
tokens = [word_tokenize(text.lower()) for text in X]
vocab_size = len(set([word for sentence in tokens for word in sentence]))
print('vocab size:', vocab_size)
# Pad the sequences
# NOTE: Info comes from data explore notebook: 280 is max length,
# 139 contains 80% and 192 contains 95% of the data
max_len = 280
padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')
# split data into train, test, and validation
data_dict = split_data(padded_indices, y)
# data_idx_based = copy.deepcopy(data_dict)
# vector_based = False
# for key in data_idx_based.keys():
# data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
# # print shape of data
# #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
# # save the data
# save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
print('loading GloVe embeddings')
# Load GloVe embeddings
glove_file_path = 'glove.6B/glove.6B.100d.txt'
#glove_file_path = 'glove.840B.300d/glove.840B.300d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)
emb_len = 100
print('starting with embedding the data')
# Encode the tokens
#for key in data_dict.keys():
#data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=emb_len) for tokens in data_dict[key]['X']]
# print shape of data
#print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
# Save the data
#save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
max_len = 100
gloVe_path = 'glove.6B/glove.6B.100d.txt'
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=max_len)
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)

102
dataset_helper.py 100644

@ -0,0 +1,102 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import regex as re
def load_glove_embeddings(glove_file_path, emb_len=100):
embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
# Use regex to split the line into word and coefficients
match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
# regex explanation: Match word followed by one or more spaces and then the coefficients
if match:
word = match.group(1)
coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')
#check list length
if len(coefs) != emb_len:
print('Skip: Length mismatch with line:', line[:100])
else:
embeddings_index[word] = coefs
else:
print('Error with line:', line[:100])
except ValueError:
print('Error with line:', line[:100])
return embeddings_index
def create_embbedings_matrix(embeddings_glove, max_len=100):
embeddings_glove['<UNK>'] = np.random.rand(max_len)
embeddings_glove['<PAD>'] = np.zeros(max_len)
# Create a word index (vocabulary)
word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
# Special tokens are in the word index
word_index['<UNK>'] = len(word_index) - 2
word_index['<PAD>'] = len(word_index) - 1
# print len of word_index
print(len(word_index))
# Create an embedding matrix
embedding_dim = len(next(iter(embeddings_glove.values())))
embedding_matrix = np.zeros((len(word_index), embedding_dim))
for word, idx in word_index.items():
embedding_vector = embeddings_glove.get(word)
if embedding_vector is not None:
embedding_matrix[idx] = embedding_vector
# Convert the embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
return embedding_matrix, word_index
def get_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)
embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove, max_len=emb_len)
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
return embedding_matrix, word_index, vocab_size, d_model
def load_preprocess_data(path_data='data/hack.csv', verbose=False):
# load the data
df = pd.read_csv(path_data)
# drop rows with missing values in the target column
df = df.dropna(subset=['humor_rating'])
# use the 'humor_rating' column as the regression target
df['y'] = df['humor_rating'].astype(float) # make sure the target is numeric
# assign input texts and target variable
X = df['text']
y = df['y']
if verbose:
print(f"First target values: {y.head(10)}")
print(f"Target dtype: {y.dtype}")
print(f"Number of examples: {len(X)}")
return X, y
def split_data(X, y, test_size=0.1, val_size=0.1):
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
val_split_ratio = val_size / (test_size + val_size)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)
ret_dict = {
'train': {'X': X_train, 'y': y_train},
'test': {'X': X_test, 'y': y_test},
'val': {'X': X_val, 'y': y_val}
}
# print the number of samples in each split
for key in ret_dict.keys():
print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))
return ret_dict
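To make the helpers above concrete, here is a small usage sketch (it assumes the GloVe file exists at the default path; the sentence and variable names are illustrative):

import torch

# embedding_matrix: (vocab_size, emb_len) float tensor, word_index: token -> row index
embedding_matrix, word_index, vocab_size, d_model = get_embedding_matrix(
    gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100)

tokens = ['the', 'cat', 'sat', 'somenonsenseword', '<PAD>']
# unknown tokens fall back to the <UNK> row
indices = [word_index.get(tok, word_index['<UNK>']) for tok in tokens]
vectors = embedding_matrix[torch.tensor(indices)]  # shape: (5, 100)
print(indices, vectors.shape)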

File diff suppressed because it is too large

View File

@ -1,129 +0,0 @@
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd
import matplotlib.patches as mpatches
def get_accuracy(outputs, labels):
correct = np.array([p == l for p, l in zip(outputs, labels)])
accuracy = correct.sum() / len(labels)
return accuracy
def get_f1_score(outputs, labels):
outputs = torch.tensor(outputs)
labels = torch.tensor(labels)
f1 = f1_score(labels, outputs)
return f1
def plot_confusion_matrix(outputs, labels, class_names=['No Humor', 'Humor'], title='Confusion Matrix'):
conf_matrix = confusion_matrix(labels, outputs)
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title(title)
return plt
def get_label_distribution(labels, preds):
# Calculate wrong predictions
wrong_preds = np.array(labels) != np.array(preds)
# Calculate the number of wrong predictions for each class
class_0_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 0)
class_1_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 1)
# Calculate the total number of wrong predictions
total_wrong_preds = np.sum(wrong_preds)
# Calculate and print the ratio of wrong predictions for each class
class_0_ratio = class_0_wrong_preds / total_wrong_preds
class_1_ratio = class_1_wrong_preds / total_wrong_preds
print(f"Class 0: {class_0_ratio:.2f}")
print(f"Class 1: {class_1_ratio:.2f}")
def plot_training_history(history, title='Training History'):
hist_data = history.get_history()
epochs = range(1, len(hist_data['train_loss']) + 1)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Plot accuracy
axs[1].plot(epochs, hist_data['train_acc'], label='Train Accuracy')
axs[1].plot(epochs, hist_data['val_acc'], label='Validation Accuracy')
axs[1].set_title('Accuracy')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Accuracy')
axs[1].legend()
# Plot loss
axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
axs[0].set_title('Loss')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].legend()
plt.tight_layout()
plt.suptitle(title)
return plt
def load_data(filepath):
"""
Load the data from a CSV file.
"""
df = pd.read_csv(filepath)
#print(df.shape)
return df
def process_data(df, test_dataset, all_preds, all_labels):
"""
Process the data to prepare it for plotting.
"""
df_test = df.iloc[test_dataset.original_indices].copy()
df_test['prediction'] = all_preds
df_test['label'] = all_labels
df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])
df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)
return df_test_sorted
def plot_rating_df_based(df_test_sorted, title='Humor Rating vs Prediction for Test Set'):
"""
Plot the results of the predictions.
"""
median_rating = df_test_sorted['humor_rating'].median()
median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]
#print(median_idx)
range_idx = range(len(df_test_sorted))
colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})
plt.figure(figsize=(12, 6))
plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)
plt.axvline(x=median_idx, color='black', linestyle='--')
green_patch = mpatches.Patch(color='g', label='Correct Prediction')
red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')
line_patch = mpatches.Patch(color='black', label='humor_rating cut off')
plt.title(title)
plt.xlabel('Index')
plt.ylabel('Humor Rating')
plt.legend(handles=[green_patch, red_patch, line_patch])
return plt
def plot_rating_preds(all_preds, all_labels,
test_dataset,
title='Humor Rating vs Prediction for Test Set',
data_path = 'data/hack.csv'):
data = load_data(data_path)
df_test_sorted = process_data(data, test_dataset, all_preds, all_labels)
plt = plot_rating_df_based(df_test_sorted, title=title)
return plt
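For reference, the removed evaluation helpers operate on flat lists of binary predictions and labels; a minimal usage sketch with synthetic values (illustrative only):

preds = [1, 0, 1, 1, 0, 1]
labels = [1, 0, 0, 1, 0, 1]

acc = get_accuracy(preds, labels)   # 5 of 6 correct -> ~0.833
f1 = get_f1_score(preds, labels)
print(f"accuracy: {acc:.3f}, f1: {f1:.3f}")

cm_plot = plot_confusion_matrix(preds, labels, title='Demo Confusion Matrix')
# cm_plot.show()  # or cm_plot.savefig('confusion_matrix.png') in a script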

View File

@ -5,40 +5,41 @@ import time
import json
import os
def get_device(verbose=False):
def get_device(verbose=False, include_mps=False):
"""
Get the current device (MPS, CPU or GPU) for PyTorch.
"""
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if verbose:
print('Using device:', device)
if include_mps:
device = torch.device("mps" if torch.backends.mps.is_available() else device)
return device
def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None,**kwargs):
def save_model_and_hyperparams(model, model_prefix_name, rmse, hyperparameters, timestamp=None):
"""
Save the model and hyperparameters to disk.
**kwargs: hyperparameters to save
hyperparameters: dictionary containing hyperparameters to save
"""
# Create a timestamp
if timestamp is None:
timestamp = time.strftime("%Y%m%d-%H%M%S")
accuracy = round(accuracy, 4)
rmse = round(rmse, 4)
# Save the model state dictionary
model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
model_path = f'models/{model_prefix_name}_acc_{rmse}_{timestamp}.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}.")
# Save the hyperparameters as a JSON file
hyperparameters = kwargs
hyperparameters['accuracy'] = accuracy
hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
hyperparameters['rmse'] = rmse
hyperparameters_path = f'models/{model_prefix_name}_para_acc_{rmse}_{timestamp}.json'
with open(hyperparameters_path, 'w') as f:
json.dump(hyperparameters, f)
print(f"Hyperparameters saved to {hyperparameters_path}.")
def get_newest_model_path(path, name=None, extension=".pth"):
def get_newest_file(path, name=None, extension=".pth"):
"""
Get the newest file in a directory.
"""

View File

@ -1,70 +1,115 @@
import numpy as np
import torch
from sklearn.metrics import mean_squared_error
from datetime import datetime
import json
import os
class History:
"""
Class to store the history of the training process.
Used to store the loss and accuracy of the training and validation sets.
Used to store the loss and rmse of the training and validation sets.
"""
def __init__(self):
self.history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
'val_labels': [],
# val_preds contains structs {epoch: [preds], ...}
'val_preds': [],
# only needed in the end not in training
'test_labels': [],
'test_preds': [],
}
self.batch_history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
}
def update(self):
self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
self.history['train_acc'].append(np.mean(self.batch_history['train_acc']))
self.history['val_acc'].append(np.mean(self.batch_history['val_acc']))
if self.batch_history['train_loss']:
self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
if self.batch_history['val_loss']:
self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
if self.batch_history['train_rmse']:
self.history['train_rmse'].append(np.mean(self.batch_history['train_rmse']))
if self.batch_history['val_rmse']:
self.history['val_rmse'].append(np.mean(self.batch_history['val_rmse']))
def get_history(self):
return self.history
def calculate_accuracy(self, outputs, labels):
preds = torch.argmax(outputs, dim=1)
correct = (preds == labels).sum().item()
accuracy = correct / len(labels)
return accuracy
def calculate_rmse(self, outputs, labels):
return np.sqrt(mean_squared_error(labels, outputs))
def batch_reset(self):
self.batch_history = {
'train_loss': [],
'val_loss': [],
'train_acc': [],
'val_acc': [],
'train_rmse': [],
'val_rmse': [],
}
def batch_update(self, train_loss, val_loss, train_acc, val_acc):
def batch_update(self, train_loss, val_loss, train_rmse, val_rmse):
self.batch_history['train_loss'].append(train_loss)
self.batch_history['val_loss'].append(val_loss)
self.batch_history['train_acc'].append(train_acc)
self.batch_history['val_acc'].append(val_acc)
self.batch_history['train_rmse'].append(train_rmse)
self.batch_history['val_rmse'].append(val_rmse)
def batch_update_train(self, train_loss, preds, labels):
train_acc = self.calculate_accuracy(preds, labels)
train_rmse = self.calculate_rmse(preds, labels)
self.batch_history['train_loss'].append(train_loss)
self.batch_history['train_acc'].append(train_acc)
self.batch_history['train_rmse'].append(train_rmse)
def batch_update_val(self, val_loss, preds, labels):
val_acc = self.calculate_accuracy(preds, labels)
def batch_update_val(self, val_loss, preds, labels, epoch):
val_rmse = self.calculate_rmse(preds, labels)
self.batch_history['val_loss'].append(val_loss)
self.batch_history['val_acc'].append(val_acc)
self.batch_history['val_rmse'].append(val_rmse)
self.history['val_labels'] = labels.tolist()
self.history['val_preds'].append({epoch: preds.tolist()})
def get_batch_history(self):
return self.batch_history
def print_history(self, epoch, max_epochs, time_elapsed, verbose=True):
if verbose:
print(f'Epoch {epoch:>3}/{max_epochs} - {time_elapsed:.2f}s - loss: {self.history["train_loss"][-1]:.4f} - accuracy: {self.history["train_acc"][-1]:.4f} - val_loss: {self.history["val_loss"][-1]:.4f} - val_accuracy: {self.history["val_acc"][-1]:.4f}')
def add_test_results(self, test_labels, test_preds):
self.history['test_labels'] = test_labels
self.history['test_preds'] = test_preds
def convert_hist(self):
# Needed for saving the history to a json file:
# convert numpy arrays to lists and use float instead of numpy float
history_to_save = {}
for hist_key, hist_val in self.history.items():
if hist_key == 'val_preds':
history_to_save[hist_key] = [{k: [float(x) for x in v] for k, v in val.items()} for val in hist_val]
else:
history_to_save[hist_key] = [float(x) for x in hist_val]
return history_to_save
def save_history(self, hist_name):
directory = "histories"
if not os.path.exists(directory):
os.makedirs(directory) # Create the directory if it does not exist
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json")
# Needed for saving the history to a json file:
# convert numpy arrays to lists and use float instead of numpy float
history_to_save = self.convert_hist()
with open(filepath, 'w') as f:
json.dump(history_to_save, f, indent=4)
print(f"History saved to {filepath}")

75
ml_plots.py 100644
View File

@ -0,0 +1,75 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
def save_plot(plt, plot_name):
if not os.path.exists('plots'):
os.makedirs('plots')
# create timestamp
time_stamp = time.strftime('%Y%m%d-%H%M%S')
plt.savefig(f'plots/{plot_name}_{time_stamp}.png')
def plot_training_history(hist_data, title='Training History', save=True):
epochs = range(1, len(hist_data['train_loss']) + 1)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Plot RMSE
axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE')
axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE')
axs[1].set_title('RMSE')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('RMSE')
axs[1].legend()
# Plot loss
axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
axs[0].set_title('Loss')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].legend()
plt.tight_layout()
plt.suptitle(title)
# save plot
if save:
save_plot(plt, title)
return plt
def plot_distribution(true_values, predicted_values, title='Distribution of Predicted and True Values', save=True):
plt.figure(figsize=(10, 6))
plt.hist(true_values, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
plt.hist(predicted_values, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
plt.title(title)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
# save plot
if save:
save_plot(plt, title)
return plt
def plot_predictions(true_values, predicted_values, title='True vs Predicted Values', threshold=0.3, save=True):
plt.figure(figsize=(10, 6))
# Difference between predicted and true values
correct_indices = np.isclose(true_values, predicted_values, atol=threshold)
incorrect_indices = ~correct_indices
# Plot
plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices], color='green', label='Correctly predicted')
plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices], color='red', label='Incorrectly predicted')
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal Line')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title(title)
plt.legend()
plt.grid(True)
# save plot
if save:
save_plot(plt, title)
return plt
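A quick sketch of how the plotting helpers above can be called on a history dict and raw predictions (synthetic values, illustrative only; save=False avoids writing files to plots/):

import numpy as np
import matplotlib.pyplot as plt

hist_data = {
    'train_loss': [1.2, 0.9, 0.7], 'val_loss': [1.3, 1.0, 0.9],
    'train_rmse': [1.1, 0.95, 0.83], 'val_rmse': [1.15, 1.0, 0.94],
}
plot_training_history(hist_data, title='Demo Training History', save=False)

true_vals = np.random.uniform(0, 5, size=50)
pred_vals = true_vals + np.random.normal(0, 0.4, size=50)
plot_distribution(true_vals, pred_vals, save=False)
plot_predictions(true_vals, pred_vals, threshold=0.3, save=False)
plt.show()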

87
ml_train.py 100644
View File

@ -0,0 +1,87 @@
from tqdm import tqdm
import torch
import numpy as np
def train_epoch(model, train_loader, criterion, optimizer, device, history, epoch, total_epochs, bert_freeze=False, is_bert=False):
model.train()
if bert_freeze and hasattr(model, 'freeze_bert_params'):
model.freeze_bert_params()
with tqdm(train_loader, desc=f"├ Epoch {epoch + 1}/{total_epochs}") as pbar:
for batch in pbar:
optimizer.zero_grad()
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
labels = y_batch
loss = criterion(predictions, labels)
loss.backward()
optimizer.step()
preds = predictions.detach().cpu().numpy()
labels = labels.detach().cpu().numpy()
history.batch_update_train(loss.item(), preds, labels)
# Update progress bar
pbar.set_postfix({"Train Loss": loss.item()})
history.update()
history.batch_reset()
def validate_epoch(model, val_loader, epoch, criterion, device, history, is_bert=False):
model.eval()
val_loss = 0.0
val_preds, val_labels = [], []
with torch.no_grad():
for batch in val_loader:
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
labels = y_batch
predictions = model(X_batch).float()
loss = criterion(predictions, labels)
val_loss += loss.item()
val_preds.extend(predictions.cpu().detach().numpy())
val_labels.extend(labels.cpu().detach().numpy())
val_rmse = history.calculate_rmse(np.array(val_preds), np.array(val_labels))
history.batch_update_val(val_loss / len(val_loader), np.array(val_preds), np.array(val_labels), epoch)
history.update()
history.batch_reset()
return val_rmse
def test_loop(model, test_loader, device, is_bert=False):
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
for batch in test_loader:
if is_bert:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device).float()
predictions = model(input_ids, attention_mask=attention_mask).float()
else:
X_batch, y_batch = batch
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
labels = y_batch
predictions = model(X_batch).float()
test_preds.extend(predictions.cpu().detach().numpy())
test_labels.extend(labels.cpu().detach().numpy())
return test_labels, test_preds
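A compact sketch of how these loops fit together for the non-BERT case, using a tiny synthetic regression problem (illustrative only; it assumes the History class from ml_history.py in this commit, and the real scripts additionally wire in early stopping and the dataset classes):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import ml_history

# synthetic regression data: 64 samples, 10 features
X = torch.randn(64, 10)
y = torch.randn(64)
loader = DataLoader(TensorDataset(X, y), batch_size=16, shuffle=True)

# tiny regression model whose output is flattened to shape (batch,)
model = nn.Sequential(nn.Linear(10, 8), nn.ReLU(), nn.Linear(8, 1), nn.Flatten(0))
device = torch.device('cpu')
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
history = ml_history.History()

for epoch in range(3):
    train_epoch(model, loader, criterion, optimizer, device, history, epoch, total_epochs=3)
    val_rmse = validate_epoch(model, loader, epoch, criterion, device, history)
    print(f"epoch {epoch}: val RMSE {val_rmse:.4f}")

test_labels, test_preds = test_loop(model, loader, device)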

View File

@ -0,0 +1,24 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"# TODO: compare"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,187 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the data\n",
"with open('data/pun_anno/pun_het.json') as f:\n",
" data_het = json.load(f)\n",
"\n",
"with open('data/pun_anno/pun_hom.json') as f:\n",
" data_hom = json.load(f)\n",
"\n",
"with open('data/pun_annotated.json') as f:\n",
" data_anno = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Create a DataFrame\n",
"df_anno = pd.DataFrame(data_anno)\n",
"\n",
"df_het = pd.DataFrame(data_het)\n",
"# df switch columns to rows\n",
"df_het = df_het.T\n",
"\n",
"df_hom = pd.DataFrame(data_hom)\n",
"# df switch columns to rows\n",
"df_hom = df_hom.T"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"1 het_837\n",
"2 het_635\n",
"3 hom_657\n",
"4 het_1275\n",
" ... \n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1897 het_100\n",
"1898 hom_364\n",
"Name: ID, Length: 1899, dtype: object\n",
"Index(['het_991', 'het_990', 'het_987', 'het_982', 'het_980', 'het_978',\n",
" 'het_973', 'het_958', 'het_956', 'het_955',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=1146)\n",
"Index(['hom_998', 'hom_996', 'hom_994', 'hom_993', 'hom_992', 'hom_990',\n",
" 'hom_99', 'hom_985', 'hom_984', 'hom_981',\n",
" ...\n",
" 'hom_2221', 'hom_2223', 'hom_2225', 'hom_2226', 'hom_2230', 'hom_2232',\n",
" 'hom_2234', 'hom_2243', 'hom_2246', 'hom_2247'],\n",
" dtype='object', length=1443)\n"
]
}
],
"source": [
"# print index for each df\n",
"print(df_anno['ID'])\n",
"print(df_het.index)\n",
"print(df_hom.index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(655, 8) (1146, 11) (1899, 8)\n",
"(825, 8) (1443, 11) (1899, 8)\n"
]
}
],
"source": [
"# find matches from df_anno['ID'] to df_het.index\n",
"df_het_match = df_anno[df_anno['ID'].isin(df_het.index)]\n",
"print(df_het_match.shape, df_het.shape, df_anno.shape)\n",
"\n",
"# find matches from df_anno['ID'] to df_hom.index\n",
"df_hom_match = df_anno[df_anno['ID'].isin(df_hom.index)]\n",
"print(df_hom_match.shape, df_hom.shape, df_anno.shape)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"3 hom_657\n",
"6 hom_1510\n",
"7 hom_955\n",
"8 hom_1505\n",
" ... \n",
"1893 hom_151\n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1898 hom_364\n",
"Name: ID, Length: 1244, dtype: object\n",
"Index(['het_955', 'het_907', 'het_905', 'het_786', 'het_783', 'het_777',\n",
" 'het_639', 'het_573', 'het_466', 'het_435',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=491)\n"
]
}
],
"source": [
"# print not matched IDs and index\n",
"print(df_anno[~df_anno['ID'].isin(df_het.index)]['ID'])\n",
"print(df_het.index[~df_het.index.isin(df_anno['ID'])])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# merge df_anno and df_het where ID matches with index\n",
"df_het_merge = pd.merge(df_anno, df_het, left_on='ID', right_index=True)\n",
"# score_avg \n",
"df_het_merge['score_avg'] = df_het_merge['Funniness (1-5)'].apply(lambda x: np.mean(x))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -1,584 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KuFFT6LrB6Fe"
},
"outputs": [],
"source": [
"import time\n",
"import json\n",
"import math\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import DataLoader\n",
"from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
"\n",
"from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n",
"from sklearn.model_selection import KFold\n",
"# local imports\n",
"import ml_evaluation as ml_eval\n",
"import ml_helper\n",
"import ml_history\n",
"import dataset_generator as data_gen\n",
"# class imports\n",
"import HumorDataset as humor_ds\n",
"import EarlyStopping\n",
"import BalancedCELoss\n",
"\n",
"\n",
"# architecture inspired:\n",
"# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n",
"\n",
"# TODO: maybe KFold for cross validation?\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
}
],
"source": [
"torch.manual_seed(0)\n",
"np.random.seed(0)\n",
"\n",
"\n",
"best_model_filename = 'best_transformer_reg_model.pt'\n",
"\n",
"device = ml_helper.get_device(verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"400002\n",
"vocab_size: 400002, d_model: 100\n",
"vocab_size: 400002, d_model: 100\n"
]
}
],
"source": [
"embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n",
"\n",
"vocab_size = len(embedding_matrix)\n",
"d_model = len(embedding_matrix[0])\n",
"vocab_size, d_model = embedding_matrix.size()\n",
"print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class PositionalEncoding(nn.Module):\n",
" \"\"\"\n",
" https://pytorch.org/tutorials/beginner/transformer_tutorial.html\n",
" \"\"\"\n",
"\n",
" def __init__(self, d_model, vocab_size=5000, dropout=0.1):\n",
" super().__init__()\n",
" self.dropout = nn.Dropout(p=dropout)\n",
"\n",
" pe = torch.zeros(vocab_size, d_model)\n",
" position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)\n",
" div_term = torch.exp(\n",
" torch.arange(0, d_model, 2).float()\n",
" * (-math.log(10000.0) / d_model)\n",
" )\n",
" pe[:, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 1::2] = torch.cos(position * div_term)\n",
" pe = pe.unsqueeze(0)\n",
" self.register_buffer(\"pe\", pe)\n",
"\n",
" def forward(self, x):\n",
" x = x + self.pe[:, : x.size(1), :]\n",
" return self.dropout(x)\n",
"\n",
"\n",
"class TransformerBinaryClassifier(nn.Module):\n",
" \"\"\"\n",
" Text classifier based on a pytorch TransformerEncoder.\n",
" \"\"\"\n",
"\n",
" def __init__(\n",
" self,\n",
" embeddings,\n",
" nhead=8,\n",
" dim_feedforward=2048,\n",
" num_layers=6,\n",
" positional_dropout=0.1,\n",
" classifier_dropout=0.1,\n",
" activation=\"relu\",\n",
" ):\n",
"\n",
" super().__init__()\n",
"\n",
" vocab_size, d_model = embeddings.size()\n",
" assert d_model % nhead == 0, \"nheads must divide evenly into d_model\"\n",
"\n",
" self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)\n",
"\n",
" self.pos_encoder = PositionalEncoding(\n",
" d_model=d_model,\n",
" dropout=positional_dropout,\n",
" vocab_size=vocab_size,\n",
" )\n",
"\n",
" encoder_layer = nn.TransformerEncoderLayer(\n",
" d_model=d_model,\n",
" nhead=nhead,\n",
" dim_feedforward=dim_feedforward,\n",
" dropout=classifier_dropout,\n",
" )\n",
" self.transformer_encoder = nn.TransformerEncoder(\n",
" encoder_layer,\n",
" num_layers=num_layers,\n",
" )\n",
" # normalize to stabilize and stop overfitting\n",
" self.batch_norm = nn.BatchNorm1d(d_model)\n",
" self.classifier = nn.Linear(d_model, 1)\n",
" self.d_model = d_model\n",
" #self.softmax = nn.Softmax(dim=1)\n",
" #self.sigmoid = nn.Sigmoid()\n",
"\n",
" def forward(self, x):\n",
" x = self.emb(x) * math.sqrt(self.d_model)\n",
" x = self.pos_encoder(x)\n",
" x = self.transformer_encoder(x)\n",
" x = x.mean(dim=1)\n",
" # normalize to stabilize and stop overfitting\n",
" #x = self.batch_norm(x)\n",
"\n",
" #NOTE: no activation function for regression\n",
" # sigmoid would only distort the output\n",
" x = self.classifier(x)\n",
" \n",
" return x\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def load_preprocess_data(path_data='data/hack.csv'):\n",
" df = pd.read_csv(path_data)\n",
" df = df.dropna(subset=['humor_rating'])\n",
"\n",
" df['y'] = df['humor_rating']\n",
" X = df['text']\n",
" y = df['y']\n",
" return X, y"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train 3945 3945\n",
"test 494 494\n",
"val 493 493\n"
]
}
],
"source": [
"X,y = load_preprocess_data()\n",
"\n",
"ret_dict = data_gen.split_data(X, y)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set hyper params"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model created\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
" warnings.warn(\n"
]
}
],
"source": [
"params = {\n",
" # used for class balancing\n",
" 'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)\n",
" # training parameters\n",
" 'batch_size': 32, # 32 (16 to 64)\n",
" 'epochs': 100, # 100\n",
" 'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)\n",
" \n",
" # NOTE: used for gradient clipping (needed for lstm and transformer)\n",
" # use 0 to disable\n",
" 'clipping_max_norm': 0, # 0 (0.5 to 2.0)\n",
" \n",
" # patience for early stopping\n",
" 'early_stopping_patience': 5, # 5 (3 to 10)\n",
"\n",
" # learning rate scheduler\n",
" 'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n",
" 'lr_scheduler_patience': 3, # 3 (2 to 5)\n",
"\n",
" # model parameters\n",
" 'nhead': 2, # 5\n",
" 'num_layers': 3, # 6\n",
" 'hidden_dim': 10, # 50\n",
"\n",
" # regularization parameters\n",
" 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
" 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n",
"}\n",
"\n",
"# Model initialization\n",
"model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n",
" nhead=params['nhead'], \n",
" num_layers=params['num_layers'], \n",
" dim_feedforward=params['hidden_dim'],\n",
" positional_dropout=params['positional_dropout'],\n",
" classifier_dropout=params['classifier_dropout']\n",
" )\n",
"model.to(device)\n",
"print('model created')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### create datasets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"datasets length: 3945 493\n",
"train: 124, val: 16, test: 16\n"
]
}
],
"source": [
"# NOTE: Info comes from data explore notebook: 280 is max length,\n",
"# 139 contains 80% and 192 contains 95% of the data\n",
"max_len = 280\n",
"\n",
"train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n",
"val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n",
"test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n",
"\n",
"print('datasets length:', len(train_dataset), len(val_dataset))\n",
"#NOTE: overfitting test\n",
"#train_dataset.labels = train_dataset.labels[:100]\n",
"#train_dataset.texts = train_dataset.texts[:100]\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n",
"val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n",
"\n",
"# NOTE: samller because of batches not all data\n",
"print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set training requirements"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#TODO: change to RMSE\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"loss = torch.sqrt(criterion(x, y))\n",
"loss.backward()\n",
"print(x.grad)\n",
"\"\"\"\n",
"criterion = nn.MSELoss()\n",
"\n",
"optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n",
" lr=params['lr']) #, \n",
" #weight_decay=params['weight_decay'])\n",
"\"\"\"\n",
"scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n",
" factor=params['lr_scheduler_factor'],\n",
" patience=params['lr_scheduler_patience'],\n",
" verbose=True)\n",
"\"\"\"\n",
"early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training loop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s\n",
"Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s\n",
"Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s\n",
"Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 16/100, Train Loss: 1.8097, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 17/100, Train Loss: 1.8081, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 18/100, Train Loss: 1.8078, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 19/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 20/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 21/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 22/100, Train Loss: 1.8103, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 23/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 24/100, Train Loss: 1.8034, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 25/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.46s\n",
"Epoch 26/100, Train Loss: 1.8084, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 27/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 28/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 29/100, Train Loss: 1.8136, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 30/100, Train Loss: 1.8051, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 31/100, Train Loss: 1.8026, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 32/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 33/100, Train Loss: 1.8121, Val Loss: 1.8873, Time: 2.13s\n",
"Epoch 34/100, Train Loss: 1.8098, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 35/100, Train Loss: 1.8036, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 36/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 37/100, Train Loss: 1.8108, Val Loss: 1.8873, Time: 2.50s\n",
"Epoch 38/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.45s\n",
"Epoch 39/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 40/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 41/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 42/100, Train Loss: 1.8088, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 43/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 44/100, Train Loss: 1.8029, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 45/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 46/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 47/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 48/100, Train Loss: 1.8069, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 49/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 50/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 51/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 52/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 53/100, Train Loss: 1.8075, Val Loss: 1.8873, Time: 2.00s\n",
"Epoch 54/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 55/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.02s\n",
"Epoch 56/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 57/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 58/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 59/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 60/100, Train Loss: 1.8100, Val Loss: 1.8873, Time: 2.05s\n",
"Epoch 61/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 62/100, Train Loss: 1.8068, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 63/100, Train Loss: 1.8012, Val Loss: 1.8873, Time: 2.32s\n",
"Epoch 64/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 65/100, Train Loss: 1.8109, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 66/100, Train Loss: 1.8030, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 67/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 68/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 69/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 70/100, Train Loss: 1.8019, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 71/100, Train Loss: 1.8025, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 72/100, Train Loss: 1.8124, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 73/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 74/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 75/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 76/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 77/100, Train Loss: 1.8141, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 78/100, Train Loss: 1.8092, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 79/100, Train Loss: 1.8106, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 80/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 81/100, Train Loss: 1.8142, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 82/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 83/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 84/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 85/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 86/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 87/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 88/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 89/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 90/100, Train Loss: 1.8047, Val Loss: 1.8873, Time: 2.42s\n",
"Epoch 91/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 92/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.37s\n",
"Epoch 93/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 94/100, Train Loss: 1.8031, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 95/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.07s\n",
"Epoch 96/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.20s\n"
]
}
],
"source": [
"# Training loop\n",
"\n",
"for epoch in range(params['epochs']):\n",
" epoch_start_time = time.time()\n",
" model.train()\n",
" \n",
" train_loss = 0.0\n",
" \n",
" for batch in train_loader:\n",
" optimizer.zero_grad()\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
"\n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])\n",
" optimizer.step()\n",
" preds = outputs\n",
" \n",
" train_loss += loss.item()\n",
"\n",
" train_loss /= len(train_loader)\n",
" \n",
" # Validation\n",
" model.eval()\n",
" val_loss = 0.0\n",
" \n",
" with torch.no_grad():\n",
" for batch in val_loader:\n",
" input_ids, labels = batch\n",
" input_ids, labels = input_ids.to(device), labels.to(device).float() \n",
" outputs = model(input_ids)\n",
" outputs = outputs.squeeze().float()\n",
" loss = criterion(outputs, labels)\n",
" preds = outputs\n",
" \n",
" val_loss += loss.item()\n",
"\n",
" val_loss /= len(val_loader)\n",
" \n",
" epoch_end_time = time.time()\n",
" \n",
" print(f'Epoch {epoch+1}/{params[\"epochs\"]}, '\n",
" f'Train Loss: {train_loss:.4f}, '\n",
" f'Val Loss: {val_loss:.4f}, '\n",
" f'Time: {epoch_end_time - epoch_start_time:.2f}s')\n",
"\n",
" "
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}