!!!WARNING!!! Nuclear refactoring bomb incoming (now 90% more confusing but 100% cleaner)
parent 556ed1c292
commit 2ff92b9e15
|
|
@@ -0,0 +1,137 @@
|
|||
import random
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||
from transformers import BertForSequenceClassification, AutoTokenizer
|
||||
import numpy as np
|
||||
|
||||
import Datasets
|
||||
import dataset_helper
|
||||
import EarlyStopping
|
||||
import ml_helper
|
||||
import ml_history
|
||||
import ml_train
|
||||
|
||||
SEED = 501
|
||||
random.seed(SEED)
|
||||
np.random.seed(SEED)
|
||||
torch.manual_seed(SEED)
|
||||
torch.cuda.manual_seed_all(SEED)
|
||||
torch.backends.cudnn.deterministic = True
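# make cuDNN deterministic so runs are reproducible (may reduce speed)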
|
||||
|
||||
class CustomBert(nn.Module):
|
||||
def __init__(self,dropout):
|
||||
super().__init__()
|
||||
# BERT backbone plus custom layers (recent transformers versions return a ModelOutput object instead of a plain tuple; [0] indexing still yields the logits)
|
||||
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
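# BertForSequenceClassification adds a 2-way classification head by default, so it emits 2 logits per example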
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.classifier = nn.Linear(2,1)
|
||||
# self.sm = nn.Softmax(dim=1)
|
||||
|
||||
def forward(self, input_ids, attention_mask):
|
||||
x = self.bfsc(input_ids, attention_mask = attention_mask)
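# x[0] is the logits tensor of shape [batch_size, 2]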
|
||||
x = self.dropout(x[0])
|
||||
x = self.classifier(x)
|
||||
x = x.squeeze()
|
||||
return x
|
||||
|
||||
|
||||
def freeze_bert_params(self):
|
||||
for param in self.bfsc.named_parameters():
|
||||
param[1].requires_grad_(False)
|
||||
|
||||
def unfreeze_bert_params(self):
|
||||
for param in self.bfsc.named_parameters():
|
||||
param[1].requires_grad_(True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Hyperparameters and configuration
|
||||
params = {
|
||||
# Config
|
||||
"max_len": 128,
|
||||
# Training
|
||||
"epochs": 10,
|
||||
"patience": 7,
|
||||
"batch_size": 32,
|
||||
"learning_rate": 0.001,
|
||||
"weight_decay": 5e-4 ,
|
||||
# Model
|
||||
"filter_sizes": [2, 3, 4, 5],
|
||||
"num_filters": 150,
|
||||
"dropout": 0.6
|
||||
}
|
||||
|
||||
# Configs
|
||||
MODEL_NAME = 'BERT.pt'
|
||||
HIST_NAME = 'BERT_history'
|
||||
GLOVE_PATH = 'data/glove.6B.100d.txt'
|
||||
DATA_PATH = 'data/hack.csv'
|
||||
FREEZE_BERT = False
|
||||
EMBEDDING_DIM = 100
|
||||
TEST_SIZE = 0.1
|
||||
VAL_SIZE = 0.1
|
||||
|
||||
# Load and prepare the data
|
||||
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
|
||||
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
|
||||
|
||||
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
|
||||
|
||||
# Split the data
|
||||
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
|
||||
|
||||
# Initialize BertTokenizer from Pretrained
|
||||
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
|
||||
print("Tokenizer Initialized")
|
||||
|
||||
# Dataset and DataLoader
|
||||
train_dataset = Datasets.BertDataset(tokenizer, data_split['train']['X'], data_split['train']['y'], max_len=params["max_len"])
|
||||
val_dataset = Datasets.BertDataset(tokenizer, data_split['val']['X'], data_split['val']['y'], max_len=params["max_len"])
|
||||
test_dataset = Datasets.BertDataset(tokenizer, data_split['test']['X'], data_split['test']['y'], max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
||||
# Initialize the model
|
||||
model = CustomBert(dropout=params["dropout"])
|
||||
|
||||
device = ml_helper.get_device(verbose=True, include_mps=False)
|
||||
model = model.to(device)
|
||||
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
|
||||
|
||||
hist = ml_history.History()
|
||||
|
||||
# Training and validation
|
||||
for epoch in range(params["epochs"]):
|
||||
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"], bert_freeze=FREEZE_BERT, is_bert=True)
|
||||
|
||||
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist, is_bert=True)
|
||||
|
||||
early_stopping(val_rmse, model)
|
||||
if early_stopping.early_stop:
|
||||
print("Early stopping triggered.")
|
||||
break
|
||||
|
||||
# Load best model
|
||||
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
|
||||
|
||||
# Test Evaluation
|
||||
test_labels, test_preds = ml_train.test_loop(model, test_loader, device, is_bert=True)
|
||||
|
||||
hist.add_test_results(test_labels, test_preds)
|
||||
|
||||
# save training history
|
||||
hist.save_history(HIST_NAME)
|
||||
|
||||
# RMSE, MAE and R² score for the test set
|
||||
test_mae = mean_absolute_error(test_labels, test_preds)
|
||||
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
|
||||
test_r2 = r2_score(test_labels, test_preds)
|
||||
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
|
||||
|
|
@@ -1,44 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
class BalancedCELoss(nn.Module):
|
||||
def __init__(self, alpha=0.1):
|
||||
super(BalancedCELoss, self).__init__()
|
||||
self.bce_loss = nn.CrossEntropyLoss()
|
||||
self.alpha = alpha
|
||||
|
||||
def forward(self, predictions, targets):
|
||||
# detect num of unique classes
|
||||
num_classes = len(torch.unique(targets))
|
||||
if num_classes == 1:
|
||||
# If only one class is present, expand the single prediction column into two class columns
|
||||
predictions = torch.cat((1 - predictions, predictions), dim=1)
|
||||
|
||||
|
||||
# Calculate the standard cross-entropy loss
|
||||
bce_loss = self.bce_loss(predictions, targets)
|
||||
|
||||
predictions = torch.argmax(predictions, dim=1)
|
||||
|
||||
# Calculate the number of predictions for each class
|
||||
class_0_preds_n = predictions[predictions == 0]
|
||||
class_1_preds_n = predictions[predictions == 1]
|
||||
|
||||
# Calculate the number of labels for each class in the targets
|
||||
class_0_labels_n = targets[targets == 0]
|
||||
class_1_labels_n = targets[targets == 1]
|
||||
|
||||
preds_ratio_0 = len(class_0_preds_n) / len(predictions)
|
||||
preds_ratio_1 = len(class_1_preds_n) / len(predictions)
|
||||
|
||||
labels_ratio_0 = len(class_0_labels_n) / len(targets)
|
||||
labels_ratio_1 = len(class_1_labels_n) / len(targets)
|
||||
|
||||
# Calculate the imbalance penalty
|
||||
imbalance_penalty = np.abs(preds_ratio_0 - labels_ratio_0) + np.abs(preds_ratio_1 - labels_ratio_1)
|
||||
|
||||
# Combine the BCE loss with the imbalance penalty
|
||||
total_loss = bce_loss + self.alpha * imbalance_penalty
|
||||
|
||||
return total_loss
|
||||
BertFine.ipynb (417 lines changed): diff suppressed because one or more lines are too long
|
|
@@ -0,0 +1,147 @@
|
|||
import random
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||
import numpy as np
|
||||
|
||||
import Datasets
|
||||
import dataset_helper
|
||||
import EarlyStopping
|
||||
import ml_helper
|
||||
import ml_history
|
||||
import ml_train
|
||||
|
||||
SEED = 501
|
||||
random.seed(SEED)
|
||||
np.random.seed(SEED)
|
||||
torch.manual_seed(SEED)
|
||||
torch.cuda.manual_seed_all(SEED)
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
class EnhancedCNNRegressor(nn.Module):
|
||||
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
|
||||
super(EnhancedCNNRegressor, self).__init__()
|
||||
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
|
||||
|
||||
# Convolutional layers with batch normalization
|
||||
self.convs = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
|
||||
nn.BatchNorm2d(num_filters), # batch normalization
|
||||
nn.ReLU(),
|
||||
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
|
||||
nn.Dropout(dropout) # dropout after each block
|
||||
)
|
||||
for fs in filter_sizes
|
||||
])
|
||||
|
||||
# Fully-Connected Layer
|
||||
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # additional dense layer
|
||||
self.fc2 = nn.Linear(128, 1) # output layer (regression)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
|
||||
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # pooling reduces the dimensionality
|
||||
x = torch.cat(conv_outputs, 1) # concatenate the features from all filter sizes
|
||||
x = torch.relu(self.fc1(x)) # additional dense layer
|
||||
x = self.dropout(x)
|
||||
return self.fc2(x).squeeze(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Hyperparameters and configuration
|
||||
params = {
|
||||
# Config
|
||||
"max_len": 280,
|
||||
# Training
|
||||
"epochs": 25,
|
||||
"patience": 7,
|
||||
"batch_size": 32,
|
||||
"learning_rate": 0.001,
|
||||
"weight_decay": 5e-4 ,
|
||||
# Model
|
||||
"filter_sizes": [2, 3, 4, 5],
|
||||
"num_filters": 150,
|
||||
"dropout": 0.6
|
||||
}
|
||||
|
||||
# Configs
|
||||
MODEL_NAME = 'CNN.pt'
|
||||
HIST_NAME = 'CNN_history'
|
||||
GLOVE_PATH = 'data/glove.6B.100d.txt'
|
||||
DATA_PATH = 'data/hack.csv'
|
||||
EMBEDDING_DIM = 100
|
||||
TEST_SIZE = 0.1
|
||||
VAL_SIZE = 0.1
|
||||
|
||||
# Load and prepare the data
|
||||
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
|
||||
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
|
||||
|
||||
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
|
||||
|
||||
# Split the data
|
||||
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
|
||||
|
||||
# Dataset and DataLoader
|
||||
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
||||
# Initialize the model
|
||||
model = EnhancedCNNRegressor(
|
||||
vocab_size=vocab_size,
|
||||
embedding_dim=EMBEDDING_DIM,
|
||||
filter_sizes=params["filter_sizes"],
|
||||
num_filters=params["num_filters"],
|
||||
embedding_matrix=embedding_matrix,
|
||||
dropout=params["dropout"]
|
||||
)
|
||||
|
||||
|
||||
device = ml_helper.get_device(verbose=True, include_mps=False)
|
||||
model = model.to(device)
|
||||
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
|
||||
|
||||
hist = ml_history.History()
|
||||
|
||||
# Training and validation
|
||||
for epoch in range(params["epochs"]):
|
||||
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
|
||||
|
||||
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
|
||||
|
||||
early_stopping(val_rmse, model)
|
||||
if early_stopping.early_stop:
|
||||
print("Early stopping triggered.")
|
||||
break
|
||||
|
||||
# save training history
|
||||
hist.save_history(HIST_NAME)
|
||||
|
||||
# Load best model
|
||||
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
|
||||
|
||||
# Test Evaluation
|
||||
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
|
||||
|
||||
hist.add_test_results(test_labels, test_preds)
|
||||
|
||||
# save training history
|
||||
hist.save_history(HIST_NAME)
|
||||
|
||||
# RMSE, MAE and R² score for the test set
|
||||
test_mae = mean_absolute_error(test_labels, test_preds)
|
||||
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
|
||||
test_r2 = r2_score(test_labels, test_preds)
|
||||
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
|
||||
CNN_CLASS.py (227 lines)
|
|
@@ -1,227 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.metrics import accuracy_score
|
||||
from tqdm import tqdm
|
||||
from dataset_generator import create_embedding_matrix, split_data, load_preprocess_data
|
||||
from HumorDataset import TextDataset
|
||||
from BalancedCELoss import BalancedCELoss
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Hyperparameter und Konfigurationen
|
||||
params = {
|
||||
"embedding_dim": 100,
|
||||
"filter_sizes": [2, 3, 4, 5],
|
||||
"num_filters": 150,
|
||||
"batch_size": 32,
|
||||
"learning_rate": 0.001,
|
||||
"epochs": 25,
|
||||
"glove_path": 'data/glove.6B.100d.txt',
|
||||
"max_len": 280,
|
||||
"test_size": 0.1,
|
||||
"val_size": 0.1,
|
||||
"patience": 5,
|
||||
"data_path": 'data/hack.csv',
|
||||
"dropout": 0.6,
|
||||
"weight_decay": 5e-4,
|
||||
"alpha": 0.1 # Alpha für die Balance in der Loss-Funktion
|
||||
}
|
||||
|
||||
# CNN-Modell für binäre Klassifikation
|
||||
class EnhancedCNNBinaryClassifier(nn.Module):
|
||||
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
|
||||
super(EnhancedCNNBinaryClassifier, self).__init__()
|
||||
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
|
||||
self.convs = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
|
||||
nn.BatchNorm2d(num_filters),
|
||||
nn.ReLU(),
|
||||
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
|
||||
nn.Dropout(dropout)
|
||||
)
|
||||
for fs in filter_sizes
|
||||
])
|
||||
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)
|
||||
self.fc2 = nn.Linear(128, 2) # 2 Klassen, daher 2 Outputs für CrossEntropyLoss
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.embedding(x).unsqueeze(1)
|
||||
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]
|
||||
x = torch.cat(conv_outputs, 1)
|
||||
x = torch.relu(self.fc1(x))
|
||||
x = self.dropout(x)
|
||||
return self.fc2(x) # 2 Outputs, CrossEntropyLoss übernimmt die Softmax
|
||||
|
||||
# Visualisierungsfunktionen
|
||||
def visualize_predictions(true_values, predicted_values):
|
||||
plt.figure(figsize=(10, 6))
|
||||
|
||||
# Unterschied zwischen vorhergesagten und wahren Werten
|
||||
true_values = np.array(true_values)
|
||||
predicted_values = np.array(predicted_values)
|
||||
|
||||
correct_indices = true_values == predicted_values
|
||||
incorrect_indices = ~correct_indices
|
||||
|
||||
# Scatterplot
|
||||
plt.scatter(
|
||||
np.arange(len(true_values))[correct_indices],
|
||||
true_values[correct_indices],
|
||||
color='green',
|
||||
label='Richtig vorhergesagt'
|
||||
)
|
||||
plt.scatter(
|
||||
np.arange(len(true_values))[incorrect_indices],
|
||||
true_values[incorrect_indices],
|
||||
color='red',
|
||||
label='Falsch vorhergesagt'
|
||||
)
|
||||
|
||||
plt.axhline(0.5, linestyle='--', color='blue', label='Schwelle (0.5)')
|
||||
plt.ylim(-0.5, 1.5)
|
||||
plt.yticks([0, 1], labels=['Klasse 0', 'Klasse 1'])
|
||||
plt.xlabel('Datenindex')
|
||||
plt.ylabel('Klassifikation')
|
||||
plt.title('Richtige vs. Falsche Vorhersagen')
|
||||
plt.legend()
|
||||
plt.grid(True, linestyle='--', alpha=0.6)
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
def visualize_distribution(true_values, predicted_values):
|
||||
plt.figure(figsize=(10, 6))
|
||||
|
||||
# Häufigkeiten der Klassen berechnen
|
||||
true_counts = np.bincount(true_values, minlength=2)
|
||||
predicted_counts = np.bincount(predicted_values, minlength=2)
|
||||
|
||||
# Barplot erstellen
|
||||
labels = ['Klasse 0', 'Klasse 1']
|
||||
x = np.arange(len(labels))
|
||||
|
||||
plt.bar(x - 0.2, true_counts, width=0.4, color='skyblue', label='Wahre Werte', edgecolor='black')
|
||||
plt.bar(x + 0.2, predicted_counts, width=0.4, color='salmon', label='Vorhergesagte Werte', edgecolor='black')
|
||||
|
||||
plt.title('Verteilung der wahren Werte und Vorhersagen')
|
||||
plt.xticks(x, labels)
|
||||
plt.ylabel('Häufigkeit')
|
||||
plt.xlabel('Klassen')
|
||||
plt.legend()
|
||||
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
# Gerät initialisieren
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print(f"Using device: {device}")
|
||||
|
||||
# Daten laden
|
||||
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
|
||||
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
|
||||
)
|
||||
X, y = load_preprocess_data(path_data=params["data_path"])
|
||||
|
||||
# Daten splitten
|
||||
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
|
||||
train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
||||
# Modell initialisieren
|
||||
model = EnhancedCNNBinaryClassifier(
|
||||
vocab_size=vocab_size,
|
||||
embedding_dim=params["embedding_dim"],
|
||||
filter_sizes=params["filter_sizes"],
|
||||
num_filters=params["num_filters"],
|
||||
embedding_matrix=embedding_matrix,
|
||||
dropout=params["dropout"]
|
||||
)
|
||||
model = model.to(device)
|
||||
|
||||
# BalancedCELoss verwenden
|
||||
criterion = BalancedCELoss(alpha=params["alpha"])
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||
|
||||
# Training
|
||||
history = {
|
||||
"train_loss": [],
|
||||
"val_loss": [],
|
||||
"train_acc": [],
|
||||
"val_acc": [],
|
||||
}
|
||||
|
||||
for epoch in range(params["epochs"]):
|
||||
model.train()
|
||||
train_loss, correct, total = 0.0, 0, 0
|
||||
|
||||
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
|
||||
for X_batch, y_batch in pbar:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
||||
optimizer.zero_grad()
|
||||
outputs = model(X_batch)
|
||||
loss = criterion(outputs, y_batch)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
train_loss += loss.item()
|
||||
predicted = torch.argmax(outputs, dim=1)
|
||||
correct += (predicted == y_batch).sum().item()
|
||||
total += y_batch.size(0)
|
||||
|
||||
pbar.set_postfix({"Train Loss": loss.item()})
|
||||
|
||||
train_acc = correct / total
|
||||
history["train_loss"].append(train_loss / len(train_loader))
|
||||
history["train_acc"].append(train_acc)
|
||||
|
||||
# Validation
|
||||
model.eval()
|
||||
val_loss, correct, total = 0.0, 0, 0
|
||||
with torch.no_grad():
|
||||
for X_batch, y_batch in val_loader:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
||||
outputs = model(X_batch)
|
||||
loss = criterion(outputs, y_batch)
|
||||
val_loss += loss.item()
|
||||
predicted = torch.argmax(outputs, dim=1)
|
||||
correct += (predicted == y_batch).sum().item()
|
||||
total += y_batch.size(0)
|
||||
|
||||
val_acc = correct / total
|
||||
history["val_loss"].append(val_loss / len(val_loader))
|
||||
history["val_acc"].append(val_acc)
|
||||
|
||||
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
|
||||
print(f"Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")
|
||||
|
||||
# Testen und Visualisieren
|
||||
model.eval()
|
||||
test_correct, test_total = 0, 0
|
||||
true_labels, predicted_labels = [], []
|
||||
|
||||
with torch.no_grad():
|
||||
for X_batch, y_batch in test_loader:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
||||
outputs = model(X_batch)
|
||||
predicted = torch.argmax(outputs, dim=1)
|
||||
true_labels.extend(y_batch.cpu().numpy())
|
||||
predicted_labels.extend(predicted.cpu().numpy())
|
||||
test_correct += (predicted == y_batch).sum().item()
|
||||
test_total += y_batch.size(0)
|
||||
|
||||
test_accuracy = test_correct / test_total
|
||||
print(f"Test Accuracy: {test_accuracy:.4f}")
|
||||
|
||||
# Visualisierung der Vorhersagen (Scatterplot)
|
||||
visualize_predictions(true_labels, predicted_labels)
|
||||
|
||||
# Visualisierung der Verteilung (Barplot)
|
||||
visualize_distribution(true_labels, predicted_labels)
|
||||
CNN_REG.py (316 lines)
|
|
@@ -1,316 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||
from tqdm import tqdm # Fortschrittsbalken-Bibliothek
|
||||
from dataset_generator import create_embedding_matrix, split_data
|
||||
from HumorDataset import TextRegDataset
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Hyperparameter und Konfigurationen
|
||||
params = {
|
||||
"embedding_dim": 100,
|
||||
"filter_sizes": [2, 3, 4, 5], # Zusätzliche Filtergröße
|
||||
"num_filters": 150, # Erhöhte Anzahl von Filtern
|
||||
"batch_size": 32,
|
||||
"learning_rate": 0.001,
|
||||
"epochs": 25,
|
||||
"glove_path": 'data/glove.6B.100d.txt', # Pfad zu GloVe
|
||||
"max_len": 280,
|
||||
"test_size": 0.1,
|
||||
"val_size": 0.1,
|
||||
"patience": 5,
|
||||
"data_path": 'data/hack.csv', # Pfad zu den Daten
|
||||
"dropout": 0.6, # Erhöhtes Dropout
|
||||
"weight_decay": 5e-4 # L2-Regularisierung
|
||||
}
|
||||
|
||||
# EarlyStopping-Klasse mit Ordnerprüfung
|
||||
class EarlyStopping:
|
||||
def __init__(self, patience=5, verbose=False):
|
||||
self.patience = patience
|
||||
self.verbose = verbose
|
||||
self.counter = 0
|
||||
self.best_score = None
|
||||
self.early_stop = False
|
||||
|
||||
def __call__(self, val_loss, model):
|
||||
score = -val_loss
|
||||
if self.best_score is None:
|
||||
self.best_score = score
|
||||
self.save_checkpoint(val_loss, model)
|
||||
elif score < self.best_score:
|
||||
self.counter += 1
|
||||
if self.counter >= self.patience:
|
||||
self.early_stop = True
|
||||
else:
|
||||
self.best_score = score
|
||||
self.save_checkpoint(val_loss, model)
|
||||
self.counter = 0
|
||||
|
||||
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
|
||||
directory = "checkpoints"
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory) # Erstelle den Ordner, falls er nicht existiert
|
||||
if self.verbose:
|
||||
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
|
||||
torch.save(model.state_dict(), os.path.join(directory, filename))
|
||||
|
||||
# Plot-Funktion für Training
|
||||
def plot_learning_curves(history):
|
||||
epochs = range(1, len(history['train_loss']) + 1)
|
||||
|
||||
# Loss-Plot
|
||||
plt.figure(figsize=(14, 6))
|
||||
plt.subplot(1, 2, 1)
|
||||
plt.plot(epochs, history['train_loss'], label='Train Loss')
|
||||
plt.plot(epochs, history['val_loss'], label='Val Loss')
|
||||
plt.xlabel('Epochs')
|
||||
plt.ylabel('Loss')
|
||||
plt.title('Training and Validation Loss')
|
||||
plt.legend()
|
||||
|
||||
# RMSE-Plot
|
||||
plt.subplot(1, 2, 2)
|
||||
plt.plot(epochs, history['train_rmse'], label='Train RMSE')
|
||||
plt.plot(epochs, history['val_rmse'], label='Val RMSE')
|
||||
plt.xlabel('Epochs')
|
||||
plt.ylabel('RMSE')
|
||||
plt.title('Training and Validation RMSE')
|
||||
plt.legend()
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
# Visualisierung der Zielvariablen (Scores)
|
||||
def visualize_data_distribution(y):
|
||||
print("\n--- Zielvariable: Statistik ---")
|
||||
print(f"Min: {np.min(y)}, Max: {np.max(y)}")
|
||||
print(f"Mittelwert: {np.mean(y):.4f}, Standardabweichung: {np.std(y):.4f}")
|
||||
|
||||
# Histogramm plotten
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.hist(y, bins=20, color='skyblue', edgecolor='black')
|
||||
plt.title('Verteilung der Zielvariable (Scores)')
|
||||
plt.xlabel('Score')
|
||||
plt.ylabel('Häufigkeit')
|
||||
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||
plt.show()
|
||||
|
||||
# Funktion zum Laden und Vorverarbeiten der Daten
|
||||
def load_preprocess_data(path_data='data/hack.csv'):
|
||||
# Daten laden
|
||||
df = pd.read_csv(path_data)
|
||||
|
||||
# Fehlende Werte in der Zielspalte entfernen
|
||||
df = df.dropna(subset=['humor_rating'])
|
||||
|
||||
# Zielvariable aus der Spalte 'humor_rating' extrahieren
|
||||
df['y'] = df['humor_rating'].astype(float) # Sicherstellen, dass Zielvariable numerisch ist
|
||||
|
||||
# Eingabetexte und Zielvariable zuweisen
|
||||
X = df['text']
|
||||
y = df['y']
|
||||
|
||||
# Debug-Ausgabe zur Überprüfung
|
||||
print(f"Erste Zielwerte: {y.head(10)}")
|
||||
print(f"Datentyp der Zielvariable: {y.dtype}")
|
||||
print(f"Anzahl der Beispiele: {len(X)}")
|
||||
|
||||
return X, y
|
||||
|
||||
# CNN-Modell für Regression mit erweiterten Regularisierungen
|
||||
class EnhancedCNNRegressor(nn.Module):
|
||||
def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
|
||||
super(EnhancedCNNRegressor, self).__init__()
|
||||
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
|
||||
|
||||
# Convolutional Schichten mit Batch-Normalisierung
|
||||
self.convs = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
nn.Conv2d(1, num_filters, (fs, embedding_dim)),
|
||||
nn.BatchNorm2d(num_filters), # Batch-Normalisierung
|
||||
nn.ReLU(),
|
||||
nn.MaxPool2d((params["max_len"] - fs + 1, 1)),
|
||||
nn.Dropout(dropout) # Dropout nach jeder Schicht
|
||||
)
|
||||
for fs in filter_sizes
|
||||
])
|
||||
|
||||
# Fully-Connected Layer
|
||||
self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128) # Erweiterte Dense-Schicht
|
||||
self.fc2 = nn.Linear(128, 1) # Ausgangsschicht (Regression)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.embedding(x).unsqueeze(1) # [Batch, 1, Seq, Embedding]
|
||||
conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs] # Pooling reduziert Dim
|
||||
x = torch.cat(conv_outputs, 1) # Kombiniere Features von allen Filtern
|
||||
x = torch.relu(self.fc1(x)) # Zusätzliche Dense-Schicht
|
||||
x = self.dropout(x)
|
||||
return self.fc2(x).squeeze(1)
|
||||
|
||||
# Device auf CPU setzen
|
||||
device = torch.device("cpu")
|
||||
print(f"Using device: {device}")
|
||||
|
||||
# Daten laden und vorbereiten
|
||||
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
|
||||
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
|
||||
)
|
||||
X, y = load_preprocess_data(path_data=params["data_path"])
|
||||
|
||||
# Visualisierung der Daten
|
||||
visualize_data_distribution(y)
|
||||
|
||||
# Aufteilen der Daten
|
||||
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
|
||||
|
||||
# Dataset und DataLoader
|
||||
train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
||||
# Modell initialisieren
|
||||
model = EnhancedCNNRegressor(
|
||||
vocab_size=vocab_size,
|
||||
embedding_dim=params["embedding_dim"],
|
||||
filter_sizes=params["filter_sizes"],
|
||||
num_filters=params["num_filters"],
|
||||
embedding_matrix=embedding_matrix,
|
||||
dropout=params["dropout"]
|
||||
)
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
model = model.to(device)
|
||||
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||
early_stopping = EarlyStopping(patience=params["patience"], verbose=True)
|
||||
|
||||
# Speicher für Trainingsmetriken
|
||||
history = {
|
||||
"train_loss": [],
|
||||
"val_loss": [],
|
||||
"train_rmse": [],
|
||||
"val_rmse": [],
|
||||
}
|
||||
|
||||
# Training und Validierung
|
||||
for epoch in range(params["epochs"]):
|
||||
model.train()
|
||||
train_loss = 0.0
|
||||
train_preds, train_labels = [], []
|
||||
|
||||
# Fortschrittsbalken für Training innerhalb einer Epoche
|
||||
with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
|
||||
for X_batch, y_batch in pbar:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
|
||||
optimizer.zero_grad()
|
||||
predictions = model(X_batch).float()
|
||||
loss = criterion(predictions, y_batch)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
train_loss += loss.item()
|
||||
|
||||
# Speichere echte und vorhergesagte Werte für Metriken
|
||||
train_preds.extend(predictions.cpu().detach().numpy())
|
||||
train_labels.extend(y_batch.cpu().detach().numpy())
|
||||
|
||||
# Update der Fortschrittsanzeige
|
||||
pbar.set_postfix({"Train Loss": loss.item()})
|
||||
|
||||
train_rmse = np.sqrt(mean_squared_error(train_labels, train_preds)) # RMSE
|
||||
history["train_loss"].append(train_loss / len(train_loader))
|
||||
history["train_rmse"].append(train_rmse)
|
||||
|
||||
# Validation
|
||||
model.eval()
|
||||
val_loss = 0.0
|
||||
val_preds, val_labels = [], []
|
||||
with torch.no_grad():
|
||||
for X_batch, y_batch in val_loader:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
|
||||
predictions = model(X_batch).float()
|
||||
loss = criterion(predictions, y_batch)
|
||||
val_loss += loss.item()
|
||||
|
||||
val_preds.extend(predictions.cpu().detach().numpy())
|
||||
val_labels.extend(y_batch.cpu().detach().numpy())
|
||||
|
||||
val_rmse = np.sqrt(mean_squared_error(val_labels, val_preds)) # RMSE
|
||||
history["val_loss"].append(val_loss / len(val_loader))
|
||||
history["val_rmse"].append(val_rmse)
|
||||
|
||||
print(f"\nEpoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
|
||||
print(f"Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
|
||||
|
||||
early_stopping(val_rmse, model)
|
||||
if early_stopping.early_stop:
|
||||
print("Early stopping triggered.")
|
||||
break
|
||||
|
||||
# Plot der Lernkurven
|
||||
plot_learning_curves(history)
|
||||
# Funktion zur Visualisierung der richtigen und falschen Vorhersagen
|
||||
def visualize_predictions(true_values, predicted_values):
|
||||
plt.figure(figsize=(10, 6))
|
||||
|
||||
# Unterschied zwischen vorhergesagten und wahren Werten
|
||||
correct_indices = np.isclose(true_values, predicted_values, atol=0.3) # Als korrekt angenommen, wenn Differenz <= 0.3
|
||||
|
||||
# Plot
|
||||
plt.scatter(true_values[correct_indices], predicted_values[correct_indices], color='green', label='Richtig vorhergesagt')
|
||||
plt.scatter(true_values[~correct_indices], predicted_values[~correct_indices], color='red', label='Falsch vorhergesagt')
|
||||
plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal-Linie')
|
||||
|
||||
plt.xlabel('Wahre Werte')
|
||||
plt.ylabel('Vorhergesagte Werte')
|
||||
plt.title('Richtige vs Falsche Vorhersagen')
|
||||
plt.legend()
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
# Test Evaluation
|
||||
model.eval()
|
||||
test_preds, test_labels = [], []
|
||||
with torch.no_grad():
|
||||
for X_batch, y_batch in test_loader:
|
||||
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
|
||||
predictions = model(X_batch).float()
|
||||
test_preds.extend(predictions.cpu().detach().numpy())
|
||||
test_labels.extend(y_batch.cpu().detach().numpy())
|
||||
|
||||
# Konvertierung zu NumPy-Arrays
|
||||
true_values = np.array(test_labels)
|
||||
predicted_values = np.array(test_preds)
|
||||
|
||||
# Visualisierung der Ergebnisse
|
||||
visualize_predictions(true_values, predicted_values)
|
||||
|
||||
# RMSE, MAE und R²-Score für das Test-Set
|
||||
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
|
||||
test_mae = mean_absolute_error(test_labels, test_preds)
|
||||
test_r2 = r2_score(test_labels, test_preds)
|
||||
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
|
||||
|
||||
|
||||
|
||||
# plot distribution of predicted values and true values
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
|
||||
plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
|
||||
plt.title('Distribution of Predicted and True Values')
|
||||
plt.xlabel('Score')
|
||||
plt.ylabel('Frequency')
|
||||
plt.legend()
|
||||
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||
plt.show()
|
||||
|
|
@@ -0,0 +1,69 @@
|
|||
"""
|
||||
This file contains the Datasets class.
|
||||
"""
|
||||
import torch
|
||||
from nltk.tokenize import word_tokenize
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
class GloveDataset(Dataset):
|
||||
def __init__(self, texts, labels, word_index, max_len=50):
|
||||
self.original_indices = labels.index.to_list()
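# remember the original DataFrame indices before they are reset below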
|
||||
self.texts = texts.reset_index(drop=True)
|
||||
self.labels = labels.reset_index(drop=True)
|
||||
self.word_index = word_index
|
||||
self.max_len = max_len
|
||||
|
||||
def __len__(self):
|
||||
return len(self.texts)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
texts = self.texts[idx]
|
||||
tokens = word_tokenize(texts.lower())
|
||||
|
||||
label = self.labels[idx]
|
||||
# Tokenize and convert to indices
|
||||
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
|
||||
# Pad or truncate to max_len
|
||||
if len(input_ids) < self.max_len:
|
||||
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
|
||||
else:
|
||||
input_ids = input_ids[:self.max_len]
|
||||
|
||||
# Convert to PyTorch tensors
|
||||
input_ids = torch.tensor(input_ids, dtype=torch.long)
|
||||
label = torch.tensor(label, dtype=torch.float)
|
||||
|
||||
return input_ids, label
|
||||
|
||||
|
||||
class BertDataset(Dataset):
|
||||
def __init__(self,tokenizer:AutoTokenizer, texts, labels, max_len:int=128):
|
||||
super(BertDataset,self).__init__()
|
||||
self.tokenizer = tokenizer
|
||||
self.max_length = max_len
|
||||
self.text = texts.to_numpy()
|
||||
self.labels = labels.to_numpy()
|
||||
|
||||
def __getitem__(self,idx:int):
|
||||
text = self.text[idx]
|
||||
labels = self.labels[idx]
|
||||
encoding = self.tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
return_attention_mask = True,
|
||||
max_length=self.max_length,
|
||||
truncation = True,
|
||||
return_tensors = 'pt'
|
||||
)
|
||||
input_ids = encoding['input_ids'].flatten()
|
||||
attention_mask = encoding['attention_mask'].flatten()
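# flatten() drops the leading batch dimension added by return_tensors='pt'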
|
||||
|
||||
return {
|
||||
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
|
||||
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
|
||||
'labels':torch.tensor(labels,dtype=torch.float)
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.labels)
|
||||
|
|
@@ -1,12 +1,14 @@
|
|||
import torch
|
||||
import os
|
||||
|
||||
class EarlyStopping:
|
||||
def __init__(self, patience=5, verbose=False):
|
||||
class EarlyStoppingCallback:
|
||||
def __init__(self, model_name, patience=5, verbose=False):
|
||||
self.patience = patience
|
||||
self.verbose = verbose
|
||||
self.counter = 0
|
||||
self.best_score = None
|
||||
self.early_stop = False
|
||||
self.model_name = model_name
|
||||
|
||||
def __call__(self, val_loss, model):
|
||||
score = -val_loss
|
||||
|
|
@@ -22,7 +24,10 @@ class EarlyStopping:
|
|||
self.save_checkpoint(val_loss, model)
|
||||
self.counter = 0
|
||||
|
||||
def save_checkpoint(self, val_loss, model, filename='checkpoint.pt'):
|
||||
def save_checkpoint(self, val_loss, model):
|
||||
directory = "models/checkpoints"
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory) # Create the directory if it does not exist
|
||||
if self.verbose:
|
||||
print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
|
||||
torch.save(model.state_dict(), f'checkpoints/{filename}')
|
||||
print(f'└ Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}). Saving model ...')
|
||||
torch.save(model.state_dict(), os.path.join(directory, self.model_name))
|
||||
HumorDataset.py (111 lines)
|
|
@@ -1,111 +0,0 @@
|
|||
"""
|
||||
This file contains the HumorDataset class.
|
||||
"""
|
||||
import torch
|
||||
import numpy as np
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
class TextRegDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, texts, labels, word_index, max_len=50):
|
||||
|
||||
self.original_indices = labels.index.to_list()
|
||||
|
||||
self.texts = texts.reset_index(drop=True)
|
||||
self.labels = labels.reset_index(drop=True)
|
||||
self.word_index = word_index
|
||||
self.max_len = max_len
|
||||
|
||||
def __len__(self):
|
||||
return len(self.texts)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
texts = self.texts[idx]
|
||||
tokens = word_tokenize(texts.lower())
|
||||
|
||||
label = self.labels[idx]
|
||||
|
||||
# Tokenize and convert to indices
|
||||
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
|
||||
|
||||
# Pad or truncate to max_len
|
||||
if len(input_ids) < self.max_len:
|
||||
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
|
||||
else:
|
||||
input_ids = input_ids[:self.max_len]
|
||||
|
||||
# Convert to PyTorch tensors
|
||||
input_ids = torch.tensor(input_ids, dtype=torch.long)
|
||||
label = torch.tensor(label, dtype=torch.float)
|
||||
|
||||
return input_ids, label
|
||||
|
||||
class TextDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, texts, labels, word_index, max_len=50):
|
||||
|
||||
self.original_indices = labels.index.to_list()
|
||||
|
||||
self.texts = texts.reset_index(drop=True)
|
||||
self.labels = labels.reset_index(drop=True)
|
||||
self.word_index = word_index
|
||||
self.max_len = max_len
|
||||
|
||||
def __len__(self):
|
||||
return len(self.texts)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
texts = self.texts[idx]
|
||||
tokens = word_tokenize(texts.lower())
|
||||
|
||||
label = self.labels[idx]
|
||||
|
||||
# Tokenize and convert to indices
|
||||
input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
|
||||
|
||||
# Pad or truncate to max_len
|
||||
if len(input_ids) < self.max_len:
|
||||
input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
|
||||
else:
|
||||
input_ids = input_ids[:self.max_len]
|
||||
|
||||
# Convert to PyTorch tensors
|
||||
input_ids = torch.tensor(input_ids, dtype=torch.long)
|
||||
label = torch.tensor(label, dtype=torch.long)
|
||||
|
||||
return input_ids, label
|
||||
|
||||
class HumorDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, data, labels, vocab_size=0, emb_dim=None):
|
||||
self.original_indices = labels.index.to_list()
|
||||
|
||||
self.data = data
|
||||
self.labels = labels.reset_index(drop=True)
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
|
||||
# TODO: bug fix
|
||||
self.shape = self.get_shape()
|
||||
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = {'input_ids': torch.tensor(self.data[idx], dtype=torch.float)}
|
||||
item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
|
||||
return item
|
||||
|
||||
def __len__(self):
|
||||
return len(self.labels)
|
||||
|
||||
def get_single_shape(self, data):
|
||||
shape_data = None
|
||||
if type(data) == list:
|
||||
shape_data = len(data[0])
|
||||
elif type(data) == torch.Tensor:
|
||||
shape_data = data[0].shape
|
||||
elif type(data) == np.ndarray:
|
||||
shape_data = data[0].shape
|
||||
return shape_data
|
||||
|
||||
def get_shape(self):
|
||||
shape_data = self.get_single_shape(self.data)
|
||||
shape_labels = self.get_single_shape(self.labels)
|
||||
return shape_data, shape_labels
|
||||
|
||||
|
|
@@ -0,0 +1,196 @@
|
|||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||
import numpy as np
|
||||
|
||||
import Datasets
|
||||
import dataset_helper
|
||||
import EarlyStopping
|
||||
import ml_helper
|
||||
import ml_history
|
||||
import ml_train
|
||||
|
||||
|
||||
class PositionalEncoding(nn.Module):
|
||||
"""
|
||||
https://pytorch.org/tutorials/beginner/transformer_tutorial.html
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, vocab_size=5000, dropout=0.1):
|
||||
super().__init__()
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
||||
pe = torch.zeros(vocab_size, d_model)
|
||||
position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
|
||||
div_term = torch.exp(
|
||||
torch.arange(0, d_model, 2).float()
|
||||
* (-math.log(10000.0) / d_model)
|
||||
)
|
||||
pe[:, 0::2] = torch.sin(position * div_term)
|
||||
pe[:, 1::2] = torch.cos(position * div_term)
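# even dimensions use sine, odd dimensions use cosine, as in the original Transformer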
|
||||
pe = pe.unsqueeze(0)
|
||||
self.register_buffer("pe", pe)
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.pe[:, : x.size(1), :]
|
||||
return self.dropout(x)
|
||||
|
||||
|
||||
class TransformerBinaryClassifier(nn.Module):
|
||||
"""
|
||||
Text classifier based on a pytorch TransformerEncoder.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embeddings,
|
||||
nhead=8,
|
||||
dim_feedforward=2048,
|
||||
num_layers=6,
|
||||
positional_dropout=0.1,
|
||||
classifier_dropout=0.1,
|
||||
):
|
||||
|
||||
super().__init__()
|
||||
|
||||
vocab_size, d_model = embeddings.size()
|
||||
assert d_model % nhead == 0, "nheads must divide evenly into d_model"
|
||||
|
||||
self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
|
||||
|
||||
self.pos_encoder = PositionalEncoding(
|
||||
d_model=d_model,
|
||||
dropout=positional_dropout,
|
||||
vocab_size=vocab_size,
|
||||
)
|
||||
|
||||
encoder_layer = nn.TransformerEncoderLayer(
|
||||
d_model=d_model,
|
||||
nhead=nhead,
|
||||
dim_feedforward=dim_feedforward,
|
||||
dropout=classifier_dropout,
|
||||
)
|
||||
self.transformer_encoder = nn.TransformerEncoder(
|
||||
encoder_layer,
|
||||
num_layers=num_layers,
|
||||
)
|
||||
# normalize to stabilize and stop overfitting
|
||||
self.batch_norm = nn.BatchNorm1d(d_model)
|
||||
self.classifier = nn.Linear(d_model, 1)
|
||||
self.d_model = d_model
|
||||
|
||||
def forward(self, x):
|
||||
x = self.emb(x) * math.sqrt(self.d_model)
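# scale embeddings by sqrt(d_model) before adding the positional encoding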
|
||||
x = self.pos_encoder(x)
|
||||
x = self.transformer_encoder(x)
|
||||
x = x.mean(dim=1)
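# average over dim 1 to collapse the sequence into a single vector per example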
|
||||
# normalize to stabilize and stop overfitting
|
||||
#x = self.batch_norm(x)
|
||||
|
||||
#NOTE: no activation function for regression
|
||||
x = self.classifier(x)
|
||||
x = x.squeeze(1)
|
||||
return x
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Hyperparameters and configuration
|
||||
params = {
|
||||
# Config
|
||||
"max_len": 280,
|
||||
# Training
|
||||
"epochs": 25,
|
||||
"patience": 7,
|
||||
"batch_size": 32,
|
||||
"learning_rate": 1e-4, # 1e-4
|
||||
"weight_decay": 5e-4 ,
|
||||
# Model
|
||||
'nhead': 2, # 5
|
||||
"dropout": 0.2,
|
||||
'hidden_dim': 2048,
|
||||
'num_layers': 6
|
||||
}
|
||||
# TODO set seeds
|
||||
|
||||
# Configs
|
||||
MODEL_NAME = 'transformer.pt'
|
||||
HIST_NAME = 'transformer_history'
|
||||
GLOVE_PATH = 'data/glove.6B.100d.txt'
|
||||
DATA_PATH = 'data/hack.csv'
|
||||
EMBEDDING_DIM = 100
|
||||
TEST_SIZE = 0.1
|
||||
VAL_SIZE = 0.1
|
||||
|
||||
# Load and prepare the data
|
||||
embedding_matrix, word_index, vocab_size, d_model = dataset_helper.get_embedding_matrix(
|
||||
gloVe_path=GLOVE_PATH, emb_len=EMBEDDING_DIM)
|
||||
|
||||
X, y = dataset_helper.load_preprocess_data(path_data=DATA_PATH, verbose=True)
|
||||
|
||||
# Split the data
|
||||
data_split = dataset_helper.split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE)
|
||||
|
||||
# Dataset and DataLoader
|
||||
train_dataset = Datasets.GloveDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = Datasets.GloveDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = Datasets.GloveDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
||||
# Initialize the model
|
||||
model = TransformerBinaryClassifier(
|
||||
embeddings=embedding_matrix,
|
||||
nhead=params['nhead'],
|
||||
dim_feedforward=params['hidden_dim'],
|
||||
num_layers=params['num_layers'],
|
||||
positional_dropout=params["dropout"],
|
||||
classifier_dropout=params["dropout"],
|
||||
)
|
||||
|
||||
device = ml_helper.get_device(verbose=True, include_mps=False)
|
||||
model = model.to(device)
|
||||
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"]) #, weight_decay=params["weight_decay"])
|
||||
early_stopping = EarlyStopping.EarlyStoppingCallback(patience=params["patience"], verbose=True, model_name=MODEL_NAME)
|
||||
|
||||
hist = ml_history.History()
|
||||
|
||||
# Training and validation
|
||||
for epoch in range(params["epochs"]):
|
||||
ml_train.train_epoch(model, train_loader, criterion, optimizer, device, hist, epoch, params["epochs"])
|
||||
|
||||
val_rmse = ml_train.validate_epoch(model, val_loader, epoch, criterion, device, hist)
|
||||
|
||||
early_stopping(val_rmse, model)
|
||||
if early_stopping.early_stop:
|
||||
print("Early stopping triggered.")
|
||||
break
|
||||
|
||||
# save training history
|
||||
hist.save_history(HIST_NAME)
|
||||
|
||||
|
||||
# Load best model
|
||||
model.load_state_dict(torch.load('models/checkpoints/' + MODEL_NAME))
|
||||
|
||||
# Test Evaluation
|
||||
test_labels, test_preds = ml_train.test_loop(model, test_loader, device)
|
||||
|
||||
hist.add_test_results(test_labels, test_preds)
|
||||
|
||||
# save training history
|
||||
hist.save_history(HIST_NAME)
|
||||
|
||||
# RMSE, MAE and R² score for the test set
|
||||
test_mae = mean_absolute_error(test_labels, test_preds)
|
||||
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
|
||||
test_r2 = r2_score(test_labels, test_preds)
|
||||
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
|
||||
bert_no_ernie.py (266 lines)
|
|
@@ -1,266 +0,0 @@
|
|||
# PyTorch Imports
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
# scikit-learn Imports
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
# Bert imports
|
||||
from transformers import BertForSequenceClassification, AutoTokenizer
|
||||
#Default imports (pandas, numpy, matplotlib, etc.)
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
## Select Device
|
||||
if torch.cuda.is_available():
|
||||
DEVICE = torch.device("cuda")
|
||||
else:
|
||||
DEVICE = torch.device("cpu")
|
||||
|
||||
|
||||
class SimpleHumorDataset(Dataset):
|
||||
def __init__(self,tokenizer:AutoTokenizer,dataframe:pd.DataFrame,max_length:int=128):
|
||||
super(SimpleHumorDataset,self).__init__()
|
||||
self.tokenizer = tokenizer
|
||||
self.max_length = max_length
|
||||
self.text = dataframe['text'].to_numpy()
|
||||
self.labels = dataframe['is_humor'].to_numpy()
|
||||
|
||||
def __getitem__(self,idx:int):
|
||||
text = self.text[idx]
|
||||
labels = self.labels[idx]
|
||||
encoding = self.tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
return_attention_mask = True,
|
||||
max_length=self.max_length,
|
||||
truncation = True,
|
||||
return_tensors = 'pt'
|
||||
)
|
||||
input_ids = encoding['input_ids'].flatten()
|
||||
attention_mask = encoding['attention_mask'].flatten()
|
||||
|
||||
return {
|
||||
'input_ids': torch.as_tensor(input_ids,dtype=torch.long),
|
||||
'attention_mask':torch.as_tensor(attention_mask,dtype=torch.long),
|
||||
'labels':torch.tensor(labels,dtype=torch.long)
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.labels)
|
||||
|
||||
class CustomBert(nn.Module):
|
||||
def __init__(self,dropout):
|
||||
super().__init__()
|
||||
#Bert + Custom Layers (Not a tuple any longer -- idk why)
|
||||
self.bfsc = BertForSequenceClassification.from_pretrained("bert-base-uncased")
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.classifier = nn.Linear(2,2)
|
||||
# self.sm = nn.Softmax(dim=1)
|
||||
|
||||
def forward(self, input_ids, attention_mask):
|
||||
x = self.bfsc(input_ids, attention_mask = attention_mask)
|
||||
x = self.dropout(x[0])
|
||||
x = self.classifier(x)
|
||||
return x
|
||||
|
||||
|
||||
def freeze_bert_params(self):
|
||||
for param in self.bfsc.named_parameters():
|
||||
param[1].requires_grad_(False)
|
||||
|
||||
def unfreeze_bert_params(self):
|
||||
for param in self.bfsc.named_parameters():
|
||||
param[1].requires_grad_(True)
|
||||
|
||||
def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim.AdamW,train_loader:DataLoader,freeze_bert:bool=False):
|
||||
model.train()
|
||||
if freeze_bert:
|
||||
model.freeze_bert_params()
|
||||
|
||||
total_loss = 0
|
||||
len_train_loader = len(train_loader)
|
||||
for train_batch in train_loader:
|
||||
|
||||
# Set Gradient to Zero
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Unpack batch values and "push" it to GPU
|
||||
input_ids, att_mask, labels = train_batch.values()
|
||||
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE),labels.to(DEVICE)
|
||||
|
||||
# Feed Model with Data
|
||||
outputs = model(input_ids, attention_mask=att_mask)
|
||||
# print(f"{model.bfsc.}")
|
||||
# print(f"{outputs.shape}")
|
||||
loss = criterion(outputs,labels)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
total_loss+=loss.item()
|
||||
|
||||
print(f"Training Loss is {(total_loss/len(train_loader)):.4f}")
|
||||
return (total_loss/len(train_loader))
|
||||
|
||||
def eval_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,validation_loader:DataLoader):
|
||||
model.eval()
|
||||
total, correct = 0.0, 0.0
|
||||
total_loss = 0.0
|
||||
best_loss = float("Inf")
|
||||
with torch.no_grad():
|
||||
for val_batch in validation_loader:
|
||||
|
||||
input_ids, att_mask ,labels = val_batch.values()
|
||||
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE), labels.to(DEVICE)
|
||||
|
||||
outputs = model(input_ids,attention_mask=att_mask)
|
||||
|
||||
loss = criterion(outputs,labels)
|
||||
total_loss += loss.item()
|
||||
|
||||
predictions = torch.argmax(outputs,1)
|
||||
total += labels.size(0)
|
||||
correct += (predictions == labels).sum().item()
|
||||
|
||||
if total_loss/len(validation_loader) < best_loss:
|
||||
best_loss = total_loss/len(validation_loader)
|
||||
torch.save(model,"best_bert_model.pt")
|
||||
|
||||
print(f"Validation Loss: {total_loss/len(validation_loader):.4f} ### Validation Accuracy {correct/total*100:.4f}%")
|
||||
return total_loss/len(validation_loader)
|
||||
|
||||
def test_loop(model:CustomBert, test_loader:DataLoader):
|
||||
for batch in test_loader:
|
||||
input_ids, att_mask, labels = batch.values()
|
||||
input_ids, att_mask, labels = input_ids.to(DEVICE), att_mask.to(DEVICE), labels.to(DEVICE)
|
||||
with torch.no_grad():
|
||||
model = torch.load("best_bert_model")
|
||||
model.to(DEVICE)
|
||||
output = model(input_ids,att_mask)
|
||||
output.detach().cpu().numpy()
|
||||
labels.detach().cpu().numpy()
|
||||
pred_flat = np.argmax(output,1).flatten()
|
||||
print(accuracy_score(labels,pred_flat))
|
||||
|
||||
def plot_metrics_loss_n_acc(train_loss,validation_loss,train_acc,validation_acc):
|
||||
"""
|
||||
Plots the loss and accuracy of the training and validation data for the given model instance.
|
||||
"""
|
||||
# Visualize Training Loss
|
||||
# plt.plot(loss_values)
|
||||
# plt.plot(eval_values)
|
||||
# plt.hlines(np.mean(loss_values),xmin=0,xmax=EPOCH,colors='red',linestyles="dotted",label="Average Loss")
|
||||
# plt.hlines(np.mean(eval_values),xmin=0,xmax=EPOCH,colors='green',linestyles="dashed",label="Average Val Loss")
|
||||
# plt.title("Test Loss")
|
||||
# plt.xlabel("Num Epochs")
|
||||
# plt.ylabel("Total Loss of Epoch")
|
||||
# plt.show()
|
||||
pass
|
||||
|
||||
def plot_test_metrics(accuracy):
|
||||
"""
|
||||
Plot test metrics of the model (confusion matrix, accuracy).
|
||||
"""
|
||||
plt.plot(accuracy)
|
||||
plt.hlines(np.mean(accuracy), 0, len(accuracy), colors='red', linestyles='dotted', label=f'Mean Accuracy {np.mean(accuracy):.2f}')
|
||||
plt.title("Accuracy of Test")
|
||||
plt.xlabel("Num Epochs")
|
||||
plt.ylabel("Accurcy 0.0 - 1.0")
|
||||
plt.grid(True)
|
||||
plt.legend()
|
||||
plt.show()
|
||||
|
||||
# def performance_metrics(true_labels,predictions):
|
||||
# confusion_matrix(true_labels,predictions)
|
||||
# accuracy_score(true_labels,predictions)
|
||||
# f1_score(true_labels,predictions)
|
||||
# pass
|
||||
|
||||
def create_datasets(tokenizer:AutoTokenizer,dataframe:pd.DataFrame,train_split_ratio:float,val:bool=False)->tuple[SimpleHumorDataset,SimpleHumorDataset,SimpleHumorDataset]|tuple[SimpleHumorDataset,SimpleHumorDataset]:
|
||||
if train_split_ratio > 1.0:
|
||||
raise AssertionError("Trainsplit sollte kleiner(-gleich) 1.0 sein")
|
||||
train,test = train_test_split(dataframe,train_size=train_split_ratio,random_state=501)
|
||||
if val:
|
||||
test,validation = train_test_split(test,train_size=.5,random_state=501)
|
||||
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test), SimpleHumorDataset(tokenizer,validation)
|
||||
return SimpleHumorDataset(tokenizer,train), SimpleHumorDataset(tokenizer,test)
|
||||
|
||||
def create_dataloaders(datasets:tuple|list,batchsize:int,shufflelist:list):
|
||||
train_loader = DataLoader(datasets[0],batchsize,shuffle=shufflelist[0])
|
||||
test_loader = DataLoader(datasets[1],batchsize,shuffle=shufflelist[1])
|
||||
if len(datasets) == 3:
|
||||
return train_loader, test_loader, DataLoader(datasets[2],batchsize,shuffle=shufflelist[2])
|
||||
return train_loader, test_loader
|
||||
|
||||
|
||||
# if __name__ == "__main__":

#     # HYPERPARAMETERS
#     # Set Max Epoch Amount
#     EPOCH = 10
#     # DROPOUT PROBABILITY
#     DROPOUT = 0.1
#     # BATCHSIZE
#     BATCH_SIZE = 16
#     # LEARNING RATE
#     LEARNING_RATE = 1e-5
#     # RANDOM SEED
#     RNDM_SEED = 501
#     # FREEZE Bert Layers
#     FREEZE = True

#     torch.manual_seed(RNDM_SEED)
#     np.random.seed(RNDM_SEED)
#     torch.cuda.manual_seed_all(RNDM_SEED)

#     # Initialize Bert model with dropout probability and move it to DEVICE
#     mybert = CustomBert(DROPOUT)
#     print("Bert Initialized")
#     mybert.to(DEVICE)

#     # Read raw data from csv and store it as a DataFrame
#     df = pd.read_csv("./data/hack.csv", encoding="latin1")
#     print("Raw Data read")

#     # Initialize BertTokenizer from pretrained weights
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", do_lower_case=True)
#     print("Tokenizer Initialized")

#     # Split DataFrame into train and test sets
#     # Create custom datasets for train and test
#     train_data, test_data, validation_data = create_datasets(tokenizer, df, .7, True)
#     print("Split Data into Train and Test Sets")
#     print("Custom Datasets created")

#     # Initialize DataLoaders with the train, test and validation sets
#     train_loader, test_loader, validation_loader = create_dataloaders([train_data, test_data, validation_data], batchsize=BATCH_SIZE, shufflelist=[True, True, False])
#     print("DataLoaders created")

#     # Set criterion to cross entropy and define the Adam optimizer with model parameters and learning rate
#     criterion_cross_entropy = nn.CrossEntropyLoss()
#     optimizer_adamW = optim.Adam(mybert.parameters(), lr=LEARNING_RATE)
#     import time

#     # Set scheduler for dynamic learning-rate adjustment
#     loss_values, eval_values = np.zeros(EPOCH), np.zeros(EPOCH)

#     for epoch in range(EPOCH):
#         start = time.time()
#         print(f"For {epoch+1} the Scores are: ")
#         loss_values[epoch] = training_loop(mybert, optimizer=optimizer_adamW, criterion=criterion_cross_entropy, train_loader=train_loader, freeze_bert=FREEZE)
#         eval_values[epoch] = eval_loop(mybert, criterion=criterion_cross_entropy, validation_loader=test_loader)
#         end = time.time()
#         print((end - start), "seconds per epoch needed")

#     plot_metrics_loss_n_acc("x", "x", "x", "x")

#     for epoch in range(EPOCH):
#         test_loop(mybert, validation_loader)
846  cnn_class.ipynb: file diff suppressed because one or more lines are too long
857  cnn_reg.ipynb: file diff suppressed because one or more lines are too long
Binary files not shown.
File diffs suppressed because one or more lines are too long.
@ -1,207 +0,0 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch
import os
import copy
import regex as re

import HumorDataset


# def load_glove_embeddings(glove_file_path):
#     embeddings_index = {}
#     with open(glove_file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             try:
#                 values = line.split()
#                 #print(values)
#                 word = values[0]
#                 coefs = np.asarray(values[1:], dtype='float32')
#                 embeddings_index[word] = coefs
#             except ValueError:
#                 print('Error with line:', line[:100])
#     return embeddings_index


def load_glove_embeddings(glove_file_path, emb_len=100):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                # Use regex to split the line into word and coefficients
                # regex explanation: match a word followed by one or more spaces and then the coefficients
                match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
                if match:
                    word = match.group(1)
                    coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')

                    # check list length
                    if len(coefs) != emb_len:
                        print('Skip: Length mismatch with line:', line[:100])
                    else:
                        embeddings_index[word] = coefs
                else:
                    print('Error with line:', line[:100])
            except ValueError:
                print('Error with line:', line[:100])
    return embeddings_index
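# NOTE: To make the parsing concrete, a tiny made-up example of the line format this regex
# expects (real GloVe lines carry emb_len coefficients; three are shown here for brevity):
#
#   line = "word 0.1 -0.2 0.3"
#   match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
#   word = match.group(1)                                          # "word"
#   coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')  # [ 0.1 -0.2  0.3]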
def create_embbedings_matrix(embeddings_glove, max_len=100):
    embeddings_glove['<UNK>'] = np.random.rand(max_len)
    embeddings_glove['<PAD>'] = np.zeros(max_len)
    # Create a word index (vocabulary)
    word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
    # Special tokens are in the word index
    word_index['<UNK>'] = len(word_index) - 2
    word_index['<PAD>'] = len(word_index) - 1
    # print len of word_index
    print(len(word_index))
    # Create an embedding matrix
    embedding_dim = len(next(iter(embeddings_glove.values())))

    embedding_matrix = np.zeros((len(word_index), embedding_dim))

    for word, idx in word_index.items():
        embedding_vector = embeddings_glove.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    # Convert the embedding matrix to a tensor
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
    return embedding_matrix, word_index


def create_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
    embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)

    embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove)

    vocab_size, d_model = embedding_matrix.size()
    print(f"vocab_size: {vocab_size}, d_model: {d_model}")

    return embedding_matrix, word_index, vocab_size, d_model
def load_preprocess_data(path_data='data/hack.csv'):
    df = pd.read_csv(path_data)
    df = df.dropna(subset=['humor_rating'])
    # find median of humor_rating
    median_rating = df['humor_rating'].median()
    df['y'] = df['humor_rating'] > median_rating
    X = df['text']
    y = df['y']
    return X, y


def encode_tokens(tokens, embedding_index, default_vector_len=100):
    # Fall back to a zero vector for out-of-vocabulary tokens
    return [embedding_index.get(token, np.zeros(default_vector_len)) for token in tokens]


def pad_sequences(sequences, max_len, pad_index):
    return np.array([np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=pad_index) if len(seq) < max_len else seq[:max_len] for seq in sequences])


def split_data(X, y, test_size=0.1, val_size=0.1):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
    val_split_ratio = val_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)

    ret_dict = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
        'val': {'X': X_val, 'y': y_val}
    }

    # print the length of each split
    for key in ret_dict.keys():
        print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))

    return ret_dict
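# NOTE: For intuition, a tiny example of what pad_sequences does. Integer ids are used here
# purely for illustration; the main block below pads token-string lists with '<PAD>'.
#
#   seqs = [[5, 7], [1, 2, 3, 4, 5, 6]]
#   print(pad_sequences(seqs, max_len=4, pad_index=0))
#   # [[5 7 0 0]
#   #  [1 2 3 4]]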
def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
    if not os.path.exists(path):
        print('Creating directory:', path)
        os.makedirs(path)
    print('saving data into:', path)
    for key, value in data_dict.items():
        # transform to Dataset
        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
        # save dataset
        torch.save(dataset, path + prefix + key + '.pt')


if __name__ == "__main__":
    # Load the data from csv
    df = pd.read_csv('data/hack.csv')
    print(df.shape)

    df = df.dropna(subset=['humor_rating'])

    # find median of humor_rating
    median_rating = df['humor_rating'].median()
    #print('median and therefore middle of humor_rating:', median_rating)

    df['y'] = df['humor_rating'] > median_rating

    # transform data into dataset
    X = df['text']
    y = df['y']

    # Tokenize the data with nltk
    tokens = [word_tokenize(text.lower()) for text in X]

    vocab_size = len(set([word for sentence in tokens for word in sentence]))
    print('vocab size:', vocab_size)

    # Pad the sequences
    # NOTE: Info comes from data explore notebook: 280 is max length,
    # 139 contains 80% and 192 contains 95% of the data
    max_len = 280
    padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')

    # split data into train, test, and validation
    data_dict = split_data(padded_indices, y)

    # data_idx_based = copy.deepcopy(data_dict)
    # vector_based = False

    # for key in data_idx_based.keys():
    #     data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
    #     # print shape of data
    #     #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))

    # # save the data
    # save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)

    print('loading GloVe embeddings')
    # Load GloVe embeddings
    glove_file_path = 'glove.6B/glove.6B.100d.txt'
    #glove_file_path = 'glove.840B.300d/glove.840B.300d.txt'
    embeddings_index = load_glove_embeddings(glove_file_path)
    emb_len = 100
    print('starting with embedding the data')
    # Encode the tokens
    #for key in data_dict.keys():
    #    data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=emb_len) for tokens in data_dict[key]['X']]
    #    # print shape of data
    #    #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))

    # Save the data
    #save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)

    max_len = 100
    gloVe_path = 'glove.6B/glove.6B.100d.txt'
    embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=max_len)

    embeddings_glove['<UNK>'] = np.random.rand(max_len)
    embeddings_glove['<PAD>'] = np.zeros(max_len)
@ -0,0 +1,102 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import regex as re


def load_glove_embeddings(glove_file_path, emb_len=100):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                # Use regex to split the line into word and coefficients
                # regex explanation: match a word followed by one or more spaces and then the coefficients
                match = re.match(r"(.+?)\s+([\d\s\.\-e]+)", line)
                if match:
                    word = match.group(1)
                    coefs = np.fromstring(match.group(2), sep=' ', dtype='float32')

                    # check list length
                    if len(coefs) != emb_len:
                        print('Skip: Length mismatch with line:', line[:100])
                    else:
                        embeddings_index[word] = coefs
                else:
                    print('Error with line:', line[:100])
            except ValueError:
                print('Error with line:', line[:100])
    return embeddings_index
def create_embbedings_matrix(embeddings_glove, max_len=100):
    embeddings_glove['<UNK>'] = np.random.rand(max_len)
    embeddings_glove['<PAD>'] = np.zeros(max_len)
    # Create a word index (vocabulary)
    word_index = {word: idx for idx, word in enumerate(embeddings_glove.keys())}
    # Special tokens are in the word index
    word_index['<UNK>'] = len(word_index) - 2
    word_index['<PAD>'] = len(word_index) - 1
    # print len of word_index
    print(len(word_index))
    # Create an embedding matrix
    embedding_dim = len(next(iter(embeddings_glove.values())))

    embedding_matrix = np.zeros((len(word_index), embedding_dim))

    for word, idx in word_index.items():
        embedding_vector = embeddings_glove.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    # Convert the embedding matrix to a tensor
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
    return embedding_matrix, word_index


def get_embedding_matrix(gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100):
    embeddings_glove = load_glove_embeddings(gloVe_path, emb_len=emb_len)

    embedding_matrix, word_index = create_embbedings_matrix(embeddings_glove)

    vocab_size, d_model = embedding_matrix.size()
    print(f"vocab_size: {vocab_size}, d_model: {d_model}")

    return embedding_matrix, word_index, vocab_size, d_model
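# NOTE: A short sketch of how the returned matrix can back an embedding layer; the
# freeze=False choice mirrors the transformer notebook further down and is an assumption here.
#
#   import torch.nn as nn
#
#   embedding_matrix, word_index, vocab_size, d_model = get_embedding_matrix(
#       gloVe_path='glove.6B/glove.6B.100d.txt', emb_len=100)
#   embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)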
def load_preprocess_data(path_data='data/hack.csv', verbose=False):
    # Load the data
    df = pd.read_csv(path_data)
    # Drop rows with missing values in the target column
    df = df.dropna(subset=['humor_rating'])
    # Extract the target variable from the 'humor_rating' column
    df['y'] = df['humor_rating'].astype(float)  # make sure the target variable is numeric
    # Assign input texts and target variable
    X = df['text']
    y = df['y']
    if verbose:
        print(f"First target values: {y.head(10)}")
        print(f"Dtype of the target variable: {y.dtype}")
        print(f"Number of examples: {len(X)}")
    return X, y
def split_data(X, y, test_size=0.1, val_size=0.1):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
    val_split_ratio = val_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1 - val_split_ratio, random_state=42)

    ret_dict = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
        'val': {'X': X_val, 'y': y_val}
    }

    # print the length of each split
    for key in ret_dict.keys():
        print(key, len(ret_dict[key]['X']), len(ret_dict[key]['y']))

    return ret_dict
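# NOTE: With the defaults test_size=0.1 and val_size=0.1, the first split holds out 20% of
# the data and the second split divides that holdout in half, so the final proportions are
# roughly 80/10/10. A quick sketch with made-up data:
#
#   X_demo = pd.Series([f"joke {i}" for i in range(1000)])
#   y_demo = pd.Series(np.random.rand(1000))
#   splits = split_data(X_demo, y_demo)   # prints: train 800 800 / test 100 100 / val 100 100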
File diff suppressed because it is too large.

129  ml_evaluation.py
@ -1,129 +0,0 @@
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd
import matplotlib.patches as mpatches


def get_accuracy(outputs, labels):
    correct = np.array([p == l for p, l in zip(outputs, labels)])
    accuracy = correct.sum() / len(labels)
    return accuracy


def get_f1_score(outputs, labels):
    outputs = torch.tensor(outputs)
    labels = torch.tensor(labels)
    f1 = f1_score(labels, outputs)
    return f1


def plot_confusion_matrix(outputs, labels, class_names=['No Humor', 'Humor'], title='Confusion Matrix'):
    conf_matrix = confusion_matrix(labels, outputs)

    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    return plt
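# NOTE: A small self-contained usage sketch; the labels and predictions below are made up.
#
#   labels = [0, 1, 1, 0, 1]
#   preds  = [0, 1, 0, 0, 1]
#   print(get_accuracy(preds, labels))    # 0.8
#   print(get_f1_score(preds, labels))    # 0.8 (precision 1.0, recall 2/3)
#   plot_confusion_matrix(preds, labels).show()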
def get_label_distribution(labels, preds):
    # Calculate wrong predictions
    wrong_preds = np.array(labels) != np.array(preds)

    # Calculate the number of wrong predictions for each class
    class_0_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 0)
    class_1_wrong_preds = np.sum(np.array(labels)[wrong_preds] == 1)
    # Calculate the total number of wrong predictions
    total_wrong_preds = np.sum(wrong_preds)
    # Calculate and print the ratio of wrong predictions for each class
    class_0_ratio = class_0_wrong_preds / total_wrong_preds
    class_1_ratio = class_1_wrong_preds / total_wrong_preds

    print(f"Class 0: {class_0_ratio:.2f}")
    print(f"Class 1: {class_1_ratio:.2f}")
def plot_training_history(history, title='Training History'):
    hist_data = history.get_history()

    epochs = range(1, len(hist_data['train_loss']) + 1)

    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Plot accuracy
    axs[1].plot(epochs, hist_data['train_acc'], label='Train Accuracy')
    axs[1].plot(epochs, hist_data['val_acc'], label='Validation Accuracy')
    axs[1].set_title('Accuracy')
    axs[1].set_xlabel('Epochs')
    axs[1].set_ylabel('Accuracy')
    axs[1].legend()

    # Plot loss
    axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
    axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
    axs[0].set_title('Loss')
    axs[0].set_xlabel('Epochs')
    axs[0].set_ylabel('Loss')
    axs[0].legend()

    plt.tight_layout()
    plt.suptitle(title)
    return plt
def load_data(filepath):
    """
    Load the data from a CSV file.
    """
    df = pd.read_csv(filepath)
    #print(df.shape)
    return df


def process_data(df, test_dataset, all_preds, all_labels):
    """
    Process the data to prepare it for plotting.
    """
    df_test = df.iloc[test_dataset.original_indices].copy()
    df_test['prediction'] = all_preds
    df_test['label'] = all_labels
    df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])
    df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)
    return df_test_sorted


def plot_rating_df_based(df_test_sorted, title='Humor Rating vs Prediction for Test Set'):
    """
    Plot the results of the predictions.
    """
    median_rating = df_test_sorted['humor_rating'].median()
    median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]
    #print(median_idx)

    range_idx = range(len(df_test_sorted))
    colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})

    plt.figure(figsize=(12, 6))
    plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)
    plt.axvline(x=median_idx, color='black', linestyle='--')

    green_patch = mpatches.Patch(color='g', label='Correct Prediction')
    red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')
    line_patch = mpatches.Patch(color='black', label='humor_rating cut off')

    plt.title(title)
    plt.xlabel('Index')
    plt.ylabel('Humor Rating')
    plt.legend(handles=[green_patch, red_patch, line_patch])
    return plt


def plot_rating_preds(all_preds, all_labels,
                      test_dataset,
                      title='Humor Rating vs Prediction for Test Set',
                      data_path='data/hack.csv'):

    data = load_data(data_path)
    df_test_sorted = process_data(data, test_dataset, all_preds, all_labels)
    plt = plot_rating_df_based(df_test_sorted, title=title)
    return plt
23  ml_helper.py
@ -5,40 +5,41 @@ import time
import json
import os

def get_device(verbose=False):
def get_device(verbose=False, include_mps=False):
    """
    Get the current device (MPS, CPU or GPU) for PyTorch.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print('Using device:', device)
    if include_mps:
        device = torch.device("mps" if torch.backends.mps.is_available() else device)
    return device

def save_model_and_hyperparameters(model, model_prefix_name, accuracy, timestamp=None, **kwargs):
def save_model_and_hyperparams(model, model_prefix_name, rmse, hyperparameters, timestamp=None):
    """
    Save the model and hyperparameters to disk.
    **kwargs: hyperparameters to save
    hyperparameters: dictionary containing hyperparameters to save
    """
    # Create a timestamp
    if timestamp is None:
        timestamp = time.strftime("%Y%m%d-%H%M%S")

    accuracy = round(accuracy, 4)

    rmse = round(rmse, 4)

    # Save the model state dictionary
    model_path = f'models/{model_prefix_name}_acc_{accuracy}_{timestamp}.pth'
    model_path = f'models/{model_prefix_name}_acc_{rmse}_{timestamp}.pth'
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}.")

    # Save the hyperparameters as a JSON file
    hyperparameters = kwargs
    hyperparameters['accuracy'] = accuracy
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{accuracy}_{timestamp}.json'
    hyperparameters['rmse'] = rmse
    hyperparameters_path = f'models/{model_prefix_name}_para_acc_{rmse}_{timestamp}.json'
    with open(hyperparameters_path, 'w') as f:
        json.dump(hyperparameters, f)
    print(f"Hyperparameters saved to {hyperparameters_path}.")

def get_newest_model_path(path, name=None, extension=".pth"):
def get_newest_file(path, name=None, extension=".pth"):
    """
    Get the newest file in a directory.
    """
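# NOTE: Assuming the renamed signature above, a call might look like this; the model variable
# and the numeric values are placeholders, not taken from the repository.
#
#   params = {"learning_rate": 1e-3, "batch_size": 32}
#   save_model_and_hyperparams(model, "BERT", rmse=0.42, hyperparameters=params)
#   # -> models/BERT_acc_0.42_<timestamp>.pth and models/BERT_para_acc_0.42_<timestamp>.json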
103  ml_history.py
@ -1,70 +1,115 @@
import numpy as np
import torch
from sklearn.metrics import mean_squared_error

from datetime import datetime
import json
import os


class History:
    """
    Class to store the history of the training process.
    Used to store the loss and accuracy of the training and validation sets.
    Used to store the loss and rmse of the training and validation sets.
    """
    def __init__(self):
        self.history = {
            'train_loss': [],
            'val_loss': [],

            'train_acc': [],
            'val_acc': [],
            'train_rmse': [],
            'val_rmse': [],

            'val_labels': [],
            # val_preds contains structs {epoch: [preds], ...}
            'val_preds': [],

            # only needed in the end not in training
            'test_labels': [],
            'test_preds': [],
        }
        self.batch_history = {
            'train_loss': [],
            'val_loss': [],

            'train_acc': [],
            'val_acc': [],
            'train_rmse': [],
            'val_rmse': [],
        }

    def update(self):
        self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
        self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
        self.history['train_acc'].append(np.mean(self.batch_history['train_acc']))
        self.history['val_acc'].append(np.mean(self.batch_history['val_acc']))
        if self.batch_history['train_loss']:
            self.history['train_loss'].append(np.mean(self.batch_history['train_loss']))
        if self.batch_history['val_loss']:
            self.history['val_loss'].append(np.mean(self.batch_history['val_loss']))
        if self.batch_history['train_rmse']:
            self.history['train_rmse'].append(np.mean(self.batch_history['train_rmse']))
        if self.batch_history['val_rmse']:
            self.history['val_rmse'].append(np.mean(self.batch_history['val_rmse']))

    def get_history(self):
        return self.history

    def calculate_accuracy(self, outputs, labels):
        preds = torch.argmax(outputs, dim=1)
        correct = (preds == labels).sum().item()
        accuracy = correct / len(labels)
        return accuracy
    def calculate_rmse(self, outputs, labels):
        return np.sqrt(mean_squared_error(labels, outputs))

    def batch_reset(self):
        self.batch_history = {
            'train_loss': [],
            'val_loss': [],
            'train_acc': [],
            'val_acc': [],
            'train_rmse': [],
            'val_rmse': [],
        }

    def batch_update(self, train_loss, val_loss, train_acc, val_acc):
    def batch_update(self, train_loss, val_loss, train_rmse, val_rmse):
        self.batch_history['train_loss'].append(train_loss)
        self.batch_history['val_loss'].append(val_loss)
        self.batch_history['train_acc'].append(train_acc)
        self.batch_history['val_acc'].append(val_acc)
        self.batch_history['train_rmse'].append(train_rmse)
        self.batch_history['val_rmse'].append(val_rmse)

    def batch_update_train(self, train_loss, preds, labels):
        train_acc = self.calculate_accuracy(preds, labels)
        train_rmse = self.calculate_rmse(preds, labels)
        self.batch_history['train_loss'].append(train_loss)
        self.batch_history['train_acc'].append(train_acc)
        self.batch_history['train_rmse'].append(train_rmse)

    def batch_update_val(self, val_loss, preds, labels):
        val_acc = self.calculate_accuracy(preds, labels)
    def batch_update_val(self, val_loss, preds, labels, epoch):
        val_rmse = self.calculate_rmse(preds, labels)
        self.batch_history['val_loss'].append(val_loss)
        self.batch_history['val_acc'].append(val_acc)
        self.batch_history['val_rmse'].append(val_rmse)

        self.history['val_labels'] = labels.tolist()
        self.history['val_preds'].append({epoch: preds.tolist()})


    def get_batch_history(self):
        return self.batch_history

    def print_history(self, epoch, max_epochs, time_elapsed, verbose=True):
        if verbose:
            print(f'Epoch {epoch:>3}/{max_epochs} - {time_elapsed:.2f}s - loss: {self.history["train_loss"][-1]:.4f} - accuracy: {self.history["train_acc"][-1]:.4f} - val_loss: {self.history["val_loss"][-1]:.4f} - val_accuracy: {self.history["val_acc"][-1]:.4f}')

    def add_test_results(self, test_labels, test_preds):
        self.history['test_labels'] = test_labels
        self.history['test_preds'] = test_preds


    def convert_hist(self):
        # Needed for saving the history to a json file:
        # convert numpy arrays to lists and use float instead of numpy float
        history_to_save = {}
        for hist_key, hist_val in self.history.items():
            if hist_key == 'val_preds':
                history_to_save[hist_key] = [{k: [float(x) for x in v] for k, v in val.items()} for val in hist_val]
            else:
                history_to_save[hist_key] = [float(x) for x in hist_val]

        return history_to_save

    def save_history(self, hist_name):
        directory = "histories"
        if not os.path.exists(directory):
            os.makedirs(directory)  # Create the directory if it does not exist
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = os.path.join(directory, f"{hist_name}_{timestamp}.json")

        # Needed for saving the history to a json file:
        # convert numpy arrays to lists and use float instead of numpy float
        history_to_save = self.convert_hist()

        with open(filepath, 'w') as f:
            json.dump(history_to_save, f, indent=4)
        print(f"History saved to {filepath}")
@ -0,0 +1,75 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time


def save_plot(plt, plot_name):
    if not os.path.exists('plots'):
        os.makedirs('plots')
    # create timestamp
    time_stamp = time.strftime('%Y%m%d-%H%M%S')
    plt.savefig(f'plots/{plot_name}_{time_stamp}.png')


def plot_training_history(hist_data, title='Training History', save=True):

    epochs = range(1, len(hist_data['train_loss']) + 1)

    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Plot RMSE
    axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE')
    axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE')
    axs[1].set_title('RMSE')
    axs[1].set_xlabel('Epochs')
    axs[1].set_ylabel('RMSE')
    axs[1].legend()

    # Plot loss
    axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss')
    axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss')
    axs[0].set_title('Loss')
    axs[0].set_xlabel('Epochs')
    axs[0].set_ylabel('Loss')
    axs[0].legend()

    plt.tight_layout()
    plt.suptitle(title)

    # save plot
    if save:
        save_plot(plt, title)
    return plt


def plot_distribution(true_values, predicted_values, title='Distribution of Predicted and True Values', save=True):
    plt.figure(figsize=(10, 6))
    plt.hist(true_values, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
    plt.hist(predicted_values, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
    plt.title(title)
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    # save plot
    if save:
        save_plot(plt, title)
    return plt


def plot_predictions(true_values, predicted_values, title='True vs Predicted Values', threshold=0.3, save=True):
    plt.figure(figsize=(10, 6))
    # Difference between predicted and true values
    correct_indices = np.isclose(true_values, predicted_values, atol=threshold)
    incorrect_indices = ~correct_indices
    # Plot
    plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices], color='green', label='Correctly predicted')
    plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices], color='red', label='Incorrectly predicted')
    plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='blue', linestyle='--', label='Ideal Line')
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    # save plot
    if save:
        save_plot(plt, title)
    return plt
@ -0,0 +1,87 @@
from tqdm import tqdm
import torch
import numpy as np


def train_epoch(model, train_loader, criterion, optimizer, device, history, epoch, total_epochs, bert_freeze=False, is_bert=False):
    model.train()
    if bert_freeze and hasattr(model, 'freeze_bert_params'):
        model.freeze_bert_params()

    with tqdm(train_loader, desc=f"├ Epoch {epoch + 1}/{total_epochs}") as pbar:
        for batch in pbar:
            optimizer.zero_grad()
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device).float()
                predictions = model(input_ids, attention_mask=attention_mask).float()
            else:
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
                predictions = model(X_batch).float()
                labels = y_batch

            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            preds = predictions.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            history.batch_update_train(loss.item(), preds, labels)

            # Update progress bar
            pbar.set_postfix({"Train Loss": loss.item()})

    history.update()
    history.batch_reset()


def validate_epoch(model, val_loader, epoch, criterion, device, history, is_bert=False):
    model.eval()
    val_loss = 0.0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device).float()
                predictions = model(input_ids, attention_mask=attention_mask).float()
            else:
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
                labels = y_batch
                predictions = model(X_batch).float()
            loss = criterion(predictions, labels)
            val_loss += loss.item()
            val_preds.extend(predictions.cpu().detach().numpy())
            val_labels.extend(labels.cpu().detach().numpy())

    val_rmse = history.calculate_rmse(np.array(val_preds), np.array(val_labels))
    history.batch_update_val(val_loss / len(val_loader), np.array(val_preds), np.array(val_labels), epoch)
    history.update()
    history.batch_reset()

    return val_rmse


def test_loop(model, test_loader, device, is_bert=False):
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device).float()
                predictions = model(input_ids, attention_mask=attention_mask).float()
            else:
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
                labels = y_batch
                predictions = model(X_batch).float()
            test_preds.extend(predictions.cpu().detach().numpy())
            test_labels.extend(labels.cpu().detach().numpy())

    return test_labels, test_preds
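# NOTE: A minimal sketch of how these loops are typically driven together with History and
# EarlyStopping. The optimizer choice, the EarlyStopping call interface (the __call__ and
# early_stop attribute), and the model/loader variables are assumptions, not part of this file.
#
#   import torch.nn as nn
#   import torch.optim as optim
#   import ml_history
#   import EarlyStopping
#
#   history = ml_history.History()
#   early_stopping = EarlyStopping.EarlyStopping(patience=7, verbose=False)
#   criterion = nn.MSELoss()
#   optimizer = optim.AdamW(model.parameters(), lr=1e-3)
#
#   for epoch in range(epochs):
#       train_epoch(model, train_loader, criterion, optimizer, device, history,
#                   epoch, epochs, is_bert=True)
#       val_rmse = validate_epoch(model, val_loader, epoch, criterion, device, history, is_bert=True)
#       early_stopping(val_rmse, model)       # assumed callable interface
#       if early_stopping.early_stop:         # assumed attribute
#           break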
@ -0,0 +1,24 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "# TODO: compare"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
File diffs suppressed because one or more lines are too long.
@ -1,187 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the data\n",
|
||||
"with open('data/pun_anno/pun_het.json') as f:\n",
|
||||
" data_het = json.load(f)\n",
|
||||
"\n",
|
||||
"with open('data/pun_anno/pun_hom.json') as f:\n",
|
||||
" data_hom = json.load(f)\n",
|
||||
"\n",
|
||||
"with open('data/pun_annotated.json') as f:\n",
|
||||
" data_anno = json.load(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a DataFrame\n",
|
||||
"df_anno = pd.DataFrame(data_anno)\n",
|
||||
"\n",
|
||||
"df_het = pd.DataFrame(data_het)\n",
|
||||
"# df switch columns to rows\n",
|
||||
"df_het = df_het.T\n",
|
||||
"\n",
|
||||
"df_hom = pd.DataFrame(data_hom)\n",
|
||||
"# df switch columns to rows\n",
|
||||
"df_hom = df_hom.T"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 hom_362\n",
|
||||
"1 het_837\n",
|
||||
"2 het_635\n",
|
||||
"3 hom_657\n",
|
||||
"4 het_1275\n",
|
||||
" ... \n",
|
||||
"1894 hom_2076\n",
|
||||
"1895 hom_1437\n",
|
||||
"1896 het_1530\n",
|
||||
"1897 het_100\n",
|
||||
"1898 hom_364\n",
|
||||
"Name: ID, Length: 1899, dtype: object\n",
|
||||
"Index(['het_991', 'het_990', 'het_987', 'het_982', 'het_980', 'het_978',\n",
|
||||
" 'het_973', 'het_958', 'het_956', 'het_955',\n",
|
||||
" ...\n",
|
||||
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
|
||||
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
|
||||
" dtype='object', length=1146)\n",
|
||||
"Index(['hom_998', 'hom_996', 'hom_994', 'hom_993', 'hom_992', 'hom_990',\n",
|
||||
" 'hom_99', 'hom_985', 'hom_984', 'hom_981',\n",
|
||||
" ...\n",
|
||||
" 'hom_2221', 'hom_2223', 'hom_2225', 'hom_2226', 'hom_2230', 'hom_2232',\n",
|
||||
" 'hom_2234', 'hom_2243', 'hom_2246', 'hom_2247'],\n",
|
||||
" dtype='object', length=1443)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# print index for each df\n",
|
||||
"print(df_anno['ID'])\n",
|
||||
"print(df_het.index)\n",
|
||||
"print(df_hom.index)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(655, 8) (1146, 11) (1899, 8)\n",
|
||||
"(825, 8) (1443, 11) (1899, 8)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# find matches from df_anno['ID'] to df_het.index\n",
|
||||
"df_het_match = df_anno[df_anno['ID'].isin(df_het.index)]\n",
|
||||
"print(df_het_match.shape, df_het.shape, df_anno.shape)\n",
|
||||
"\n",
|
||||
"# find matches from df_anno['ID'] to df_hom.index\n",
|
||||
"df_hom_match = df_anno[df_anno['ID'].isin(df_hom.index)]\n",
|
||||
"print(df_hom_match.shape, df_hom.shape, df_anno.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 hom_362\n",
|
||||
"3 hom_657\n",
|
||||
"6 hom_1510\n",
|
||||
"7 hom_955\n",
|
||||
"8 hom_1505\n",
|
||||
" ... \n",
|
||||
"1893 hom_151\n",
|
||||
"1894 hom_2076\n",
|
||||
"1895 hom_1437\n",
|
||||
"1896 het_1530\n",
|
||||
"1898 hom_364\n",
|
||||
"Name: ID, Length: 1244, dtype: object\n",
|
||||
"Index(['het_955', 'het_907', 'het_905', 'het_786', 'het_783', 'het_777',\n",
|
||||
" 'het_639', 'het_573', 'het_466', 'het_435',\n",
|
||||
" ...\n",
|
||||
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
|
||||
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
|
||||
" dtype='object', length=491)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# print not matched IDs and index\n",
|
||||
"print(df_anno[~df_anno['ID'].isin(df_het.index)]['ID'])\n",
|
||||
"print(df_het.index[~df_het.index.isin(df_anno['ID'])])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# merge df_anno and df_het where ID matches with index\n",
|
||||
"df_het_merge = pd.merge(df_anno, df_het, left_on='ID', right_index=True)\n",
|
||||
"# score_avg \n",
|
||||
"df_het_merge['score_avg'] = df_het_merge['Funniness (1-5)'].apply(lambda x: np.mean(x))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diffs suppressed because they are too large.
128031  puns/pun_annotated.json
File diff suppressed because one or more lines are too long.
@ -1,584 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "KuFFT6LrB6Fe"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import json\n",
|
||||
"import math\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"\n",
|
||||
"from nltk.tokenize import word_tokenize\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
|
||||
"\n",
|
||||
"from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix\n",
|
||||
"from sklearn.model_selection import KFold\n",
|
||||
"# local imports\n",
|
||||
"import ml_evaluation as ml_eval\n",
|
||||
"import ml_helper\n",
|
||||
"import ml_history\n",
|
||||
"import dataset_generator as data_gen\n",
|
||||
"# class imports\n",
|
||||
"import HumorDataset as humor_ds\n",
|
||||
"import EarlyStopping\n",
|
||||
"import BalancedCELoss\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# architecture inspired:\n",
|
||||
"# https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/\n",
|
||||
"\n",
|
||||
"# TODO: maybe KFold for cross validation?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using device: cuda\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"torch.manual_seed(0)\n",
|
||||
"np.random.seed(0)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"best_model_filename = 'best_transformer_reg_model.pt'\n",
|
||||
"\n",
|
||||
"device = ml_helper.get_device(verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load Embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"400002\n",
|
||||
"vocab_size: 400002, d_model: 100\n",
|
||||
"vocab_size: 400002, d_model: 100\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n",
|
||||
"\n",
|
||||
"vocab_size = len(embedding_matrix)\n",
|
||||
"d_model = len(embedding_matrix[0])\n",
|
||||
"vocab_size, d_model = embedding_matrix.size()\n",
|
||||
"print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class PositionalEncoding(nn.Module):\n",
|
||||
" \"\"\"\n",
|
||||
" https://pytorch.org/tutorials/beginner/transformer_tutorial.html\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, d_model, vocab_size=5000, dropout=0.1):\n",
|
||||
" super().__init__()\n",
|
||||
" self.dropout = nn.Dropout(p=dropout)\n",
|
||||
"\n",
|
||||
" pe = torch.zeros(vocab_size, d_model)\n",
|
||||
" position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)\n",
|
||||
" div_term = torch.exp(\n",
|
||||
" torch.arange(0, d_model, 2).float()\n",
|
||||
" * (-math.log(10000.0) / d_model)\n",
|
||||
" )\n",
|
||||
" pe[:, 0::2] = torch.sin(position * div_term)\n",
|
||||
" pe[:, 1::2] = torch.cos(position * div_term)\n",
|
||||
" pe = pe.unsqueeze(0)\n",
|
||||
" self.register_buffer(\"pe\", pe)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = x + self.pe[:, : x.size(1), :]\n",
|
||||
" return self.dropout(x)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class TransformerBinaryClassifier(nn.Module):\n",
|
||||
" \"\"\"\n",
|
||||
" Text classifier based on a pytorch TransformerEncoder.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(\n",
|
||||
" self,\n",
|
||||
" embeddings,\n",
|
||||
" nhead=8,\n",
|
||||
" dim_feedforward=2048,\n",
|
||||
" num_layers=6,\n",
|
||||
" positional_dropout=0.1,\n",
|
||||
" classifier_dropout=0.1,\n",
|
||||
" activation=\"relu\",\n",
|
||||
" ):\n",
|
||||
"\n",
|
||||
" super().__init__()\n",
|
||||
"\n",
|
||||
" vocab_size, d_model = embeddings.size()\n",
|
||||
" assert d_model % nhead == 0, \"nheads must divide evenly into d_model\"\n",
|
||||
"\n",
|
||||
" self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)\n",
|
||||
"\n",
|
||||
" self.pos_encoder = PositionalEncoding(\n",
|
||||
" d_model=d_model,\n",
|
||||
" dropout=positional_dropout,\n",
|
||||
" vocab_size=vocab_size,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" encoder_layer = nn.TransformerEncoderLayer(\n",
|
||||
" d_model=d_model,\n",
|
||||
" nhead=nhead,\n",
|
||||
" dim_feedforward=dim_feedforward,\n",
|
||||
" dropout=classifier_dropout,\n",
|
||||
" )\n",
|
||||
" self.transformer_encoder = nn.TransformerEncoder(\n",
|
||||
" encoder_layer,\n",
|
||||
" num_layers=num_layers,\n",
|
||||
" )\n",
|
||||
" # normalize to stabilize and stop overfitting\n",
|
||||
" self.batch_norm = nn.BatchNorm1d(d_model)\n",
|
||||
" self.classifier = nn.Linear(d_model, 1)\n",
|
||||
" self.d_model = d_model\n",
|
||||
" #self.softmax = nn.Softmax(dim=1)\n",
|
||||
" #self.sigmoid = nn.Sigmoid()\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.emb(x) * math.sqrt(self.d_model)\n",
|
||||
" x = self.pos_encoder(x)\n",
|
||||
" x = self.transformer_encoder(x)\n",
|
||||
" x = x.mean(dim=1)\n",
|
||||
" # normalize to stabilize and stop overfitting\n",
|
||||
" #x = self.batch_norm(x)\n",
|
||||
"\n",
|
||||
" #NOTE: no activation function for regression\n",
|
||||
" # sigmoid would only distort the output\n",
|
||||
" x = self.classifier(x)\n",
|
||||
" \n",
|
||||
" return x\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_preprocess_data(path_data='data/hack.csv'):\n",
|
||||
" df = pd.read_csv(path_data)\n",
|
||||
" df = df.dropna(subset=['humor_rating'])\n",
|
||||
"\n",
|
||||
" df['y'] = df['humor_rating']\n",
|
||||
" X = df['text']\n",
|
||||
" y = df['y']\n",
|
||||
" return X, y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"train 3945 3945\n",
|
||||
"test 494 494\n",
|
||||
"val 493 493\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X,y = load_preprocess_data()\n",
|
||||
"\n",
|
||||
"ret_dict = data_gen.split_data(X, y)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set hyper params"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"model created\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\felix\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\nn\\modules\\transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"params = {\n",
|
||||
" # used for class balancing\n",
|
||||
" 'equalize_classes_loss_factor': 0.15, # 0.15 (0.1 to 0.2)\n",
|
||||
" # training parameters\n",
|
||||
" 'batch_size': 32, # 32 (16 to 64)\n",
|
||||
" 'epochs': 100, # 100\n",
|
||||
" 'lr': 1e-4, # 1e-5 (1e-6 to 1e-3)\n",
|
||||
" \n",
|
||||
" # NOTE: used for gradient clipping (needed for lstm and transformer)\n",
|
||||
" # use 0 to disable\n",
|
||||
" 'clipping_max_norm': 0, # 0 (0.5 to 2.0)\n",
|
||||
" \n",
|
||||
" # patience for early stopping\n",
|
||||
" 'early_stopping_patience': 5, # 5 (3 to 10)\n",
|
||||
"\n",
|
||||
" # learning rate scheduler\n",
|
||||
" 'lr_scheduler_factor': 0.5, # 0.1 (0.05 to 0.2)\n",
|
||||
" 'lr_scheduler_patience': 3, # 3 (2 to 5)\n",
|
||||
"\n",
|
||||
" # model parameters\n",
|
||||
" 'nhead': 2, # 5\n",
|
||||
" 'num_layers': 3, # 6\n",
|
||||
" 'hidden_dim': 10, # 50\n",
|
||||
"\n",
|
||||
" # regularization parameters\n",
|
||||
" 'positional_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
|
||||
" 'classifier_dropout': 0.5, # 0.1 (0.1 to 0.5)\n",
|
||||
" 'weight_decay': 1e-2 # 0.0 (1e-6 to 1e-2)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Model initialization\n",
|
||||
"model = TransformerBinaryClassifier(embeddings=embedding_matrix, \n",
|
||||
" nhead=params['nhead'], \n",
|
||||
" num_layers=params['num_layers'], \n",
|
||||
" dim_feedforward=params['hidden_dim'],\n",
|
||||
" positional_dropout=params['positional_dropout'],\n",
|
||||
" classifier_dropout=params['classifier_dropout']\n",
|
||||
" )\n",
|
||||
"model.to(device)\n",
|
||||
"print('model created')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### create datasets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"datasets length: 3945 493\n",
|
||||
"train: 124, val: 16, test: 16\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NOTE: Info comes from data explore notebook: 280 is max length,\n",
|
||||
"# 139 contains 80% and 192 contains 95% of the data\n",
|
||||
"max_len = 280\n",
|
||||
"\n",
|
||||
"train_dataset = humor_ds.TextDataset(ret_dict['train']['X'], ret_dict['train']['y'], word_index, max_len=max_len)\n",
|
||||
"val_dataset = humor_ds.TextDataset(ret_dict['val']['X'], ret_dict['val']['y'], word_index, max_len=max_len)\n",
|
||||
"test_dataset = humor_ds.TextDataset(ret_dict['test']['X'], ret_dict['test']['y'], word_index, max_len=max_len)\n",
|
||||
"\n",
|
||||
"print('datasets length:', len(train_dataset), len(val_dataset))\n",
|
||||
"#NOTE: overfitting test\n",
|
||||
"#train_dataset.labels = train_dataset.labels[:100]\n",
|
||||
"#train_dataset.texts = train_dataset.texts[:100]\n",
|
||||
"\n",
|
||||
"train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n",
|
||||
"val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)\n",
|
||||
"test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n",
|
||||
"\n",
|
||||
"# NOTE: samller because of batches not all data\n",
|
||||
"print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set training requirements"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#TODO: change to RMSE\n",
|
||||
"\"\"\"\n",
|
||||
"criterion = nn.MSELoss()\n",
|
||||
"loss = torch.sqrt(criterion(x, y))\n",
|
||||
"loss.backward()\n",
|
||||
"print(x.grad)\n",
|
||||
"\"\"\"\n",
|
||||
"criterion = nn.MSELoss()\n",
|
||||
"\n",
|
||||
"optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), \n",
|
||||
" lr=params['lr']) #, \n",
|
||||
" #weight_decay=params['weight_decay'])\n",
|
||||
"\"\"\"\n",
|
||||
"scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', \n",
|
||||
" factor=params['lr_scheduler_factor'],\n",
|
||||
" patience=params['lr_scheduler_patience'],\n",
|
||||
" verbose=True)\n",
|
||||
"\"\"\"\n",
|
||||
"early_stopping = EarlyStopping.EarlyStopping(patience=params['early_stopping_patience'], verbose=False)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Training loop"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 1/100, Train Loss: 1.8054, Val Loss: 1.8873, Time: 2.55s\n",
"Epoch 2/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.23s\n",
"Epoch 3/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 4/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 5/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 6/100, Train Loss: 1.8138, Val Loss: 1.8873, Time: 2.21s\n",
"Epoch 7/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 8/100, Train Loss: 1.8110, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 9/100, Train Loss: 1.8102, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 10/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 11/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 12/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 13/100, Train Loss: 1.8050, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 14/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 15/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.29s\n",
"Epoch 16/100, Train Loss: 1.8097, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 17/100, Train Loss: 1.8081, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 18/100, Train Loss: 1.8078, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 19/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 20/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 21/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 22/100, Train Loss: 1.8103, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 23/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 24/100, Train Loss: 1.8034, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 25/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.46s\n",
"Epoch 26/100, Train Loss: 1.8084, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 27/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 28/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 29/100, Train Loss: 1.8136, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 30/100, Train Loss: 1.8051, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 31/100, Train Loss: 1.8026, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 32/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.16s\n",
"Epoch 33/100, Train Loss: 1.8121, Val Loss: 1.8873, Time: 2.13s\n",
"Epoch 34/100, Train Loss: 1.8098, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 35/100, Train Loss: 1.8036, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 36/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 37/100, Train Loss: 1.8108, Val Loss: 1.8873, Time: 2.50s\n",
"Epoch 38/100, Train Loss: 1.8082, Val Loss: 1.8873, Time: 2.45s\n",
"Epoch 39/100, Train Loss: 1.8134, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 40/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 41/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 42/100, Train Loss: 1.8088, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 43/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 44/100, Train Loss: 1.8029, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 45/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 46/100, Train Loss: 1.8091, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 47/100, Train Loss: 1.8048, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 48/100, Train Loss: 1.8069, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 49/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 50/100, Train Loss: 1.8028, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 51/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 52/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 53/100, Train Loss: 1.8075, Val Loss: 1.8873, Time: 2.00s\n",
"Epoch 54/100, Train Loss: 1.8087, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 55/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.02s\n",
"Epoch 56/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 57/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 58/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 59/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 60/100, Train Loss: 1.8100, Val Loss: 1.8873, Time: 2.05s\n",
"Epoch 61/100, Train Loss: 1.8063, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 62/100, Train Loss: 1.8068, Val Loss: 1.8873, Time: 2.22s\n",
"Epoch 63/100, Train Loss: 1.8012, Val Loss: 1.8873, Time: 2.32s\n",
"Epoch 64/100, Train Loss: 1.8079, Val Loss: 1.8873, Time: 2.35s\n",
"Epoch 65/100, Train Loss: 1.8109, Val Loss: 1.8873, Time: 2.36s\n",
"Epoch 66/100, Train Loss: 1.8030, Val Loss: 1.8873, Time: 2.28s\n",
"Epoch 67/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.24s\n",
"Epoch 68/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 69/100, Train Loss: 1.8115, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 70/100, Train Loss: 1.8019, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 71/100, Train Loss: 1.8025, Val Loss: 1.8873, Time: 2.19s\n",
"Epoch 72/100, Train Loss: 1.8124, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 73/100, Train Loss: 1.8086, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 74/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.06s\n",
"Epoch 75/100, Train Loss: 1.8049, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 76/100, Train Loss: 1.8059, Val Loss: 1.8873, Time: 2.38s\n",
"Epoch 77/100, Train Loss: 1.8141, Val Loss: 1.8873, Time: 2.39s\n",
"Epoch 78/100, Train Loss: 1.8092, Val Loss: 1.8873, Time: 2.44s\n",
"Epoch 79/100, Train Loss: 1.8106, Val Loss: 1.8873, Time: 2.30s\n",
"Epoch 80/100, Train Loss: 1.8125, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 81/100, Train Loss: 1.8142, Val Loss: 1.8873, Time: 2.26s\n",
"Epoch 82/100, Train Loss: 1.8073, Val Loss: 1.8873, Time: 2.08s\n",
"Epoch 83/100, Train Loss: 1.8064, Val Loss: 1.8873, Time: 2.14s\n",
"Epoch 84/100, Train Loss: 1.8085, Val Loss: 1.8873, Time: 2.15s\n",
"Epoch 85/100, Train Loss: 1.8080, Val Loss: 1.8873, Time: 2.17s\n",
"Epoch 86/100, Train Loss: 1.8096, Val Loss: 1.8873, Time: 2.12s\n",
"Epoch 87/100, Train Loss: 1.8083, Val Loss: 1.8873, Time: 2.09s\n",
"Epoch 88/100, Train Loss: 1.8093, Val Loss: 1.8873, Time: 2.11s\n",
"Epoch 89/100, Train Loss: 1.8101, Val Loss: 1.8873, Time: 2.25s\n",
"Epoch 90/100, Train Loss: 1.8047, Val Loss: 1.8873, Time: 2.42s\n",
"Epoch 91/100, Train Loss: 1.8056, Val Loss: 1.8873, Time: 2.34s\n",
"Epoch 92/100, Train Loss: 1.8090, Val Loss: 1.8873, Time: 2.37s\n",
"Epoch 93/100, Train Loss: 1.8107, Val Loss: 1.8873, Time: 2.20s\n",
"Epoch 94/100, Train Loss: 1.8031, Val Loss: 1.8873, Time: 2.18s\n",
"Epoch 95/100, Train Loss: 1.8032, Val Loss: 1.8873, Time: 2.07s\n",
"Epoch 96/100, Train Loss: 1.8062, Val Loss: 1.8873, Time: 2.20s\n"
]
}
],
"source": [
"# Training loop\n",
"\n",
"for epoch in range(params['epochs']):\n",
"    epoch_start_time = time.time()\n",
"    model.train()\n",
"    \n",
"    train_loss = 0.0\n",
"    \n",
" for batch in train_loader:\n",
"        optimizer.zero_grad()\n",
"        input_ids, labels = batch\n",
"        input_ids, labels = input_ids.to(device), labels.to(device).float()\n",
"\n",
"        outputs = model(input_ids)\n",
"        outputs = outputs.squeeze().float()\n",
"        loss = criterion(outputs, labels)\n",
"        loss.backward()\n",
"        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])\n",
"        optimizer.step()\n",
"        preds = outputs\n",
"        \n",
" train_loss += loss.item()\n",
"\n",
"    train_loss /= len(train_loader)\n",
"    \n",
" # Validation\n",
"    model.eval()\n",
"    val_loss = 0.0\n",
"    \n",
"    with torch.no_grad():\n",
"        for batch in val_loader:\n",
"            input_ids, labels = batch\n",
"            input_ids, labels = input_ids.to(device), labels.to(device).float()\n",
"            outputs = model(input_ids)\n",
"            outputs = outputs.squeeze().float()\n",
"            loss = criterion(outputs, labels)\n",
"            preds = outputs\n",
"            \n",
"            val_loss += loss.item()\n",
"\n",
"    val_loss /= len(val_loader)\n",
"    \n",
"    epoch_end_time = time.time()\n",
"    \n",
"    print(f'Epoch {epoch+1}/{params[\"epochs\"]}, '\n",
"          f'Train Loss: {train_loss:.4f}, '\n",
"          f'Val Loss: {val_loss:.4f}, '\n",
"          f'Time: {epoch_end_time - epoch_start_time:.2f}s')\n",
"\n",
" "
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}