klara 2025-02-15 16:14:09 +01:00
commit e308a148aa
10 changed files with 2333 additions and 1700 deletions

417
BertFine.ipynb 100644

File diff suppressed because one or more lines are too long

227
CNN_CLASS.py 100644
View File

@ -0,0 +1,227 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from dataset_generator import create_embedding_matrix, split_data, load_preprocess_data
from HumorDataset import TextDataset
from BalancedCELoss import BalancedCELoss
import matplotlib.pyplot as plt
import numpy as np
# Hyperparameters and configuration shared by the model, the data pipeline and the optimizer below.
params = {
"embedding_dim": 100,
"filter_sizes": [2, 3, 4, 5],
"num_filters": 150,
"batch_size": 32,
"learning_rate": 0.001,
"epochs": 25,
"glove_path": 'data/glove.6B.100d.txt',
"max_len": 280,
"test_size": 0.1,
"val_size": 0.1,
"patience": 5,
"data_path": 'data/hack.csv',
"dropout": 0.6,
"weight_decay": 5e-4,
"alpha": 0.1 # alpha passed to BalancedCELoss (balance term of the loss function)
}
# CNN-Modell für binäre Klassifikation
class EnhancedCNNBinaryClassifier(nn.Module):
    """Multi-kernel text CNN that emits two class logits per input sequence.

    Each filter size gets its own Conv2d -> BatchNorm2d -> ReLU -> global
    max-pool -> Dropout branch; the pooled branches are concatenated and fed
    through two linear layers.

    Args:
        vocab_size: Accepted for interface compatibility; the vocabulary size
            is actually taken from ``embedding_matrix``.
        embedding_dim: Width of one embedding vector (the Conv2d kernel width).
        filter_sizes: Iterable of kernel heights (number of tokens per window).
        num_filters: Number of output channels per convolution branch.
        embedding_matrix: Pre-trained embedding weights handed to
            ``nn.Embedding.from_pretrained``; fine-tuned (``freeze=False``).
        dropout: Dropout probability used in every branch and before ``fc2``.
    """

    def __init__(self, vocab_size, embedding_dim, filter_sizes, num_filters, embedding_matrix, dropout):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filters, (fs, embedding_dim)),
                nn.BatchNorm2d(num_filters),
                nn.ReLU(),
                # Global max over all window positions. Unlike a fixed
                # MaxPool2d kernel this does not depend on a module-level
                # max_len, so any sequence at least as long as the largest
                # filter size is accepted.
                nn.AdaptiveMaxPool2d((1, 1)),
                nn.Dropout(dropout),
            )
            for fs in filter_sizes
        ])
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)
        self.fc2 = nn.Linear(128, 2)  # two classes -> two logits for a cross-entropy style loss
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw logits of shape (batch, 2) for a batch of token-id sequences."""
        # Add a singleton channel axis so Conv2d sees (batch, 1, seq_len, embedding_dim).
        x = self.embedding(x).unsqueeze(1)
        # Each branch yields (batch, num_filters, 1, 1); drop the two pooled axes.
        conv_outputs = [conv(x).squeeze(3).squeeze(2) for conv in self.convs]
        x = torch.cat(conv_outputs, 1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)  # no softmax here; the loss is expected to apply it
# Visualisierungsfunktionen
def visualize_predictions(true_values, predicted_values):
    """Scatter the true class of every sample, coloured by prediction outcome.

    Samples whose predicted label equals the true label are drawn in green,
    all others in red. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))

    actual = np.array(true_values)
    predicted = np.array(predicted_values)
    is_correct = actual == predicted
    sample_index = np.arange(len(actual))

    # Correct points are drawn first so the legend order stays green, red.
    series = (
        (is_correct, 'green', 'Richtig vorhergesagt'),
        (~is_correct, 'red', 'Falsch vorhergesagt'),
    )
    for mask, colour, legend_text in series:
        plt.scatter(
            sample_index[mask],
            actual[mask],
            color=colour,
            label=legend_text
        )

    # Horizontal separator between the two class rows.
    plt.axhline(0.5, linestyle='--', color='blue', label='Schwelle (0.5)')
    plt.ylim(-0.5, 1.5)
    plt.yticks([0, 1], labels=['Klasse 0', 'Klasse 1'])
    plt.xlabel('Datenindex')
    plt.ylabel('Klassifikation')
    plt.title('Richtige vs. Falsche Vorhersagen')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
def visualize_distribution(true_values, predicted_values):
    """Show a grouped bar chart of class frequencies for true and predicted labels.

    Args:
        true_values: Sequence of true class labels (non-negative integers).
        predicted_values: Sequence of predicted class labels.
    """
    plt.figure(figsize=(10, 6))

    # Count class frequencies. The explicit integer cast keeps np.bincount
    # working for empty inputs, which NumPy would otherwise turn into a
    # float64 array and reject.
    true_counts = np.bincount(np.asarray(true_values, dtype=np.int64), minlength=2)
    predicted_counts = np.bincount(np.asarray(predicted_values, dtype=np.int64), minlength=2)

    # Two bars per class, shifted left/right of the tick position.
    labels = ['Klasse 0', 'Klasse 1']
    x = np.arange(len(labels))
    plt.bar(x - 0.2, true_counts, width=0.4, color='skyblue', label='Wahre Werte', edgecolor='black')
    plt.bar(x + 0.2, predicted_counts, width=0.4, color='salmon', label='Vorhergesagte Werte', edgecolor='black')

    plt.title('Verteilung der wahren Werte und Vorhersagen')
    plt.xticks(x, labels)
    plt.ylabel('Häufigkeit')
    plt.xlabel('Klassen')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
# Select the compute device (CUDA when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the embedding matrix / vocabulary and the raw data
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path=params["glove_path"], emb_len=params["embedding_dim"]
)
X, y = load_preprocess_data(path_data=params["data_path"])
# Split the data into train / validation / test parts and wrap each in a DataLoader
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)
# Build the model and move it to the selected device
model = EnhancedCNNBinaryClassifier(
vocab_size=vocab_size,
embedding_dim=params["embedding_dim"],
filter_sizes=params["filter_sizes"],
num_filters=params["num_filters"],
embedding_matrix=embedding_matrix,
dropout=params["dropout"]
)
model = model.to(device)
# Use BalancedCELoss as the training criterion; Adam with weight decay as the optimizer
criterion = BalancedCELoss(alpha=params["alpha"])
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
# Training: one pass over train_loader followed by one pass over val_loader per epoch.
# history stores the per-epoch mean batch loss and the accuracy for both splits.
history = {
    "train_loss": [],
    "val_loss": [],
    "train_acc": [],
    "val_acc": [],
}
for epoch in range(params["epochs"]):
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{params['epochs']}") as pbar:
        for X_batch, y_batch in pbar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)
            pbar.set_postfix({"Train Loss": loss.item()})
    train_acc = correct / total
    avg_train_loss = train_loss / len(train_loader)
    history["train_loss"].append(avg_train_loss)
    history["train_acc"].append(train_acc)

    # Validation: same metrics, no gradient tracking and no parameter updates.
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)
    val_acc = correct / total
    avg_val_loss = val_loss / len(val_loader)
    history["val_loss"].append(avg_val_loss)
    history["val_acc"].append(val_acc)

    # Report the mean batch loss (the value stored in history), not the raw
    # sum, so the numbers are comparable between splits of different size.
    print(f"\nEpoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")
# Evaluate on the test split, report accuracy and visualise the predictions
model.eval()
test_correct, test_total = 0, 0
true_labels, predicted_labels = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        predicted = outputs.argmax(dim=1)
        # Collect labels on the CPU as NumPy scalars for the plotting helpers.
        true_labels += list(y_batch.cpu().numpy())
        predicted_labels += list(predicted.cpu().numpy())
        test_correct += (predicted == y_batch).sum().item()
        test_total += y_batch.size(0)
test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy:.4f}")
# Scatter plot of correct vs. incorrect predictions
visualize_predictions(true_labels, predicted_labels)
# Bar plot of the class distribution
visualize_distribution(true_labels, predicted_labels)

View File

@ -302,47 +302,6 @@ test_mae = mean_absolute_error(test_labels, test_preds)
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
# Helper that visualises correct and incorrect predictions
def visualize_predictions(true_values, predicted_values):
    """Scatter predicted against true values, coloured by closeness.

    A prediction counts as correct when ``np.isclose`` with ``atol=0.3``
    accepts it; correct points are green, the others red. A dashed diagonal
    spanning the range of ``true_values`` marks the ideal prediction.
    """
    plt.figure(figsize=(10, 6))
    within_tolerance = np.isclose(true_values, predicted_values, atol=0.3)
    outside_tolerance = ~within_tolerance

    plt.scatter(true_values[within_tolerance], predicted_values[within_tolerance], color='green', label='Richtig vorhergesagt')
    plt.scatter(true_values[outside_tolerance], predicted_values[outside_tolerance], color='red', label='Falsch vorhergesagt')

    # Ideal line: y == x across the observed range of true values.
    diagonal = [min(true_values), max(true_values)]
    plt.plot(diagonal, diagonal, color='blue', linestyle='--', label='Ideal-Linie')

    plt.xlabel('Wahre Werte')
    plt.ylabel('Vorhergesagte Werte')
    plt.title('Richtige vs Falsche Vorhersagen')
    plt.legend()
    plt.grid(True)
    plt.show()
# Test Evaluation
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device).float()
predictions = model(X_batch).float()
test_preds.extend(predictions.cpu().detach().numpy())
test_labels.extend(y_batch.cpu().detach().numpy())
# Konvertierung zu NumPy-Arrays
true_values = np.array(test_labels)
predicted_values = np.array(test_preds)
# Visualisierung der Ergebnisse
visualize_predictions(true_values, predicted_values)
# RMSE, MAE und R²-Score für das Test-Set
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_mae = mean_absolute_error(test_labels, test_preds)
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
# plot distribution of predicted values and true values

View File

@ -61,8 +61,10 @@ class CustomBert(nn.Module):
# self.sm = nn.Softmax(dim=1)
def forward(self, input_ids, attention_mask):
seq_out = self.bfsc(input_ids, attention_mask = attention_mask)
return self.classifier(self.dropout(seq_out[0]))
x = self.bfsc(input_ids, attention_mask = attention_mask)
x = self.dropout(x[0])
x = self.classifier(x)
return x
def freeze_bert_params(self):
@ -73,21 +75,22 @@ class CustomBert(nn.Module):
for param in self.bfsc.named_parameters():
param[1].requires_grad_(True)
def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim.AdamW,train_loader:DataLoader,freeze_bert:bool):
def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim.AdamW,train_loader:DataLoader,freeze_bert:bool=False):
model.train()
if freeze_bert:
model.freeze_bert_params()
total_loss = 0
len_train_loader = len(train_loader)
for index,train_batch in enumerate(train_loader):
for train_batch in train_loader:
# Set Gradient to Zero
optimizer.zero_grad()
# Unpack batch values and "push" it to GPU
input_ids, att_mask, labels = train_batch.values()
# print(f"{input_ids.shape}, {att_mask.shape}, {labels.shape}")
# print(f"Iteration {index} of {len_train_loader}")
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE),labels.to(DEVICE)
# Feed Model with Data
outputs = model(input_ids, attention_mask=att_mask)
# print(f"{model.bfsc.}")
@ -96,6 +99,7 @@ def training_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,optimizer:optim
loss.backward()
optimizer.step()
total_loss+=loss.item()
print(f"Training Loss is {(total_loss/len(train_loader)):.4f}")
return (total_loss/len(train_loader))
@ -103,109 +107,47 @@ def eval_loop(model:CustomBert,criterion:nn.CrossEntropyLoss,validation_loader:D
model.eval()
total, correct = 0.0, 0.0
total_loss = 0.0
best_loss = 10.0
best_loss = float("Inf")
with torch.no_grad():
for val_batch in validation_loader:
input_ids, att_mask ,labels = val_batch.values()
input_ids, att_mask, labels = input_ids.to(DEVICE),att_mask.to(DEVICE), labels.to(DEVICE)
outputs = model(input_ids,attention_mask=att_mask)
loss = criterion(outputs,labels)
total_loss += loss.item()
predictions = torch.argmax(outputs,1)
total += labels.size(0)
correct += (predictions == labels).sum().item()
if total_loss/len(validation_loader) < best_loss:
best_loss = total_loss/len(validation_loader)
torch.save(model,"best_bert_model")
print(f"Validation Loss: {total_loss/len(validation_loader):.4f} ### Test Accuracy {correct/total*100:.4f}%")
torch.save(model,"best_bert_model.pt")
print(f"Validation Loss: {total_loss/len(validation_loader):.4f} ### Validation Accuracy {correct/total*100:.4f}%")
return total_loss/len(validation_loader)
def test_loop(model:CustomBert, criterion:nn.CrossEntropyLoss, test_loader:DataLoader):
def test_loop(model:CustomBert, test_loader:DataLoader):
    """Print the accuracy of the saved best model for every batch of ``test_loader``.

    Args:
        model: Kept for interface compatibility. It is replaced by the
            checkpoint loaded from ``best_bert_model.pt``, the file written
            by ``eval_loop``.
        test_loader: DataLoader whose batches are mappings yielding
            input ids, attention mask and labels, in that order.
    """
    # Load the checkpoint once instead of once per batch.
    # NOTE(review): torch.load unpickles a whole module; only load files this
    # project produced itself.
    model = torch.load("best_bert_model.pt")
    model.to(DEVICE)
    model.eval()  # disable dropout for evaluation
    with torch.no_grad():
        for batch in test_loader:
            input_ids, att_mask, labels = batch.values()
            input_ids, att_mask = input_ids.to(DEVICE), att_mask.to(DEVICE)
            output = model(input_ids, att_mask)
            # Convert to NumPy on the CPU; the conversion results must be
            # kept, Tensor.numpy() does not work in place.
            pred_flat = torch.argmax(output, 1).flatten().cpu().numpy()
            true_flat = labels.detach().cpu().numpy()
            print(accuracy_score(true_flat, pred_flat))
def performance_metrics(true_labels,predictions):
confusion_matrix(true_labels,predictions)
accuracy_score(true_labels,predictions)
f1_score(true_labels,predictions)
pass
if __name__ == "__main__":
# HYPERPARAMETERS
# Set Max Epoch Amount
EPOCH = 10
# DROPOUT-PROBABILITY
DROPOUT = 0.1
# BATCHSIZE
BATCH_SIZE = 16
#LEARNING RATE
LEARNING_RATE = 1e-5
# RANDOM SEED
RNDM_SEED = 501
torch.manual_seed(RNDM_SEED)
np.random.seed(RNDM_SEED)
torch.cuda.seed_all(RNDM_SEED)
# Initialize Bert Model with dropout probability and Num End Layers
mybert = CustomBert(DROPOUT)
print("Bert Initialized")
mybert.to(DEVICE)
# Read Raw Data from csv and save as DataFrame
df = pd.read_csv("./data/hack.csv",encoding="latin1")
print("Raw Data read")
# Initialize BertTokenizer from Pretrained
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
print("Tokenizer Initialized")
#Split DataFrame into Train and Test Sets
train,test = train_test_split(df,random_state=501,test_size=.2)
print("Splitted Data in Train and Test Sets")
test,val = train_test_split(test,random_state=501,test_size=.5)
# val = []
# Create Custom Datasets for Train and Test
train_data = SimpleHumorDataset(tokenizer,train)
val_data = SimpleHumorDataset(tokenizer,val)
test_data = SimpleHumorDataset(tokenizer,test)
print("Custom Datasets created")
# Initialize Dataloader with Train and Test Sets
train_loader = DataLoader(dataset=train_data,batch_size=BATCH_SIZE,shuffle=True)
validation_loader = DataLoader(dataset=val_data,batch_size=BATCH_SIZE,shuffle=True)
test_loader = DataLoader(dataset=test_data,batch_size=BATCH_SIZE,shuffle=False)
print("DataLoaders created")
# Set criterion to Cross Entropy and define Adam Optimizer with model parameters and learning rate
criterion_cross_entropy = nn.CrossEntropyLoss()
optimizer_adamW = optim.Adam(mybert.parameters(), lr = LEARNING_RATE)
import time
# Set Scheduler for dynamically Learning Rate adjustment
loss_values = np.zeros(EPOCH)
eval_values = np.zeros(EPOCH)
freeze = False
for epoch in range(EPOCH):
start = time.time()
print(f"For {epoch+1} the Scores are: ")
loss_values[epoch] = training_loop(mybert,optimizer=optimizer_adamW,criterion=criterion_cross_entropy,train_loader=train_loader,freeze_bert=freeze)
eval_values[epoch] = eval_loop(mybert,criterion=criterion_cross_entropy,validation_loader=test_loader)
end = time.time()
print((end-start),"seconds per epoch needed")
# Visualize Training Loss
def plot_metrics_loss_n_acc(train_loss,validation_loss,train_acc,validation_acc):
"""
Method that plots Loss and Accuracy of Training and Validation Data used in given modelinstance
"""
# Visualize Training Loss
# plt.plot(loss_values)
# plt.plot(eval_values)
# plt.hlines(np.mean(loss_values),xmin=0,xmax=EPOCH,colors='red',linestyles="dotted",label="Average Loss")
@ -214,5 +156,111 @@ if __name__ == "__main__":
# plt.xlabel("Num Epochs")
# plt.ylabel("Total Loss of Epoch")
# plt.show()
for epoch in range(EPOCH):
test_loop(mybert,criterion_cross_entropy,validation_loader)
pass
def plot_test_metrics(accuracy):
    """
    Plot the test accuracy per epoch together with a dotted line at its mean.

    Args:
        accuracy: Sequence of accuracy values, one entry per epoch.
    """
    mean_accuracy = np.mean(accuracy)
    plt.plot(accuracy)
    # The legend entry needs the formatted mean; the previous "%d" placeholder
    # combined with str.format was never substituted.
    plt.hlines(mean_accuracy, 0, len(accuracy), 'red', 'dotted', f'Mean Accuracy {mean_accuracy:.4f}')
    plt.title("Accuracy of Test")
    plt.xlabel("Num Epochs")
    plt.ylabel("Accurcy 0.0 - 1.0")
    plt.grid(True)
    plt.legend()
    plt.show()
# def performance_metrics(true_labels,predictions):
# confusion_matrix(true_labels,predictions)
# accuracy_score(true_labels,predictions)
# f1_score(true_labels,predictions)
# pass
def create_datasets(tokenizer:AutoTokenizer,dataframe:pd.DataFrame,train_split_ratio:float,val:bool=False)->tuple[SimpleHumorDataset,SimpleHumorDataset,SimpleHumorDataset]|tuple[SimpleHumorDataset,SimpleHumorDataset]:
    """Split ``dataframe`` and wrap every part in a ``SimpleHumorDataset``.

    Args:
        tokenizer: Tokenizer handed to every ``SimpleHumorDataset``.
        dataframe: Complete data to split.
        train_split_ratio: Fraction passed as ``train_size`` to the first split.
        val: When true, the non-training remainder is halved into a test and
            a validation part.

    Returns:
        ``(train, test)`` or, with ``val=True``, ``(train, test, validation)``.

    Raises:
        AssertionError: If ``train_split_ratio`` is greater than 1.0.
    """
    if train_split_ratio > 1.0:
        raise AssertionError("Trainsplit sollte kleiner(-gleich) 1.0 sein")

    train_frame, test_frame = train_test_split(dataframe,train_size=train_split_ratio,random_state=501)
    frames = [train_frame, test_frame]
    if val:
        # Halve the held-out part: first half stays test, second becomes validation.
        test_frame, validation_frame = train_test_split(test_frame,train_size=.5,random_state=501)
        frames = [train_frame, test_frame, validation_frame]
    return tuple(SimpleHumorDataset(tokenizer,frame) for frame in frames)
def create_dataloaders(datasets:tuple|list,batchsize:int,shufflelist:list):
train_loader = DataLoader(datasets[0],batchsize,shuffle=shufflelist[0])
test_loader = DataLoader(datasets[1],batchsize,shuffle=shufflelist[1])
if len(datasets) == 3:
return train_loader, test_loader, DataLoader(datasets[2],batchsize,shuffle=shufflelist[2])
return train_loader, test_loader
# if __name__ == "__main__":
# # HYPERPARAMETERS
# # Set Max Epoch Amount
# EPOCH = 10
# # DROPOUT-PROBABILITY
# DROPOUT = 0.1
# # BATCHSIZE
# BATCH_SIZE = 16
# #LEARNING RATE
# LEARNING_RATE = 1e-5
# # RANDOM SEED
# RNDM_SEED = 501
# # FREEZE Bert Layers
# FREEZE = True
# torch.manual_seed(RNDM_SEED)
# np.random.seed(RNDM_SEED)
# torch.cuda.manual_seed_all(RNDM_SEED)
# Initialize Bert Model with dropout probability and port to DEVICE
# mybert = CustomBert(DROPOUT)
# print("Bert Initialized")
# mybert.to(DEVICE)
# Read Raw Data from csv and save as DataFrame
# df = pd.read_csv("./data/hack.csv",encoding="latin1")
# print("Raw Data read")
# Initialize BertTokenizer from Pretrained
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",do_lower_case=True)
# print("Tokenizer Initialized")
# Split DataFrame into Train and Test Sets
# Create Custom Datasets for Train and Test
# train_data,test_data,validation_data = create_datasets(tokenizer,df,.7,True)
# print("Splitted Data in Train and Test Sets")
# print("Custom Datasets created")
# Initialize Dataloader with Train and Test Sets
# train_loader, test_loader, validation_loader = create_dataloaders([train_data,test_data,validation_data],batchsize=BATCH_SIZE,shufflelist=[True,True,False])
# print("DataLoaders created")
# Set criterion to Cross Entropy and define Adam Optimizer with model parameters and learning rate
# criterion_cross_entropy = nn.CrossEntropyLoss()
# optimizer_adamW = optim.Adam(mybert.parameters(), lr = LEARNING_RATE)
# import time
# Set Scheduler for dynamically Learning Rate adjustment
# Disabled together with the rest of the commented-out main block: EPOCH is
# only defined inside that block, so leaving this statement live raises a
# NameError on import.
# loss_values, eval_values = np.zeros(EPOCH), np.zeros(EPOCH)
# for epoch in range(EPOCH):
# start = time.time()
# print(f"For {epoch+1} the Scores are: ")
# loss_values[epoch] = training_loop(mybert,optimizer=optimizer_adamW,criterion=criterion_cross_entropy,train_loader=train_loader,freeze_bert=FREEZE)
# eval_values[epoch] = eval_loop(mybert,criterion=criterion_cross_entropy,validation_loader=test_loader)
# end = time.time()
# print((end-start),"seconds per epoch needed")
# plot_metrics_loss_n_acc("x","x","x","x")
# for epoch in range(EPOCH):
# test_loop(mybert,validation_loader)

203
cnn.py
View File

@ -1,203 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import gensim
import nltk
import time
import matplotlib.pyplot as plt
# NLTK Downloads
nltk.download('punkt') # Entferne punkt_tab, da es nicht existiert
# Check if GPU is available
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)
# Maximum sequence length
MAX_LEN = 100
# Data helpers
def get_embedding(model, word):
    """Return the vocabulary index of ``word``, or ``unk_index`` if it is unknown.

    ``model`` must expose ``wv`` supporting ``in`` and a ``key_to_index``
    mapping. ``unk_index`` is read from module scope.
    """
    vectors = model.wv
    return vectors.key_to_index[word] if word in vectors else unk_index
def encode_tokens(tokens):
    """Map every token to its index via ``get_embedding`` and the module-level ``model_embedding``."""
    encoded = []
    for token in tokens:
        encoded.append(get_embedding(model_embedding, token))
    return encoded
def pad_sequences(sequences, MAX_LEN):
    """Bring every sequence to exactly ``MAX_LEN`` entries and stack them.

    Shorter sequences are right-padded with the module-level ``unk_index``;
    longer ones are cut off after ``MAX_LEN`` entries.

    Returns:
        ``np.ndarray`` built from the fixed-length sequences.
    """
    fixed_length = []
    for seq in sequences:
        missing = MAX_LEN - len(seq)
        if missing > 0:
            fixed_length.append(
                np.pad(seq, (0, missing), mode='constant', constant_values=unk_index)
            )
        else:
            fixed_length.append(seq[:MAX_LEN])
    return np.array(fixed_length)
# Dataset class
class HumorDataset(Dataset):
    """Dataset pairing encoded token sequences with their labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor, 'labels': FloatTensor}`` for sample ``idx``."""
        return {
            'input_ids': torch.tensor(self.encodings[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }
# CNN Model
class CNNBinaryClassifier(nn.Module):
    """1-D text CNN with one convolution per kernel size and a sigmoid output.

    ``forward`` returns one probability per sample, shape (batch, 1).
    """

    def __init__(self, vocab_size, embed_dim, num_filters, kernel_sizes, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        branches = []
        for width in kernel_sizes:
            branches.append(nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=width))
        self.conv_layers = nn.ModuleList(branches)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), hidden_dim)
        self.out = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        # Conv1d expects channels first: (batch, embed_dim, seq_len).
        channels_first = self.embedding(input_ids).transpose(1, 2)
        pooled = []
        for conv in self.conv_layers:
            activated = self.relu(conv(channels_first))
            # Max over the sequence axis keeps one value per filter.
            pooled.append(activated.max(dim=2).values)
        features = torch.cat(pooled, dim=1)
        hidden = self.relu(self.fc(self.dropout(features)))
        return self.sigmoid(self.out(hidden))
# Main script
if __name__ == "__main__":
# Load and process data
df = pd.read_csv('/content/hack.csv')
print(f"Loaded dataset: {df.shape}")
X = df['text'].fillna("unknown").astype(str)
y = df['is_humor']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenization with error handling
train_tokens = []
test_tokens = []
for text in X_train:
try:
train_tokens.append(word_tokenize(text.lower()))
except Exception as e:
print(f"Error tokenizing: {text}. Error: {e}")
train_tokens.append(["unknown"])
for text in X_test:
try:
test_tokens.append(word_tokenize(text.lower()))
except Exception as e:
print(f"Error tokenizing: {text}. Error: {e}")
test_tokens.append(["unknown"])
print("Sample tokenization (Train):", train_tokens[:2])
print("Sample tokenization (Test):", test_tokens[:2])
# Train Word2Vec model
model_embedding = gensim.models.Word2Vec(train_tokens, vector_size=100, window=5, min_count=1, workers=4)
# Add unknown token
model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
unk_index = model_embedding.wv.key_to_index['<UNK>']
# Encode tokens
train_encodings = [encode_tokens(tokens) for tokens in train_tokens]
test_encodings = [encode_tokens(tokens) for tokens in test_tokens]
# Pad sequences with validation
train_encodings = pad_sequences(train_encodings, MAX_LEN)
test_encodings = pad_sequences(test_encodings, MAX_LEN)
if len(train_encodings) == 0 or len(test_encodings) == 0:
raise ValueError("Tokenization or padding failed. Please check your input data.")
# Create datasets
train_dataset = HumorDataset(train_encodings, y_train.reset_index(drop=True))
test_dataset = HumorDataset(test_encodings, y_test.reset_index(drop=True))
# Model parameters
vocab_size = len(model_embedding.wv.key_to_index)
embed_dim = model_embedding.vector_size
num_filters = 200
kernel_sizes = [3, 4, 5]
hidden_dim = 128
dropout = 0.5
model = CNNBinaryClassifier(vocab_size, embed_dim, num_filters, kernel_sizes, hidden_dim, dropout)
# Training parameters
epochs = 10
batch_size = 8
learning_rate = 2e-5
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()
# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Move model to device
model.to(DEVICE)
print("Starting training...")
train_losses = []
# Training loop
for epoch in range(epochs):
epoch_loss = 0
model.train()
for batch in train_loader:
optimizer.zero_grad()
input_ids = batch['input_ids'].to(DEVICE)
labels = batch['labels'].unsqueeze(1).to(DEVICE)
outputs = model(input_ids)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
train_loss = epoch_loss / len(train_loader)
train_losses.append(train_loss)
print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")
# Visualize training loss
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs + 1), train_losses, marker='o', linestyle='-', label='Train Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.grid(True)
plt.show()
print("Starting evaluation...")
# Evaluation
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(DEVICE)
labels = batch['labels'].unsqueeze(1).to(DEVICE)
outputs = model(input_ids)
preds = (outputs > 0.5).float()
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Final Accuracy: {accuracy:.4f}")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,186 +0,0 @@
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tqdm import tqdm
from dataset_generator import create_embedding_matrix
from EarlyStopping import EarlyStopping
# 1. Gerät automatisch erkennen (MPS, CUDA oder CPU)
device = torch.device('mps' if torch.backends.mps.is_available()
else 'cuda' if torch.cuda.is_available()
else 'cpu')
print(f"Using device: {device}")
# 2. Daten laden
data = pd.read_csv('data/hack.csv')
# 3. Filtern humorvoller Texte
humor_data = data[data['is_humor'] == 1].dropna(subset=['humor_rating']).copy()
# 4. Einbettungsmatrix erstellen
embedding_matrix, word_index, vocab_size, d_model = create_embedding_matrix(
gloVe_path='data/glove.6B.100d.txt', emb_len=100
)
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
# 5. Tokenisierung und Padding mit PyTorch
def tokenize_and_pad(texts, word_index, max_len=50):
    """Convert whitespace-split texts into a fixed-width LongTensor of token ids.

    Words missing from ``word_index`` map to 0; rows are right-padded with 0
    or cut off so that each has ``max_len`` entries.
    """
    rows = []
    for text in texts:
        ids = [word_index.get(word, 0) for word in text.split()][:max_len]
        # Fill up with the padding id; a no-op when the row is already full.
        ids.extend([0] * (max_len - len(ids)))
        rows.append(ids)
    return torch.tensor(rows, dtype=torch.long)
# Training und Testdaten splitten
train_texts, test_texts, train_labels, test_labels = train_test_split(
humor_data['text'], humor_data['humor_rating'], test_size=0.2, random_state=42
)
# Tokenisierung und Padding
max_len = 50
train_input_ids = tokenize_and_pad(train_texts, word_index, max_len=max_len)
test_input_ids = tokenize_and_pad(test_texts, word_index, max_len=max_len)
# Labels in Tensor konvertieren
train_labels = torch.tensor(train_labels.values, dtype=torch.float)
test_labels = torch.tensor(test_labels.values, dtype=torch.float)
# 6. Dataset-Klasse für PyTorch
class HumorDataset(Dataset):
    """Dataset yielding ``(input_ids, label)`` pairs by index."""

    def __init__(self, input_ids, labels):
        self.input_ids, self.labels = input_ids, labels

    def __getitem__(self, idx):
        sample = self.input_ids[idx]
        target = self.labels[idx]
        return sample, target

    def __len__(self):
        return len(self.input_ids)
# Dataset und DataLoader erstellen
train_dataset = HumorDataset(train_input_ids, train_labels)
test_dataset = HumorDataset(test_input_ids, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# 7. CNN-Regression-Modell definieren
class CNNRegressor(nn.Module):
    """Two-layer 1-D CNN on frozen pre-trained embeddings, output scaled into [0, 5]."""

    def __init__(self, vocab_size, embed_dim, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Copy the pre-trained vectors into the layer and keep them fixed.
        pretrained = embedding_matrix.clone().detach()
        self.embedding.weight.data.copy_(pretrained)
        self.embedding.weight.requires_grad_(False)
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3)
        self.conv2 = nn.Conv1d(128, 64, kernel_size=3)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        # Conv1d expects (batch, embed_dim, seq_len).
        features = self.embedding(x).transpose(1, 2)
        features = torch.relu(self.conv1(features))
        features = torch.relu(self.conv2(features))
        features = self.dropout(features)
        # Max over the sequence axis: one value per conv2 channel.
        pooled = features.max(dim=2).values
        score = self.fc(pooled)
        return torch.sigmoid(score) * 5  # value range [0, 5]
# Initialisiere das Modell
model = CNNRegressor(vocab_size, d_model, embedding_matrix).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Early Stopping
#early_stopping = EarlyStopping(patience=5)
# 8. Training mit Validierung
for epoch in range(20): # Maximal 20 Epochen
model.train()
train_loss = 0
for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs).squeeze()
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_loss /= len(train_loader)
# Validierungsverlust berechnen
model.eval()
val_loss = 0
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs).squeeze()
loss = criterion(outputs, labels)
val_loss += loss.item()
val_loss /= len(test_loader)
print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
# Early Stopping
'''early_stopping(val_loss, model)
if early_stopping.early_stop:
print("Early stopping triggered")
break'''
# 9. Modell evaluieren
def evaluate_model(model, data_loader):
    """Collect model predictions and matching labels over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Inputs are moved to the device of the model's first parameter, so the
    function does not rely on a module-level ``device``; a model without
    parameters receives the inputs unchanged.

    Args:
        model: ``nn.Module`` producing one value per sample.
        data_loader: Iterable of ``(inputs, labels)`` tensor pairs.

    Returns:
        ``(predictions, actuals)``: two flat lists of NumPy scalars, one
        entry per sample, in iteration order.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in data_loader:
            if target_device is not None:
                inputs = inputs.to(target_device)
            # reshape(-1) instead of squeeze(): a batch of size 1 must stay
            # one-dimensional, otherwise list.extend gets a 0-d array.
            outputs = model(inputs).reshape(-1)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.reshape(-1).cpu().numpy())
    return predictions, actuals
predictions, actuals = evaluate_model(model, test_loader)
# Metriken berechnen
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
r2 = r2_score(actuals, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
# 10. Visualisierung (Korrekte und falsche Vorhersagen farblich darstellen)
tolerance = 0.5 # Toleranz für korrekte Vorhersagen
predictions = np.array(predictions)
actuals = np.array(actuals)
# Klassifikation: Grün (korrekt), Rot (falsch)
correct = np.abs(predictions - actuals) <= tolerance
colors = np.where(correct, 'green', 'red')
# Scatter-Plot
plt.figure(figsize=(8, 6))
plt.scatter(actuals, predictions, c=colors, alpha=0.6, edgecolor='k', s=50)
plt.plot([0, 5], [0, 5], color='red', linestyle='--') # Perfekte Vorhersage-Linie
# Legende
green_patch = mpatches.Patch(color='green', label='Correct Predictions')
red_patch = mpatches.Patch(color='red', label='Incorrect Predictions')
plt.legend(handles=[green_patch, red_patch])
# Achsen und Titel
plt.xlabel("True Humor Ratings")
plt.ylabel("Predicted Humor Ratings")
plt.title("True vs Predicted Humor Ratings (Correct vs Incorrect)")
plt.show()

View File

@ -0,0 +1,310 @@
import time
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_recall_curve, f1_score, confusion_matrix, r2_score
from sklearn.model_selection import KFold
# local imports
import ml_evaluation as ml_eval
import ml_helper
import ml_history
import dataset_generator as data_gen
# class imports
import HumorDataset as humor_ds
import EarlyStopping
import BalancedCELoss
# Seed both RNGs used in this script.
torch.manual_seed(0)
np.random.seed(0)

best_model_filename = 'best_transformer_reg_model.pt'

device = ml_helper.get_device(verbose=True)

embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()
# The matrix itself is the single source of truth for its dimensions; the
# earlier len()-based reassignments produced the same values and were dropped.
vocab_size, d_model = embedding_matrix.size()
print(f"vocab_size: {vocab_size}, d_model: {d_model}")
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position signal to an embedded sequence.

    The table is precomputed once and stored as the non-trainable buffer
    ``pe`` with shape ``(1, vocab_size, d_model)``.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        vocab_size: Number of positions to precompute. Despite its name this
            bounds the sequence length that ``forward`` can handle.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one cosine column fewer than sine columns, so the
        # angle table must be trimmed or the assignment fails on shape.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        Dimension 1 of ``x`` is indexed as the position axis; dropout is
        applied to the sum.
        """
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)
class TransformerBinaryClassifier(nn.Module):
    """Transformer encoder over pretrained embeddings with a one-unit head.

    Token ids are embedded, scaled by ``sqrt(d_model)``, given a positional
    encoding, passed through a stack of encoder layers, mean-pooled over
    dimension 1 and projected to a single raw value per sample.

    Args:
        embeddings: 2-D tensor of shape ``(vocab_size, d_model)`` used to
            initialise the (trainable) embedding layer.
        nhead: Number of attention heads; must divide ``d_model``.
        dim_feedforward: Width of the feed-forward sublayer.
        num_layers: Number of stacked encoder layers.
        positional_dropout: Dropout used by the positional encoder.
        classifier_dropout: Dropout used inside each encoder layer.
        activation: Activation of the feed-forward sublayer.
    """

    def __init__(
        self,
        embeddings,
        nhead=8,
        dim_feedforward=2048,
        num_layers=6,
        positional_dropout=0.1,
        classifier_dropout=0.1,
        activation="relu",
    ):
        super().__init__()
        vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)

        # NOTE(review): the positional table is sized by the vocabulary, not
        # by a maximum sequence length, so it is likely far larger than
        # needed. Left unchanged because the buffer shape is part of the
        # saved state_dict.
        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=positional_dropout,
            vocab_size=vocab_size,
        )

        # forward() works on (batch, seq, d_model) tensors: the positional
        # encoder indexes dim 1 as position and pooling averages dim 1.
        # Without batch_first=True the encoder would attend across the batch
        # dimension, mixing unrelated samples.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=classifier_dropout,
            activation=activation,  # previously accepted but silently ignored
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )

        # Not used in forward(); kept so existing checkpoints still load.
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.classifier = nn.Linear(d_model, 1)
        self.d_model = d_model

    def forward(self, x):
        """Map token ids of shape ``(batch, seq)`` to outputs of shape ``(batch, 1)``."""
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # average over the sequence positions
        x = self.classifier(x)
        return x
def load_preprocess_data(path_data='data/hack.csv'):
    """Read the CSV at ``path_data`` and return its texts and humor ratings.

    Rows without a ``humor_rating`` are dropped; the remaining ratings are
    exposed under the column name ``y``.

    Returns:
        ``(X, y)`` where ``X`` is the ``text`` column and ``y`` the rating
        column of the filtered frame.
    """
    frame = (
        pd.read_csv(path_data)
        .dropna(subset=['humor_rating'])
        .assign(y=lambda rows: rows['humor_rating'])
    )
    return frame['text'], frame['y']
# Load the ratings and split them into train / val / test partitions.
X, y = load_preprocess_data()
ret_dict = data_gen.split_data(X, y)

# Hyperparameters for this run.
params = {
    'equalize_classes_loss_factor': 0.15,
    'batch_size': 32,
    'epochs': 2,
    'lr': 1e-4,
    'clipping_max_norm': 0,
    'early_stopping_patience': 5,
    'lr_scheduler_factor': 0.5,
    'lr_scheduler_patience': 3,
    'nhead': 2,
    'num_layers': 3,
    'hidden_dim': 10,
    'positional_dropout': 0.5,
    'classifier_dropout': 0.5,
    'weight_decay': 1e-2
}

max_len = 280

# One TextDataset per partition, all sharing the vocabulary and max length.
train_dataset, val_dataset, test_dataset = (
    humor_ds.TextDataset(
        ret_dict[split]['X'],
        ret_dict[split]['y'],
        word_index,
        max_len=max_len,
    )
    for split in ('train', 'val', 'test')
)

# Only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=params['batch_size'], shuffle=do_shuffle)
    for dataset, do_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

early_stopping = EarlyStopping.EarlyStopping(
    patience=params['early_stopping_patience'],
    verbose=False,
)
def train_model(model, train_dataset, criterion, optimizer, epochs, batch_size):
    """Train ``model`` and record loss and R² for training and validation.

    Training batches are drawn (shuffled) from ``train_dataset``. Validation
    batches come from the module-level ``val_loader`` and all tensors are
    moved to the module-level ``device``.

    Args:
        model: Torch module called as ``model(inputs)``.
        train_dataset: Dataset yielding ``(inputs, targets)`` pairs.
        criterion: Loss called as ``criterion(outputs, targets.float())``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over the training data.
        batch_size: Batch size of the training loader.

    Returns:
        ``(train_losses, val_losses, train_r2_scores, val_r2_scores)``, four
        lists with one entry per epoch.
    """
    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model.to(device)

    # Per-epoch history, kept for plotting.
    train_losses, val_losses = [], []
    train_r2_scores, val_r2_scores = [], []

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        all_preds, all_targets = [], []
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            # A bare squeeze() collapses a one-sample batch to a 0-d tensor,
            # which then mismatches the 1-D target in the loss and cannot be
            # iterated by list.extend(). Keep the output 1-D.
            outputs = torch.atleast_1d(model(inputs).squeeze())
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            all_preds.extend(outputs.detach().cpu().numpy())
            all_targets.extend(targets.detach().cpu().numpy())

        # Training R² uses the outputs produced while the model was in
        # train mode.
        r2 = r2_score(all_targets, all_preds)
        train_losses.append(total_loss / len(dataloader))
        train_r2_scores.append(r2)

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_targets = [], []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = torch.atleast_1d(model(inputs).squeeze())
                loss = criterion(outputs, targets.float())
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())

        val_r2 = r2_score(val_targets, val_preds)
        val_losses.append(val_loss / len(val_loader))
        val_r2_scores.append(val_r2)

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}, R^2 (Train): {r2:.4f}, Val R^2: {val_r2:.4f}")

    return train_losses, val_losses, train_r2_scores, val_r2_scores
def bootstrap_aggregation(ModelClass, train_dataset, num_models=5, epochs=10, batch_size=32, learning_rate=0.001):
    """Train ``num_models`` models, each on the data minus one contiguous fold.

    NOTE(review): despite the name, no sampling with replacement takes place.
    The dataset is cut into ``num_models`` equal contiguous folds and model
    ``i`` is trained on everything except fold ``i``.

    Args:
        ModelClass: Zero-argument callable returning a fresh model.
        train_dataset: Indexable dataset with a length.
        num_models: Number of ensemble members; must be at least 1.
        epochs: Epochs per model, forwarded to ``train_model``.
        batch_size: Batch size, forwarded to ``train_model``.
        learning_rate: Learning rate of each model's Adam optimizer.

    Returns:
        ``(models, all_train_losses, all_val_losses, all_train_r2_scores,
        all_val_r2_scores)``; every list has one entry per model.

    Raises:
        ValueError: If ``num_models`` is smaller than 1.
    """
    if num_models < 1:
        raise ValueError("num_models must be at least 1")

    models = []
    all_train_losses, all_val_losses = [], []
    all_train_r2_scores, all_val_r2_scores = [], []

    n_samples = len(train_dataset)
    subset_size = n_samples // num_models

    for i in range(num_models):
        print(f"Training Model {i + 1}/{num_models}...")
        if num_models == 1:
            # Holding out the only fold would leave nothing to train on.
            subset_indices = list(range(n_samples))
        else:
            start_idx = i * subset_size
            end_idx = start_idx + subset_size
            subset_indices = list(range(0, start_idx)) + list(range(end_idx, n_samples))
        subset = Subset(train_dataset, subset_indices)

        model = ModelClass()
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_losses, val_losses, train_r2_scores, val_r2_scores = train_model(
            model, subset, criterion, optimizer, epochs, batch_size
        )

        models.append(model)
        all_train_losses.append(train_losses)
        all_val_losses.append(val_losses)
        all_train_r2_scores.append(train_r2_scores)
        all_val_r2_scores.append(val_r2_scores)

    return models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores
# Ensemble prediction
def ensemble_predict(models, test_dataset, batch_size=32):
    """Average the outputs of ``models`` over ``test_dataset``.

    Inputs are moved to the module-level ``device``. All models are switched
    to eval mode first so that dropout does not perturb the predictions.

    Args:
        models: Sequence of torch modules called as ``model(inputs)``.
        test_dataset: Dataset yielding ``(inputs, target)`` pairs; the
            targets are ignored.
        batch_size: Batch size used for inference.

    Returns:
        1-D NumPy array with the mean prediction for every sample, in
        dataset order.
    """
    dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    for model in models:
        model.eval()

    all_predictions = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            # atleast_1d keeps a one-sample batch iterable after squeeze().
            predictions = torch.stack(
                [torch.atleast_1d(model(inputs).squeeze()) for model in models]
            )
            avg_predictions = predictions.mean(dim=0)
            all_predictions.extend(avg_predictions.cpu().numpy())
    return np.array(all_predictions)
# Train the ensemble
num_models = 2
ensemble_models, all_train_losses, all_val_losses, all_train_r2_scores, all_val_r2_scores = bootstrap_aggregation(
    lambda: TransformerBinaryClassifier(
        embeddings=embedding_matrix,
        nhead=params['nhead'],
        num_layers=params['num_layers'],
        dim_feedforward=params['hidden_dim'],
        positional_dropout=params['positional_dropout'],
        classifier_dropout=params['classifier_dropout']
    ).to(device),
    train_dataset,
    num_models=num_models,
    epochs=params['epochs'],
    batch_size=params['batch_size'],
    learning_rate=params['lr']
)

# Ensemble prediction on the test set
ensemble_predictions = ensemble_predict(ensemble_models, test_dataset)

# Plotting
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
epoch_axis = range(1, params['epochs'] + 1)

# Train and validation loss per model
for i in range(num_models):
    ax1.plot(epoch_axis, all_train_losses[i], label=f"Train Model {i+1}")
    ax1.plot(epoch_axis, all_val_losses[i], label=f"Val Model {i+1}", linestyle='dashed')
ax1.set_title('Train and Validation Loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()

# Train and validation R² per model
for i in range(num_models):
    ax2.plot(epoch_axis, all_train_r2_scores[i], label=f"Train Model {i+1}")
    ax2.plot(epoch_axis, all_val_r2_scores[i], label=f"Val Model {i+1}", linestyle='dashed')
ax2.set_title('Train and Validation R²')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('')  # NOTE(review): y-axis label is empty in the original
ax2.legend()

plt.tight_layout()
plt.show()

# Evaluation
# These two metrics are not among the names this file imports from
# sklearn.metrics, so without this import the script stops with a NameError.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# assumes the dataset exposes its targets as a pandas object via `.labels`
# (TODO confirm against HumorDataset.TextDataset)
y_true = test_dataset.labels.to_numpy()
mse = mean_squared_error(y_true, ensemble_predictions)
mae = mean_absolute_error(y_true, ensemble_predictions)
r2 = r2_score(y_true, ensemble_predictions)
print(f"Ensemble MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")