Patch summary (free text ahead of the first "diff --git" header; patch tools
are expected to skip it).

  HumorDataset.py
    Adds TextRegDataset, a torch Dataset whose __getitem__ lower-cases and
    tokenizes one text, maps tokens through word_index, pads or truncates
    the ids to max_len, and returns (input_ids as a long tensor, label as a
    float tensor).

  TEST_CNN_2.py
    Imports and instantiates TextRegDataset in place of TextDataset, changes
    params["max_len"] from 50 to 280, assigns `device` immediately before
    moving the model to it, and appends a histogram comparing test_labels
    with test_preds.

NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Context-line whitespace (for example
the spacing before inline comments) and the placement of blank context lines
should be confirmed against the target tree before applying.
NOTE(review): word_index[''] serves as both the unknown-token fallback and
the padding id in the added class; verify that this key is what the
vocabulary really uses.

diff --git a/HumorDataset.py b/HumorDataset.py
index 05c278b..f802d03 100644
--- a/HumorDataset.py
+++ b/HumorDataset.py
@@ -5,6 +5,40 @@ import torch
 import numpy as np
 from nltk.tokenize import word_tokenize
 
+class TextRegDataset(torch.utils.data.Dataset):
+    def __init__(self, texts, labels, word_index, max_len=50):
+
+        self.original_indices = labels.index.to_list()
+
+        self.texts = texts.reset_index(drop=True)
+        self.labels = labels.reset_index(drop=True)
+        self.word_index = word_index
+        self.max_len = max_len
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        texts = self.texts[idx]
+        tokens = word_tokenize(texts.lower())
+
+        label = self.labels[idx]
+
+        # Tokenize and convert to indices
+        input_ids = [self.word_index.get(word, self.word_index['']) for word in tokens]
+
+        # Pad or truncate to max_len
+        if len(input_ids) < self.max_len:
+            input_ids += [self.word_index['']] * (self.max_len - len(input_ids))
+        else:
+            input_ids = input_ids[:self.max_len]
+
+        # Convert to PyTorch tensors
+        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        label = torch.tensor(label, dtype=torch.float)
+
+        return input_ids, label
+
 class TextDataset(torch.utils.data.Dataset):
     def __init__(self, texts, labels, word_index, max_len=50):
 
diff --git a/TEST_CNN_2.py b/TEST_CNN_2.py
index f0f52d3..ea1a7fb 100644
--- a/TEST_CNN_2.py
+++ b/TEST_CNN_2.py
@@ -5,7 +5,7 @@ from torch.utils.data import DataLoader
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
 from tqdm import tqdm # Fortschrittsbalken-Bibliothek
 from dataset_generator import create_embedding_matrix, split_data
-from HumorDataset import TextDataset
+from HumorDataset import TextRegDataset
 import numpy as np
 import pandas as pd
 import os
@@ -20,7 +20,7 @@ params = {
     "learning_rate": 0.001,
     "epochs": 25,
     "glove_path": 'data/glove.6B.100d.txt', # Pfad zu GloVe
-    "max_len": 50,
+    "max_len": 280,
     "test_size": 0.1,
     "val_size": 0.1,
     "patience": 5,
@@ -171,9 +171,9 @@ visualize_data_distribution(y)
 data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
 
 # Dataset und DataLoader
-train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
-val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
-test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
+train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
+val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
+test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
 
 train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
 val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
@@ -187,7 +187,10 @@ model = EnhancedCNNRegressor(
     num_filters=params["num_filters"],
     embedding_matrix=embedding_matrix,
     dropout=params["dropout"]
-).to(device)
+)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
 
 criterion = nn.MSELoss()
 optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
@@ -340,3 +343,15 @@ test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
 test_mae = mean_absolute_error(test_labels, test_preds)
 test_r2 = r2_score(test_labels, test_preds)
 print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
+
+
+# plot distribution of predicted values and true values
+plt.figure(figsize=(10, 6))
+plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
+plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
+plt.title('Distribution of Predicted and True Values')
+plt.xlabel('Score')
+plt.ylabel('Frequency')
+plt.legend()
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.show()
\ No newline at end of file