from long to float for regression

main
Felix Jan Michael Mucha 2025-02-14 19:23:17 +01:00
parent b77bdd21b3
commit 276f03d61e
2 changed files with 55 additions and 6 deletions
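Context for the dtype change: nn.MSELoss expects float targets, but the classification datasets emitted labels as torch.long. A minimal sketch of the failure mode this commit avoids (the tensors below are made-up illustrations, not project data):

import torch
import torch.nn as nn

criterion = nn.MSELoss()
preds = torch.randn(4)                    # model outputs are float
labels_long = torch.tensor([1, 0, 2, 1])  # torch.long, as in the classification setup
# criterion(preds, labels_long)           # RuntimeError: Found dtype Long but expected Float
labels_float = labels_long.float()        # cast to torch.float for regression targets
loss = criterion(preds, labels_float)     # works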

View File: HumorDataset.py

@@ -5,6 +5,40 @@ import torch
 import numpy as np
 from nltk.tokenize import word_tokenize
 
+class TextRegDataset(torch.utils.data.Dataset):
+    def __init__(self, texts, labels, word_index, max_len=50):
+        self.original_indices = labels.index.to_list()
+        self.texts = texts.reset_index(drop=True)
+        self.labels = labels.reset_index(drop=True)
+        self.word_index = word_index
+        self.max_len = max_len
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        texts = self.texts[idx]
+        tokens = word_tokenize(texts.lower())
+        label = self.labels[idx]
+        # Tokenize and convert to indices
+        input_ids = [self.word_index.get(word, self.word_index['<UNK>']) for word in tokens]
+        # Pad or truncate to max_len
+        if len(input_ids) < self.max_len:
+            input_ids += [self.word_index['<PAD>']] * (self.max_len - len(input_ids))
+        else:
+            input_ids = input_ids[:self.max_len]
+        # Convert to PyTorch tensors
+        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        label = torch.tensor(label, dtype=torch.float)
+        return input_ids, label
+
 class TextDataset(torch.utils.data.Dataset):
     def __init__(self, texts, labels, word_index, max_len=50):
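For reference, a minimal usage sketch of the new class (the toy texts, scores, and word_index below are made up; word_tokenize needs NLTK's 'punkt' data downloaded):

import pandas as pd
from torch.utils.data import DataLoader
from HumorDataset import TextRegDataset

texts = pd.Series(["a short joke", "another line"])  # hypothetical inputs
labels = pd.Series([0.8, 0.1])                       # continuous humor scores
word_index = {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'short': 3, 'joke': 4}

dataset = TextRegDataset(texts, labels, word_index, max_len=10)
input_ids, label = dataset[0]
print(input_ids.dtype, label.dtype)  # torch.int64 torch.float32

loader = DataLoader(dataset, batch_size=2, shuffle=True)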

View File

@@ -5,7 +5,7 @@ from torch.utils.data import DataLoader
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
 from tqdm import tqdm  # progress bar library
 from dataset_generator import create_embedding_matrix, split_data
-from HumorDataset import TextDataset
+from HumorDataset import TextRegDataset
 import numpy as np
 import pandas as pd
 import os
@@ -20,7 +20,7 @@ params = {
     "learning_rate": 0.001,
     "epochs": 25,
     "glove_path": 'data/glove.6B.100d.txt',  # path to GloVe
-    "max_len": 50,
+    "max_len": 280,
     "test_size": 0.1,
     "val_size": 0.1,
     "patience": 5,
@@ -171,9 +171,9 @@ visualize_data_distribution(y)
 data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
 
 # Dataset and DataLoader
-train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
-val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
-test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
+train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
+val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
+test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
 
 train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
 val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
@@ -187,7 +187,10 @@ model = EnhancedCNNRegressor(
     num_filters=params["num_filters"],
     embedding_matrix=embedding_matrix,
     dropout=params["dropout"]
-).to(device)
+)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
 
 criterion = nn.MSELoss()
 optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
@@ -340,3 +343,15 @@ test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
 test_mae = mean_absolute_error(test_labels, test_preds)
 test_r2 = r2_score(test_labels, test_preds)
 print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
+
+# plot distribution of predicted values and true values
+plt.figure(figsize=(10, 6))
+plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
+plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
+plt.title('Distribution of Predicted and True Values')
+plt.xlabel('Score')
+plt.ylabel('Frequency')
+plt.legend()
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.show()
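One caveat on the new plot: plt.show() has no effect in a headless run (e.g., on a remote GPU box with a non-interactive matplotlib backend). Saving the figure is a common alternative; the filename below is an assumption, not part of the commit:

plt.savefig('pred_vs_true_distribution.png', dpi=150, bbox_inches='tight')  # assumed filename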