Change label tensor dtype from long to float for regression
parent
b77bdd21b3
commit
276f03d61e
|
|
@ -5,6 +5,40 @@ import torch
|
|||
import numpy as np
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
class TextRegDataset(torch.utils.data.Dataset):
    """Dataset mapping raw text to a float regression target.

    Each item is lower-cased, tokenized with NLTK, mapped to vocabulary
    indices, padded/truncated to ``max_len``, and returned together with
    its label as PyTorch tensors (``long`` ids, ``float`` label).
    """

    def __init__(self, texts, labels, word_index, max_len=50):
        # Remember the labels' original index before re-indexing, so
        # predictions can later be mapped back to the source rows.
        self.original_indices = labels.index.to_list()

        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.word_index = word_index
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = self.texts[idx]
        target = self.labels[idx]

        # Look each token up in the vocabulary; unknown tokens fall
        # back to the '<UNK>' index.
        unk = self.word_index['<UNK>']
        ids = [self.word_index.get(tok, unk)
               for tok in word_tokenize(raw_text.lower())]

        # Right-pad with '<PAD>' (or truncate) to exactly max_len entries.
        pad = self.word_index['<PAD>']
        ids = (ids + [pad] * self.max_len)[:self.max_len]

        return (torch.tensor(ids, dtype=torch.long),
                torch.tensor(target, dtype=torch.float))
|
||||
|
||||
class TextDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, texts, labels, word_index, max_len=50):
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from torch.utils.data import DataLoader
|
|||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||
from tqdm import tqdm # progress-bar library
|
||||
from dataset_generator import create_embedding_matrix, split_data
|
||||
from HumorDataset import TextDataset
|
||||
from HumorDataset import TextRegDataset
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
|
|
@ -20,7 +20,7 @@ params = {
|
|||
"learning_rate": 0.001,
|
||||
"epochs": 25,
|
||||
"glove_path": 'data/glove.6B.100d.txt', # path to the GloVe embeddings file
|
||||
"max_len": 50,
|
||||
"max_len": 280,
|
||||
"test_size": 0.1,
|
||||
"val_size": 0.1,
|
||||
"patience": 5,
|
||||
|
|
@ -171,9 +171,9 @@ visualize_data_distribution(y)
|
|||
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])
|
||||
|
||||
# Dataset und DataLoader
|
||||
train_dataset = TextDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = TextDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = TextDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
train_dataset = TextRegDataset(data_split['train']['X'], data_split['train']['y'], word_index, max_len=params["max_len"])
|
||||
val_dataset = TextRegDataset(data_split['val']['X'], data_split['val']['y'], word_index, max_len=params["max_len"])
|
||||
test_dataset = TextRegDataset(data_split['test']['X'], data_split['test']['y'], word_index, max_len=params["max_len"])
|
||||
|
||||
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
|
||||
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
||||
|
|
@ -187,7 +187,10 @@ model = EnhancedCNNRegressor(
|
|||
num_filters=params["num_filters"],
|
||||
embedding_matrix=embedding_matrix,
|
||||
dropout=params["dropout"]
|
||||
).to(device)
|
||||
)
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
model = model.to(device)
|
||||
|
||||
criterion = nn.MSELoss()
|
||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||
|
|
@ -340,3 +343,15 @@ test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
|
|||
test_mae = mean_absolute_error(test_labels, test_preds)
|
||||
test_r2 = r2_score(test_labels, test_preds)
|
||||
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")
|
||||
|
||||
|
||||
# plot distribution of predicted values and true values
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.hist(test_labels, bins=20, color='skyblue', edgecolor='black', alpha=0.7, label='True Values')
|
||||
plt.hist(test_preds, bins=20, color='salmon', edgecolor='black', alpha=0.7, label='Predicted Values')
|
||||
plt.title('Distribution of Predicted and True Values')
|
||||
plt.xlabel('Score')
|
||||
plt.ylabel('Frequency')
|
||||
plt.legend()
|
||||
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||
plt.show()
|
||||
Loading…
Reference in New Issue