from long to float for regression
parent
b77bdd21b3
commit
276f03d61e
|
|
@ -5,6 +5,40 @@ import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
class TextRegDataset(torch.utils.data.Dataset):
    """Map-style dataset for text regression.

    Each item is a pair ``(input_ids, label)`` where ``input_ids`` is a
    ``torch.long`` tensor of vocabulary indices padded/truncated to
    ``max_len`` and ``label`` is a ``torch.float`` scalar target.
    """

    def __init__(self, texts, labels, word_index, max_len=50):
        # Keep the pre-reset index so predictions can be traced back to
        # the original DataFrame rows.
        self.original_indices = labels.index.to_list()
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.word_index = word_index
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.labels[idx]

        # Tokenize (lowercased) and map every token to its vocabulary id,
        # falling back to the <UNK> id for out-of-vocabulary words.
        unk_id = self.word_index['<UNK>']
        ids = [self.word_index.get(tok, unk_id) for tok in word_tokenize(text.lower())]

        # Pad with <PAD> ids up to max_len, or truncate down to it, so
        # every sample has a fixed length.
        shortfall = self.max_len - len(ids)
        if shortfall > 0:
            ids = ids + [self.word_index['<PAD>']] * shortfall
        else:
            ids = ids[:self.max_len]

        # Long indices for the embedding lookup; float target for regression.
        return torch.tensor(ids, dtype=torch.long), torch.tensor(target, dtype=torch.float)
||||||
class TextDataset(torch.utils.data.Dataset):
|
class TextDataset(torch.utils.data.Dataset):
|
||||||
def __init__(self, texts, labels, word_index, max_len=50):
|
def __init__(self, texts, labels, word_index, max_len=50):
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from torch.utils.data import DataLoader
|
||||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
||||||
from tqdm import tqdm # Fortschrittsbalken-Bibliothek
|
from tqdm import tqdm # Fortschrittsbalken-Bibliothek
|
||||||
from dataset_generator import create_embedding_matrix, split_data
|
from dataset_generator import create_embedding_matrix, split_data
|
||||||
from HumorDataset import TextDataset
|
from HumorDataset import TextRegDataset
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
|
|
@ -20,7 +20,7 @@ params = {
|
||||||
"learning_rate": 0.001,
|
"learning_rate": 0.001,
|
||||||
"epochs": 25,
|
"epochs": 25,
|
||||||
"glove_path": 'data/glove.6B.100d.txt', # Pfad zu GloVe
|
"glove_path": 'data/glove.6B.100d.txt', # Pfad zu GloVe
|
||||||
"max_len": 50,
|
"max_len": 280,
|
||||||
"test_size": 0.1,
|
"test_size": 0.1,
|
||||||
"val_size": 0.1,
|
"val_size": 0.1,
|
||||||
"patience": 5,
|
"patience": 5,
|
||||||
|
|
@ -171,9 +171,9 @@ visualize_data_distribution(y)
|
||||||
# Split the data into train/val/test partitions.
data_split = split_data(X, y, test_size=params["test_size"], val_size=params["val_size"])


# Dataset and DataLoader
def _build_dataset(split):
    # One TextRegDataset per split; all share the vocabulary and max_len.
    return TextRegDataset(
        data_split[split]['X'],
        data_split[split]['y'],
        word_index,
        max_len=params["max_len"],
    )


train_dataset = _build_dataset('train')
val_dataset = _build_dataset('val')
test_dataset = _build_dataset('test')

# Only the training loader shuffles; evaluation order stays deterministic.
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
|
@ -187,7 +187,10 @@ model = EnhancedCNNRegressor(
|
||||||
num_filters=params["num_filters"],
|
num_filters=params["num_filters"],
|
||||||
embedding_matrix=embedding_matrix,
|
embedding_matrix=embedding_matrix,
|
||||||
dropout=params["dropout"]
|
dropout=params["dropout"]
|
||||||
).to(device)
|
)
|
||||||
|
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
model = model.to(device)
|
||||||
|
|
||||||
criterion = nn.MSELoss()
|
criterion = nn.MSELoss()
|
||||||
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
|
||||||
|
|
# Final held-out metrics for the regression model.
test_rmse = np.sqrt(mean_squared_error(test_labels, test_preds))
test_mae = mean_absolute_error(test_labels, test_preds)
test_r2 = r2_score(test_labels, test_preds)
print(f"Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R²: {test_r2:.4f}")


# plot distribution of predicted values and true values
plt.figure(figsize=(10, 6))
for values, color, legend_label in (
    (test_labels, 'skyblue', 'True Values'),
    (test_preds, 'salmon', 'Predicted Values'),
):
    plt.hist(values, bins=20, color=color, edgecolor='black', alpha=0.7, label=legend_label)
plt.title('Distribution of Predicted and True Values')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
|
||||||
Loading…
Reference in New Issue