486 lines
101 KiB
Plaintext
486 lines
101 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# CNN Regression"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import time\n",
|
|
"import json\n",
|
|
"import math\n",
|
|
"\n",
|
|
"import torch\n",
|
|
"import torch.nn as nn\n",
|
|
"import torch.nn.functional as F\n",
|
|
"from torch.utils.data import DataLoader\n",
|
|
"from tqdm import tqdm # Fortschrittsbalken\n",
|
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"\n",
|
|
"# local imports\n",
|
|
"import ml_evaluation as ml_eval\n",
|
|
"import ml_helper\n",
|
|
"import ml_history\n",
|
|
"import dataset_generator as data_gen\n",
|
|
"# class imports\n",
|
|
"import HumorDataset as humor_ds\n",
|
|
"import EarlyStopping\n",
|
|
"import BalancedCELoss\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Seed NumPy and PyTorch so repeated runs start from the same random state.\n",
"np.random.seed(0)\n",
"torch.manual_seed(0)\n",
"\n",
"# File name for the best model; NOTE(review): no visible cell reads or writes it yet.\n",
"best_model_filename = 'best_cnn_reg_model.pt'\n",
"\n",
"# Device selection is delegated to the project helper.\n",
"device = ml_helper.get_device(verbose=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Load the embedding matrix and the word -> index lookup from the project helper.\n",
"embedding_matrix, word_index, vocab_size, d_model = data_gen.create_embedding_matrix()\n",
"\n",
"# Take the dimensions from the matrix itself so they always match the tensor in use.\n",
"# (The earlier len()-based assignments were overwritten by this line and have been removed.)\n",
"vocab_size, d_model = embedding_matrix.size()\n",
"print(f\"vocab_size: {vocab_size}, d_model: {d_model}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"class CNN_HumorRegressor(nn.Module):\n",
"    \"\"\"Text CNN with parallel kernel heights that emits one sigmoid score per sample.\n",
"\n",
"    Args:\n",
"        embed_dim: Kernel width of every convolution (the width of one token vector).\n",
"        filter_sizes: Kernel heights, i.e. how many consecutive positions a filter covers.\n",
"        num_filters: Number of output channels of each convolution.\n",
"        dropout: Dropout probability applied after the skip connection.\n",
"\n",
"    NOTE(review): forward() passes its input straight into Conv2d and this class has no\n",
"    embedding lookup, so it appears to expect already-embedded float input of shape\n",
"    [batch, seq_len, embed_dim] - confirm what the data loader actually yields.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, embed_dim, filter_sizes, num_filters, dropout=0.5):\n",
"        super().__init__()\n",
"        pooled_width = len(filter_sizes) * num_filters\n",
"\n",
"        # One convolution per kernel height; each kernel is embed_dim wide.\n",
"        self.convs = nn.ModuleList([\n",
"            nn.Conv2d(1, num_filters, (height, embed_dim))\n",
"            for height in filter_sizes\n",
"        ])\n",
"\n",
"        # Square linear layer used in the skip connection of forward().\n",
"        # Despite the attribute name, no gating is applied to it.\n",
"        self.highway = nn.Linear(pooled_width, pooled_width)\n",
"\n",
"        # Dropout against overfitting.\n",
"        self.dropout = nn.Dropout(dropout)\n",
"\n",
"        # Fully connected regression head.\n",
"        self.fc1 = nn.Linear(pooled_width, 256)\n",
"        self.fc2 = nn.Linear(256, 128)\n",
"        self.fc3 = nn.Linear(128, 1)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return a tensor of shape [batch, 1] with values squashed by a sigmoid.\"\"\"\n",
"        # Insert the channel axis expected by Conv2d.\n",
"        x = x.unsqueeze(1)\n",
"\n",
"        features = []\n",
"        for conv in self.convs:\n",
"            # squeeze(3) drops the width axis, which has size 1 when the input is embed_dim wide.\n",
"            fmap = F.relu(conv(x)).squeeze(3)\n",
"            # Max-pool over the whole remaining axis -> one value per filter.\n",
"            features.append(F.max_pool1d(fmap, fmap.size(2)).squeeze(2))\n",
"\n",
"        # Concatenate the pooled features of all kernel heights.\n",
"        cat = torch.cat(features, dim=1)\n",
"\n",
"        # Linear + ReLU branch added back onto its own input, then dropout.\n",
"        mixed = self.dropout(F.relu(self.highway(cat)) + cat)\n",
"\n",
"        # Regression head; the sigmoid limits the output to [0, 1].\n",
"        hidden = F.relu(self.fc2(F.relu(self.fc1(mixed))))\n",
"        return torch.sigmoid(self.fc3(hidden))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def load_preprocess_data(path_data='data/hack.csv'):\n",
"    \"\"\"Read the CSV and return the texts and ratings of all rated rows.\n",
"\n",
"    Args:\n",
"        path_data: Location passed to ``pd.read_csv``; the data must contain the\n",
"            columns 'text' and 'humor_rating'.\n",
"\n",
"    Returns:\n",
"        Tuple ``(X, y)``: the 'text' column and the 'humor_rating' column (named 'y'),\n",
"        both restricted to rows whose 'humor_rating' is not missing.\n",
"    \"\"\"\n",
"    rated = pd.read_csv(path_data).dropna(subset=['humor_rating'])\n",
"    texts = rated['text']\n",
"    ratings = rated['humor_rating'].rename('y')\n",
"    return texts, ratings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Read texts and ratings, then hand them to the project helper that builds the splits\n",
"# (later cells index the result with 'train', 'val' and 'test').\n",
"X, y = load_preprocess_data()\n",
"ret_dict = data_gen.split_data(X, y)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Hyper-parameters for this run.\n",
"# The trailing comments are carried over unchanged; presumably they give a reference value\n",
"# followed by a suggested range - TODO confirm.\n",
"params = dict(\n",
"    # used for class balancing\n",
"    equalize_classes_loss_factor=0.15,  # 0.15 (0.1 to 0.2)\n",
"\n",
"    # training parameters\n",
"    batch_size=32,  # 32 (16 to 64)\n",
"    epochs=10,  # 100\n",
"    lr=1e-4,  # 1e-5 (1e-6 to 1e-3)\n",
"\n",
"    # CNN parameters\n",
"    filter_sizes=[2, 3, 4],\n",
"    num_filters=150,\n",
"\n",
"    # patience for early stopping\n",
"    early_stopping_patience=5,  # 5 (3 to 10)\n",
"\n",
"    # learning rate scheduler\n",
"    lr_scheduler_factor=0.5,  # 0.1 (0.05 to 0.2)\n",
"    lr_scheduler_patience=3,  # 3 (2 to 5)\n",
"\n",
"    # model parameters\n",
"    # NOTE(review): nhead, num_layers, hidden_dim and positional_dropout are not read by\n",
"    # the model construction below.\n",
"    nhead=2,  # 5\n",
"    num_layers=3,  # 6\n",
"    hidden_dim=10,  # 50\n",
"\n",
"    # regularization parameters\n",
"    positional_dropout=0.5,  # 0.1 (0.1 to 0.5)\n",
"    classifier_dropout=0.5,  # 0.1 (0.1 to 0.5)\n",
"    weight_decay=1e-2,  # 0.0 (1e-6 to 1e-2)\n",
")\n",
"\n",
"# Build the CNN from the parameters above and move it to the selected device.\n",
"model = CNN_HumorRegressor(\n",
"    embed_dim=d_model,\n",
"    filter_sizes=params['filter_sizes'],\n",
"    num_filters=params['num_filters'],\n",
"    dropout=params['classifier_dropout'],\n",
").to(device)\n",
"print('model created')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# NOTE: figures taken from the data exploration notebook: the longest sample has length 280;\n",
"# a length of 139 covers 80% and 192 covers 95% of the data.\n",
"max_len = 280\n",
"\n",
"# One dataset per split, all built with the same vocabulary lookup and length limit.\n",
"train_dataset, val_dataset, test_dataset = (\n",
"    humor_ds.TextDataset(ret_dict[split]['X'], ret_dict[split]['y'], word_index, max_len=max_len)\n",
"    for split in ('train', 'val', 'test')\n",
")\n",
"\n",
"print('datasets length:', len(train_dataset), len(val_dataset))\n",
"# NOTE: overfitting test - uncomment to train on the first 100 samples only\n",
"#train_dataset.labels = train_dataset.labels[:100]\n",
"#train_dataset.texts = train_dataset.texts[:100]\n",
"\n",
"# Only the training loader shuffles; validation and test keep the dataset order.\n",
"train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n",
"val_loader, test_loader = (\n",
"    DataLoader(dataset, batch_size=params['batch_size'], shuffle=False)\n",
"    for dataset in (val_dataset, test_dataset)\n",
")\n",
"\n",
"# NOTE: these numbers are smaller than the dataset sizes because a loader's length counts batches.\n",
"print(f\"train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# TODO: change the loss to RMSE. Sketch kept from the original cell:\n",
"#   criterion = nn.MSELoss()\n",
"#   loss = torch.sqrt(criterion(x, y))\n",
"#   loss.backward()\n",
"#   print(x.grad)\n",
"criterion = nn.MSELoss()\n",
"\n",
"# Adam over the trainable parameters only; weight decay is currently switched off\n",
"# (weight_decay=params['weight_decay'] is not passed).\n",
"optimizer = torch.optim.Adam(\n",
"    filter(lambda p: p.requires_grad, model.parameters()),\n",
"    lr=params['lr'],\n",
")\n",
"\n",
"# Learning-rate scheduler, currently disabled:\n",
"#   scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',\n",
"#                                                          factor=params['lr_scheduler_factor'],\n",
"#                                                          patience=params['lr_scheduler_patience'],\n",
"#                                                          verbose=True)\n",
"\n",
"early_stopping = EarlyStopping.EarlyStopping(\n",
"    patience=params['early_stopping_patience'],\n",
"    verbose=False,\n",
")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Training loop: per epoch, one optimisation pass over train_loader followed by one\n",
"# gradient-free pass over val_loader; the mean batch losses and the wall time are printed.\n",
"# NOTE(review): this loop does not consult `early_stopping` and never saves the model.\n",
"\n",
"for epoch in range(params['epochs']):\n",
"    epoch_start_time = time.time()\n",
"\n",
"    # ---- training ----\n",
"    model.train()\n",
"    train_loss = 0.0\n",
"\n",
"    for input_ids, labels in train_loader:\n",
"        input_ids, labels = input_ids.to(device), labels.to(device).float()\n",
"\n",
"        optimizer.zero_grad()\n",
"        # squeeze(-1) removes only the trailing output axis ([batch, 1] -> [batch]).\n",
"        # A bare squeeze() would also remove the batch axis when the last batch holds a\n",
"        # single sample, so the prediction shape would no longer match the labels\n",
"        # (assumes labels are 1-D per batch - TODO confirm against the dataset).\n",
"        outputs = model(input_ids).squeeze(-1).float()\n",
"        loss = criterion(outputs, labels)\n",
"        loss.backward()\n",
"        # Gradient clipping is disabled; `params` has no 'clipping_max_norm' entry yet.\n",
"        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=params['clipping_max_norm'])\n",
"        optimizer.step()\n",
"\n",
"        train_loss += loss.item()\n",
"\n",
"    # Mean of the per-batch losses.\n",
"    train_loss /= len(train_loader)\n",
"\n",
"    # ---- validation ----\n",
"    model.eval()\n",
"    val_loss = 0.0\n",
"\n",
"    with torch.no_grad():\n",
"        for input_ids, labels in val_loader:\n",
"            input_ids, labels = input_ids.to(device), labels.to(device).float()\n",
"            outputs = model(input_ids).squeeze(-1).float()\n",
"            val_loss += criterion(outputs, labels).item()\n",
"\n",
"    val_loss /= len(val_loader)\n",
"\n",
"    epoch_end_time = time.time()\n",
"\n",
"    print(f'Epoch {epoch+1}/{params[\"epochs\"]}, '\n",
"          f'Train Loss: {train_loss:.4f}, '\n",
"          f'Val Loss: {val_loss:.4f}, '\n",
"          f'Time: {epoch_end_time - epoch_start_time:.2f}s')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluation Metrics on Test Data:\n",
|
|
"Mean Squared Error (MSE): 0.3358\n",
|
|
"Root Mean Squared Error (RMSE): 0.5795\n",
|
|
"Mean Absolute Error (MAE): 0.3900\n",
|
|
"R² Score: -0.3445\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# TODO: Evaluate model (the code below is currently disabled by wrapping it in a string literal)\n",
|
|
"'''\n",
|
|
"def evaluate_metrics(model, test_loader, device):\n",
|
|
" model.eval()\n",
|
|
" predictions = []\n",
|
|
" actuals = []\n",
|
|
" with torch.no_grad():\n",
|
|
" for inputs, labels in test_loader:\n",
|
|
" inputs, labels = inputs.to(device), labels.to(device)\n",
|
|
" outputs = model(inputs)\n",
|
|
" predictions.extend(outputs.cpu().numpy().flatten())\n",
|
|
" actuals.extend(labels.cpu().numpy().flatten())\n",
|
|
"\n",
|
|
" mse = mean_squared_error(actuals, predictions)\n",
|
|
" rmse = np.sqrt(mse)\n",
|
|
" mae = mean_absolute_error(actuals, predictions)\n",
|
|
" r2 = r2_score(actuals, predictions)\n",
|
|
"\n",
|
|
" return mse, rmse, mae, r2, actuals, predictions\n",
|
|
"\n",
|
|
"mse, rmse, mae, r2, actuals, predictions = evaluate_metrics(model, test_loader, device)\n",
|
|
"\n",
|
|
"print(\"Evaluation Metrics on Test Data:\")\n",
|
|
"print(f\"Mean Squared Error (MSE): {mse:.4f}\")\n",
|
|
"print(f\"Root Mean Squared Error (RMSE): {rmse:.4f}\")\n",
|
|
"print(f\"Mean Absolute Error (MAE): {mae:.4f}\")\n",
|
|
"print(f\"R² Score: {r2:.4f}\")\n",
|
|
"\n",
|
|
"'''"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 800x600 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
"# TODO: Plotting (the code below is currently disabled by wrapping it in a string literal)\n",
|
|
"'''\n",
|
|
"# Definiere korrekte und falsche Vorhersagen basierend auf einem Schwellenwert\n",
|
|
"threshold = 0.5\n",
|
|
"predicted_labels = (np.array(predictions) > threshold).astype(int)\n",
|
|
"true_labels = (np.array(actuals) > threshold).astype(int)\n",
|
|
"\n",
|
|
"# Bool-Array für korrekte Vorhersagen\n",
|
|
"correct = predicted_labels == true_labels\n",
|
|
"\n",
|
|
"# Farben zuordnen: Grün für korrekt, Rot für falsch\n",
|
|
"colors = ['green' if is_correct else 'red' for is_correct in correct]\n",
|
|
"\n",
|
|
"# Scatter-Plot\n",
|
|
"plt.figure(figsize=(8, 6))\n",
|
|
"plt.scatter(actuals, predictions, c=colors, alpha=0.6, edgecolor='k')\n",
|
|
"\n",
|
|
"\n",
|
|
"# Legende anpassen\n",
|
|
"import matplotlib.patches as mpatches\n",
|
|
"green_patch = mpatches.Patch(color='green', label='Correct Predictions')\n",
|
|
"red_patch = mpatches.Patch(color='red', label='Incorrect Predictions')\n",
|
|
"plt.legend(handles=[green_patch, red_patch])\n",
|
|
"\n",
|
|
"# Achsen und Titel\n",
|
|
"plt.title('True vs. Predicted Humor Scores')\n",
|
|
"plt.xlabel('True Humor Score')\n",
|
|
"plt.ylabel('Predicted Humor Score')\n",
|
|
"plt.show()\n",
|
|
"'''\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"239\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
"# Disabled plotting code, kept inside a string literal so that it does not execute.\n",
"# The closing delimiter used to be six quotes, which closed this string and opened a\n",
"# second, unterminated one (a SyntaxError); it is now a single closing triple quote.\n",
"'''\n",
"#TODO: Plotting\n",
"import pandas as pd\n",
"\n",
"# Load the data from csv\n",
"df = pd.read_csv('data/hack.csv')\n",
"df_test = df.iloc[test_dataset.original_indices].copy()\n",
"df_test['prediction'] = predicted_labels\n",
"df_test['label'] = true_labels\n",
"df_test['pred_correct'] = (df_test['prediction'] == df_test['label'])\n",
"\n",
"df_test_sorted = df_test.sort_values(by='humor_rating').reset_index(drop=True)\n",
"\n",
"from matplotlib import patches as mpatches\n",
"\n",
"median_rating = df['humor_rating'].median()\n",
"# get first index where humor_rating is greater than median_rating\n",
"median_idx = df_test_sorted[df_test_sorted['humor_rating'] > median_rating].index[0]\n",
"print(median_idx)\n",
"# range idx for len df_test\n",
"range_idx = range(len(df_test))\n",
"colors = df_test_sorted['pred_correct'].map({True: 'g', False: 'r'})\n",
"# bar plot for each df_test humor_rating value \n",
"plt.bar(range_idx, df_test_sorted['humor_rating'], color=colors)\n",
"# vertical line for True/False cut off\n",
"plt.axvline(x=median_idx, color='black', linestyle='--')\n",
"# Create a legend handles\n",
"green_patch = mpatches.Patch(color='g', label='Correct Prediction')\n",
"red_patch = mpatches.Patch(color='r', label='Incorrect Prediction')\n",
"line_patch = mpatches.Patch(color='black', label='humor_rating cut off')\n",
"plt.title('Humor Rating vs Prediction for Test Set')\n",
"plt.xlabel('Index')\n",
"plt.ylabel('Humor Rating')\n",
"plt.legend(handles=[green_patch, red_patch, line_patch])\n",
"plt.show()\n",
"'''"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|