# ANLP_WS24_CA2/ml_plots.py


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import scipy.stats as stats
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LinearRegression
import os
import time


def save_plot(plt, plot_name):
    if not os.path.exists('plots'):
        os.makedirs('plots')
    # create timestamp so repeated runs do not overwrite earlier plots
    time_stamp = time.strftime('%Y%m%d-%H%M%S')
    plt.savefig(f'plots/{plot_name}_{time_stamp}.png')
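
# Example of the resulting file path (hypothetical timestamp, format taken from the
# time.strftime call above):
#   plots/Training History_20240115-093045.png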


def plot_training_history(hist_data, colors, title='Training History', save=True):
    epochs = range(1, len(hist_data['train_loss']) + 1)
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    # Plot loss
    axs[0].plot(epochs, hist_data['train_loss'], label='Train Loss', color=colors['blue'])
    axs[0].plot(epochs, hist_data['val_loss'], label='Validation Loss', color=colors['green'])
    axs[0].set_title('Loss')
    axs[0].set_xlabel('Epochs')
    axs[0].set_ylabel('Loss')
    axs[0].legend()
    # Plot RMSE
    axs[1].plot(epochs, hist_data['train_rmse'], label='Train RMSE', color=colors['blue'])
    axs[1].plot(epochs, hist_data['val_rmse'], label='Validation RMSE', color=colors['green'])
    axs[1].set_title('RMSE')
    axs[1].set_xlabel('Epochs')
    axs[1].set_ylabel('RMSE')
    axs[1].legend()
    plt.suptitle(title)
    plt.tight_layout()
    # save plot
    if save:
        save_plot(plt, title)
    return plt
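
# Minimal usage sketch (hypothetical values; the dict keys 'train_loss', 'val_loss',
# 'train_rmse', 'val_rmse' and the color names 'blue'/'green'/'red' are what the
# functions in this module expect):
#
#   colors = {'blue': '#1f77b4', 'green': '#2ca02c', 'red': '#d62728'}
#   hist_data = {
#       'train_loss': [0.9, 0.6, 0.4], 'val_loss': [1.0, 0.7, 0.5],
#       'train_rmse': [0.95, 0.77, 0.63], 'val_rmse': [1.0, 0.84, 0.71],
#   }
#   plot_training_history(hist_data, colors, save=False)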


def plot_distribution(true_values, predicted_values, colors, title='Distribution of Predicted and True Values', save=True):
    plt.figure(figsize=(10, 6))
    plt.hist(true_values, bins=20, color=colors['green'], edgecolor='black', alpha=0.7, label='True Values')
    plt.hist(predicted_values, bins=20, color=colors['blue'], edgecolor='black', alpha=0.7, label='Predicted Values')
    plt.title(title)
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    # save plot
    if save:
        save_plot(plt, title)
    return plt


def plot_predictions(true_values, predicted_values, colors, title='True vs Predicted Values', threshold=0.3, save=True):
    plt.figure(figsize=(10, 6))
    # A prediction counts as "correct" if it lies within `threshold` of the true value
    correct_indices = np.isclose(true_values, predicted_values, atol=threshold)
    incorrect_indices = ~correct_indices
    # Plot
    plt.scatter(np.array(true_values)[correct_indices], np.array(predicted_values)[correct_indices],
                color=colors['green'], alpha=0.5, label='Correctly predicted')
    plt.scatter(np.array(true_values)[incorrect_indices], np.array(predicted_values)[incorrect_indices],
                color=colors['red'], alpha=0.5, label='Incorrectly predicted')
    plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)],
             color=colors['blue'], linestyle='--', label='Ideal Line')
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    # save plot
    if save:
        save_plot(plt, title)
    return plt
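
# Sketch of the correctness rule above, on hypothetical numbers: with the default
# threshold of 0.3, a prediction of 3.2 for a true score of 3.0 counts as correct
# (|3.2 - 3.0| <= 0.3), while 3.4 does not:
#
#   np.isclose([3.0, 3.0], [3.2, 3.4], atol=0.3)  # -> array([ True, False])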


def plot_residuals(labels, preds, colors, title='Residuals Plot', save=True):
    residuals = np.array(preds) - np.array(labels)
    fig = plt.figure(figsize=(14, 6))
    gs = gridspec.GridSpec(1, 2, width_ratios=[4, 1])
    # Main plot: residuals against true values
    ax0 = plt.subplot(gs[0])
    ax0.scatter(labels, residuals, label='Residuals', color=colors['blue'], alpha=0.5)
    # Fit a linear regression to the residuals to visualise any systematic trend
    labels_reshaped = np.array(labels).reshape(-1, 1)
    model = LinearRegression()
    model.fit(labels_reshaped, residuals)
    trend_line = model.predict(labels_reshaped)
    # Plot trend line
    ax0.plot(labels, trend_line, color=colors['red'], label='Trend Line', linewidth=2)
    ax0.set_xlabel('True Values')
    ax0.set_ylabel('Residuals')
    ax0.axhline(y=0, color='k', linestyle='--')
    ax0.set_title(title)
    ax0.legend()
    # Side plot: distribution of the residuals
    ax1 = plt.subplot(gs[1], sharey=ax0)
    ax1.hist(residuals, bins=30, alpha=0.5, color=colors['blue'], orientation='horizontal')
    ax1.set_xlabel('Frequency')
    ax1.set_title('Distribution of residuals')
    ax1.yaxis.tick_right()
    ax1.yaxis.set_label_position("right")
    plt.tight_layout()
    # save plot
    if save:
        save_plot(plt, title)
    return plt


def plot_qq(labels, preds, colors, title='Q-Q Plot of Residuals', save=True):
    residuals = np.array(preds) - np.array(labels)
    # Generate a Normal Q-Q plot
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    stats.probplot(residuals, dist="norm", plot=ax)
    # Set colors
    lines = ax.get_lines()
    lines[0].set_color(colors['blue'])  # Data points
    lines[1].set_color(colors['red'])   # Fit line
    plt.title(title)
    # save plot
    if save:
        save_plot(plt, title)
    return plt


def plot_val_preds(val_preds, val_labels, colors, title='Histogram of Validation Predictions', save=True):
    plt.figure(figsize=(10, 6))
    plt.hist(val_labels, bins=20, alpha=0.5, label='True Values', color=colors['green'])
    cmap = cm.get_cmap('coolwarm', len(val_preds))  # Use 'coolwarm' colormap for gradient from red to blue
    for epoch, preds in val_preds.items():
        color = cmap(len(val_preds) - epoch)  # Get color from colormap
        plt.hist(preds, bins=20, alpha=0.5, label=f'Epoch {epoch}', color=color)
    plt.xlabel('Predicted Values')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    # save plot
    if save:
        save_plot(plt, title)
    return plt
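
# Minimal usage sketch for plot_val_preds (hypothetical data; it assumes val_preds is
# a dict keyed by epoch number with one list of predictions per epoch, and epoch keys
# starting at 1 so the colormap index len(val_preds) - epoch stays within range):
#
#   val_labels = [2.0, 3.5, 4.0, 1.5]
#   val_preds = {1: [2.8, 3.0, 3.1, 2.9], 2: [2.3, 3.4, 3.8, 1.9]}
#   plot_val_preds(val_preds, val_labels, colors, save=False)
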
####################################################################################################
############### Comparison Plots ###################################################################
####################################################################################################