5.1. PLSR vs. CNN Learning Curves

Visualizing how the performance of the PLSR and CNN models evolves as the number of training samples increases.

# Mount Google Drive and install mirzai when running on Google Colab
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    !pip install mirzai

# Python utils
from pathlib import Path
import pickle

from mirzai.vis.core import (centimeter, PRIMARY_COLOR, 
                             plot_learning_curve, plot_capacity,
                             set_style, DEFAULT_STYLE)
from mirzai.training.core import load_dumps

# Data vis.
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Data science stack
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

Utilities

def reduce(dumps):
    # Aggregate repeated runs: mean and std of the test score per training-set size
    df = pd.concat([pd.DataFrame(perf) for perf in dumps])
    return df.groupby(['n_samples']).agg({'test_score': ['mean', 'std']})
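
For reference, here is a minimal sketch of the structure reduce expects; the toy_dumps values below are made up purely for illustration (each dump holds one test score per training-set size for a single repetition):

# Hypothetical toy input, for illustration only: two repetitions,
# each with one test score per training-set size
toy_dumps = [
    {'n_samples': [500, 1000], 'test_score': [0.35, 0.44]},
    {'n_samples': [500, 1000], 'test_score': [0.38, 0.45]},
]
reduce(toy_dumps)  # -> mean/std of test_score, indexed by n_samples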

Input data

To generate the learning curves for both the PLSR and CNN models, run the following notebooks:

* PLSR Learning curve
* CNN Learning curve

Here, instead, we load results that were generated and saved beforehand: history_pls_learning_curve.pickle and history_cnn_learning_curve.pickle.
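
As a side note, load_dumps (used below) is assumed here to simply gather pickled run histories from a directory; a hand-rolled equivalent might look like the sketch below (the '*.pickle' pattern and the helper name are assumptions, not part of the mirzai API):

# Minimal sketch of reading the pickled dumps by hand, assuming the
# dump directory contains one pickle file per run ('*.pickle' is a guess)
def load_dumps_manually(src_dir):
    dumps = []
    for path in sorted(Path(src_dir).glob('*.pickle')):
        with open(path, 'rb') as f:
            dumps.append(pickle.load(f))
    return dumps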

PLSR

src_dir = Path('dumps/plsr/learning_curve')
dumps = load_dumps(src_dir)
df_plsr = reduce(dumps); df_plsr
           test_score
                 mean       std
n_samples
500          0.357241  0.168292
1000         0.446311  0.064118
2000         0.484049  0.066392
5000         0.563827  0.078549
10000        0.619452  0.030407
20000        0.634474  0.016305
30000        0.633108  0.013739
40132        0.639170  0.012540

CNN

src_dir = Path('dumps/cnn/learning_curve')
dumps = load_dumps(src_dir)
df_cnn = reduce(dumps); df_cnn.head()
           test_score
                 mean       std
n_samples
500          0.419518  0.198391
1000         0.453100  0.086281
2000         0.533940  0.054055
5000         0.622144  0.034591
10000        0.679383  0.024025

history_plsr, history_cnn = [{'nb_samples': df.index.to_numpy(),
                              'r2_mean': df[('test_score', 'mean')].to_numpy(),
                              'r2_std': df[('test_score', 'std')].to_numpy()
                             }
                             for df in [df_plsr, df_cnn]]
history_plsr
{'nb_samples': array([  500,  1000,  2000,  5000, 10000, 20000, 30000, 40132]),
 'r2_mean': array([0.35724121, 0.44631125, 0.48404897, 0.56382695, 0.61945173,
        0.63447414, 0.63310825, 0.63917001]),
 'r2_std': array([0.16829222, 0.06411818, 0.06639178, 0.0785487 , 0.03040741,
        0.01630533, 0.01373926, 0.0125405 ])}
history_cnn
{'nb_samples': array([  500,  1000,  2000,  5000, 10000, 20000, 30000, 40132]),
 'r2_mean': array([0.41951778, 0.45309992, 0.53394023, 0.6221441 , 0.67938348,
        0.72648183, 0.75480461, 0.77354623]),
 'r2_std': array([0.19839095, 0.08628061, 0.05405458, 0.03459126, 0.02402495,
        0.02137746, 0.02162855, 0.01649216])}

Plot

def plot_learning_curve(x, losses_train, losses_valid, ax=None, train_kwargs={}, valid_kwargs={}):
    # Plot the training (and optionally validation) losses on log-log axes
    if ax is None:
        ax = plt.gca()
    ax.plot(x, losses_train, label='Training', **train_kwargs)
    #ax.plot(x, losses_valid, label='Validation', **valid_kwargs)
    ax.set_yscale('log')
    ax.set_xscale('log')
    return ax
def plot_learning_curves(history_plsr, history_cnn,
                         figsize=(10*centimeter, 8*centimeter), dpi=600):
    # Layout
    fig = plt.figure(figsize=figsize, dpi=dpi)

    gs = GridSpec(nrows=1, ncols=1)
    ax = fig.add_subplot(gs[0, 0])
    # Plots
    params = {'marker': 'o', 'mfc':'w', 'ms':3}
    x, mean, std = history_plsr.values()
    fill_between_params = {'facecolor': 'C0', 'alpha': 0.15, 'zorder': 1}
    ax.fill_between(x, mean + std, mean - std, **fill_between_params)
    ax.plot(x, mean, label='PLSR', c='C0', **params)
    
    x, mean, std = history_cnn.values()
    fill_between_params = {'facecolor': 'C1', 'alpha': 0.15, 'zorder': 1}
    ax.fill_between(x, mean + std, mean - std, **fill_between_params)
    ax.plot(x, mean, label='CNN', c='C1', **params)
    
    ax.set_xscale('log')
    
    # Ornaments
    ax.legend(loc='best', frameon=False) 
    ax.set_ylabel('$R^2$ →', loc='top')

    ax.set_xlabel('Data size →', loc='right')
    ax.grid(True, "minor", color="0.85", linewidth=0.2, zorder=-20)
    ax.grid(True, "major", color="0.65", linewidth=0.4, zorder=-10) 

    plt.tight_layout()
FIG_PATH = Path('images/')
set_style(DEFAULT_STYLE)
plot_learning_curves(history_plsr, history_cnn)

# To save/export it
plt.savefig(FIG_PATH/'learning-curves.png', dpi=600, transparent=True, format='png')
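
If a vector version of the figure is needed as well, the same call can export a PDF (a straightforward variation, not part of the original notebook):

# Optional: also export a vector (PDF) version alongside the PNG
plt.savefig(FIG_PATH/'learning-curves.pdf', transparent=True, format='pdf')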