if 'google.colab' in str(get_ipython()):
from google.colab import drive
'/content/drive', force_remount=False)
drive.mount(!pip install mirzai
else:
5.1. PLSR vs. CNN Learning Curves
Visualizing the behaviours (performances) of PLSR and CNN models as the number of samples increases
# Python utils
from pathlib import Path
import pickle
# NOTE: `plot_learning_curve` is redefined further down in this notebook,
# which shadows the version imported here.
from mirzai.vis.core import (centimeter, PRIMARY_COLOR,
                             plot_learning_curve, plot_capacity,
                             set_style, DEFAULT_STYLE)
from mirzai.training.core import load_dumps
# Data vis.
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
# Data science stack
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the remainder of the session (no category filter
# is given, so the 'ignore' action applies to all of them).
warnings.filterwarnings('ignore')
Utilities
def reduce(dumps):
    """Aggregate performance dumps into mean/std test scores per `n_samples`.

    Args:
        dumps: Iterable of objects accepted by `pd.DataFrame` (e.g. dicts of
            equal-length lists), each providing at least the columns
            `n_samples` and `test_score`.

    Returns:
        A DataFrame indexed by `n_samples` whose columns are the MultiIndex
        entries `('test_score', 'mean')` and `('test_score', 'std')`.

    Raises:
        ValueError: If `dumps` is empty (raised by `pd.concat`).
    """
    # Stack all runs into one long frame so groupby sees every repetition.
    df = pd.concat([pd.DataFrame(perf) for perf in dumps])
    return df.groupby(['n_samples']).agg({'test_score': ['mean', 'std']})
Input data
To generate the learning curves for both the PLSR and CNN models, run the following notebooks:

* PLSR Learning curve
* CNN Learning curve
Here, we instead load previously generated and saved data: `history_pls_learning_curve.pickle` and `history_cnn_learning_curve.pickle`.
PLSR
# Load the saved PLSR learning-curve dumps and aggregate them per `n_samples`.
src_dir = Path('dumps/plsr/learning_curve')
dumps = load_dumps(src_dir)
df_plsr = reduce(dumps)
df_plsr
| n_samples | test_score (mean) | test_score (std) |
|---|---|---|
| 500 | 0.357241 | 0.168292 |
| 1000 | 0.446311 | 0.064118 |
| 2000 | 0.484049 | 0.066392 |
| 5000 | 0.563827 | 0.078549 |
| 10000 | 0.619452 | 0.030407 |
| 20000 | 0.634474 | 0.016305 |
| 30000 | 0.633108 | 0.013739 |
| 40132 | 0.639170 | 0.012540 |
CNN
# Load the saved CNN learning-curve dumps and aggregate them per `n_samples`.
src_dir = Path('dumps/cnn/learning_curve')
dumps = load_dumps(src_dir)
df_cnn = reduce(dumps)
df_cnn.head()
| n_samples | test_score (mean) | test_score (std) |
|---|---|---|
| 500 | 0.419518 | 0.198391 |
| 1000 | 0.453100 | 0.086281 |
| 2000 | 0.533940 | 0.054055 |
| 5000 | 0.622144 | 0.034591 |
| 10000 | 0.679383 | 0.024025 |
# Convert each aggregated frame into a dict of plain NumPy arrays.
# Key order matters: `plot_learning_curves` unpacks these dicts positionally
# via `.values()` as (sample sizes, mean, std).
history_plsr, history_cnn = [
    {
        'nb_samples': df.index.to_numpy(),
        'r2_mean': df[('test_score', 'mean')].to_numpy(),
        'r2_std': df[('test_score', 'std')].to_numpy(),
    }
    for df in [df_plsr, df_cnn]
]
history_plsr
{'nb_samples': array([ 500, 1000, 2000, 5000, 10000, 20000, 30000, 40132]),
'r2_mean': array([0.35724121, 0.44631125, 0.48404897, 0.56382695, 0.61945173,
0.63447414, 0.63310825, 0.63917001]),
'r2_std': array([0.16829222, 0.06411818, 0.06639178, 0.0785487 , 0.03040741,
0.01630533, 0.01373926, 0.0125405 ])}
history_cnn
{'nb_samples': array([ 500, 1000, 2000, 5000, 10000, 20000, 30000, 40132]),
'r2_mean': array([0.41951778, 0.45309992, 0.53394023, 0.6221441 , 0.67938348,
0.72648183, 0.75480461, 0.77354623]),
'r2_std': array([0.19839095, 0.08628061, 0.05405458, 0.03459126, 0.02402495,
0.02137746, 0.02162855, 0.01649216])}
Plot
def plot_learning_curve(x, losses_train, losses_valid, ax=None,
                        train_kwargs=None, valid_kwargs=None):
    """Plot the training curve against `x` on log-log axes.

    Args:
        x: Values for the horizontal axis.
        losses_train: Values plotted with the label 'Training'.
        losses_valid: Accepted for interface compatibility but currently not
            drawn (the validation plot call is commented out below).
        ax: Axes to draw on; defaults to `plt.gca()` when None.
        train_kwargs: Extra keyword arguments forwarded to `ax.plot` for the
            training curve. None is treated as no extra arguments.
        valid_kwargs: Currently unused, see `losses_valid`.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()
    # Avoid a shared mutable default; None stands for "no extra kwargs".
    train_kwargs = {} if train_kwargs is None else train_kwargs

    ax.plot(x, losses_train, label='Training', **train_kwargs)
    # NOTE(review): validation curve deliberately disabled in the original.
    # ax.plot(x, losses_valid, label='Validation', **valid_kwargs)
    ax.set_yscale('log')
    ax.set_xscale('log')
    return ax
def _plot_score_band(ax, history, label, color, line_kwargs):
    """Draw one model's mean line plus a shaded mean - std .. mean + std band."""
    # Relies on the dict's insertion order: (sample sizes, mean, std).
    x, mean, std = history.values()
    ax.fill_between(x, mean + std, mean - std,
                    facecolor=color, alpha=0.15, zorder=1)
    ax.plot(x, mean, label=label, c=color, **line_kwargs)


def plot_learning_curves(history_pls, history_cnn,
                         figsize=(10*centimeter, 8*centimeter), dpi=600):
    """Plot PLSR and CNN learning curves on one new figure.

    Each curve is the mean score against the number of samples (log-scaled
    x-axis), with a shaded band from mean - std to mean + std.

    Args:
        history_pls: Dict whose values are, in order, the sample sizes, the
            mean scores and the score standard deviations for PLSR (arrays
            supporting element-wise `+`/`-`).
        history_cnn: Same structure as `history_pls`, for the CNN.
        figsize: Figure size passed to `plt.figure`.
        dpi: Figure resolution passed to `plt.figure`.

    Returns:
        None. The figure is left as the current pyplot figure.
    """
    # Layout
    fig = plt.figure(figsize=figsize, dpi=dpi)  # honour the `dpi` argument
    gs = GridSpec(nrows=1, ncols=1)
    ax = fig.add_subplot(gs[0, 0])

    # Plots: use the function arguments, not notebook-level globals.
    params = {'marker': 'o', 'mfc': 'w', 'ms': 3}
    _plot_score_band(ax, history_pls, 'PLSR', 'C0', params)
    _plot_score_band(ax, history_cnn, 'CNN', 'C1', params)
    ax.set_xscale('log')

    # Ornaments
    ax.legend(loc='best', frameon=False)
    ax.set_ylabel('$R^2$ →', loc='top')
    ax.set_xlabel('Data size →', loc='right')
    ax.grid(True, "minor", color="0.85", linewidth=0.2, zorder=-20)
    ax.grid(True, "major", color="0.65", linewidth=0.4, zorder=-10)

    plt.tight_layout()
# Output directory for exported figures.
FIG_PATH = Path('images/')

set_style(DEFAULT_STYLE)
plot_learning_curves(history_plsr, history_cnn)

# To save/export it
plt.savefig(FIG_PATH/'learning-curves.png', dpi=600, transparent=True, format='png')