3.2. Train & evaluate (PLSR)

Train & evaluate on multiple train/test splits with different random seeds

# Environment setup: when the notebook runs on Google Colab, mount Google
# Drive and install the `mirzai` package.
# NOTE(review): `!pip ...` is IPython shell syntax, so this cell only runs
# inside a notebook/IPython session, not as a plain Python script.
# NOTE(review): the body of the `else` branch is not present in this export
# (presumably the autoreload magics, given the message printed right after
# this cell) -- confirm against the original notebook.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive',  force_remount=False)
    !pip install mirzai
else:
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
# Python utilities
from pathlib import Path
import pickle


# mirzai utilities
from mirzai.data.loading import load_kssl
from mirzai.data.selection import (select_y, select_tax_order, select_X)
from mirzai.data.transform import (log_transform_y, CO2_REGION)

#from mirzai.training.plsr import (compute_valid_curve, PLS_model, Evaluator)
from mirzai.training.plsr import (PLS_model, Learners)

from fastcore.transform import compose

# Data science stack
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

Load and transform

# Load the raw KSSL arrays from `src_dir`, then run the selection/transform
# pipeline and print a short summary of what came out.
src_dir = 'data'
fnames = [
    'spectra-features.npy',
    'spectra-wavenumbers.npy',
    'depth-order.npy',
    'target.npy',
    'tax-order-lu.pkl',
    'spectra-id.npy',
]

X, X_names, depth_order, y, tax_lookup, X_id = load_kssl(src_dir, fnames=fnames)

# Only these four arrays go through the pipeline; X_names and tax_lookup are
# kept exactly as loaded.
data = (X, y, X_id, depth_order)

transforms = [select_y, select_tax_order, select_X, log_transform_y]
pipeline = compose(*transforms)
X, y, X_id, depth_order = pipeline(data)

# Summary of the transformed data.
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'Wavenumbers:\n {X_names}')
print(f'depth_order (first 3 rows):\n {depth_order[:3, :]}')
print(f'Taxonomic order lookup:\n {tax_lookup}')
X shape: (40132, 1764)
y shape: (40132,)
Wavenumbers:
 [3999 3997 3995 ...  603  601  599]
depth_order (first 3 rows):
 [[43.  2.]
 [ 0.  0.]
 [ 0.  1.]]
Taxonomic order lookup:
 {'alfisols': 0, 'mollisols': 1, 'inceptisols': 2, 'entisols': 3, 'spodosols': 4, 'undefined': 5, 'ultisols': 6, 'andisols': 7, 'histosols': 8, 'oxisols': 9, 'vertisols': 10, 'aridisols': 11, 'gelisols': 12}

Experiment

Setup

# Shared experiment settings.
# NOTE(review): `split_ratio` and `dest_dir` are not referenced by the cells
# visible here; presumably they are the test fraction and the root output
# folder -- confirm before relying on them.
dest_dir = Path('files/dumps/plsr/train_eval')
# One entry per random seed (0..19).
split_ratio, seeds = 0.1, range(20)

Train on all Soil Taxonomic Orders

# Train on all soil taxonomic orders (no `order` filter), one run per seed.
seeds = range(20)
dest_dir_model = Path('files/dumps/plsr/train_eval/all/models')

# Only the last column of depth_order is passed along with X and y
# (presumably the taxonomic order id -- TODO confirm).
train_data = (X, y, depth_order[:, -1])
train_kwargs = dict(
    n_cpts_range=range(40, 70, 2),  # candidate numbers of components: 40, 42, ..., 68
    delta=2e-3,
    dest_dir_model=dest_dir_model,
)

learners = Learners(tax_lookup, seeds=seeds)
learners.train(train_data, **train_kwargs)
--------------------------------------------------------------------------------
Seed: 0
--------------------------------------------------------------------------------
# of components chosen: 50
--------------------------------------------------------------------------------
Seed: 1
--------------------------------------------------------------------------------
# of components chosen: 52
--------------------------------------------------------------------------------
Seed: 2
--------------------------------------------------------------------------------
# of components chosen: 54
--------------------------------------------------------------------------------
Seed: 3
--------------------------------------------------------------------------------
# of components chosen: 56
--------------------------------------------------------------------------------
Seed: 4
--------------------------------------------------------------------------------
# of components chosen: 52
--------------------------------------------------------------------------------
Seed: 5
--------------------------------------------------------------------------------
# of components chosen: 54
--------------------------------------------------------------------------------
Seed: 6
--------------------------------------------------------------------------------
# of components chosen: 58
--------------------------------------------------------------------------------
Seed: 7
--------------------------------------------------------------------------------
# of components chosen: 62
--------------------------------------------------------------------------------
Seed: 8
--------------------------------------------------------------------------------
# of components chosen: 48
--------------------------------------------------------------------------------
Seed: 9
--------------------------------------------------------------------------------
# of components chosen: 52
--------------------------------------------------------------------------------
Seed: 10
--------------------------------------------------------------------------------
# of components chosen: 58
--------------------------------------------------------------------------------
Seed: 11
--------------------------------------------------------------------------------
# of components chosen: 48
--------------------------------------------------------------------------------
Seed: 12
--------------------------------------------------------------------------------
# of components chosen: 48
--------------------------------------------------------------------------------
Seed: 13
--------------------------------------------------------------------------------
# of components chosen: 48
--------------------------------------------------------------------------------
Seed: 14
--------------------------------------------------------------------------------
# of components chosen: 44
--------------------------------------------------------------------------------
Seed: 15
--------------------------------------------------------------------------------
# of components chosen: 68
--------------------------------------------------------------------------------
Seed: 16
--------------------------------------------------------------------------------
# of components chosen: 44
--------------------------------------------------------------------------------
Seed: 17
--------------------------------------------------------------------------------
# of components chosen: 50
--------------------------------------------------------------------------------
Seed: 18
--------------------------------------------------------------------------------
# of components chosen: 64
--------------------------------------------------------------------------------
Seed: 19
--------------------------------------------------------------------------------
# of components chosen: 58

Evaluate on all Soil Taxonomic Orders

# Evaluate the models stored under `src_dir_model` on all orders, for every seed.
seeds = range(20)
src_dir_model = Path('files/dumps/plsr/train_eval/all/models')
learners = Learners(tax_lookup, seeds=seeds)

eval_data = (X, y, depth_order[:, -1])
perfs_global_all, y_hats_all, y_trues_all = learners.evaluate(
    eval_data,
    src_dir_model=src_dir_model,
)

# Save a specific seed's y_hat, y_true to plot "Observed vs. predicted" scatterplots.
# NOTE(review): indexing the results with `seed` assumes they are ordered like
# `seeds` (true for range(20), where index == seed) -- confirm.
seed = 1
dest_dir_predicted = Path('files/dumps/')
fname_predicted = dest_dir_predicted/f'predicted-true-plsr-seed-{seed}.pickle'
with open(fname_predicted, 'wb') as f:
    pickle.dump((y_hats_all[seed].to_numpy(), y_trues_all[seed].to_numpy()), f)

# Summary statistics of the metrics over the 20 seeds.
perfs_global_all.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 1.658763 2.272073 0.636279 0.780492 1.057159 1.306802 0.335821 135.026036 -0.000259 -0.000547
std 0.022627 0.033917 0.009834 0.005444 0.446290 1.222881 0.016914 3.283663 0.003848 0.007405
min 1.623276 2.204548 0.620402 0.771287 0.671250 0.450576 0.308898 130.007174 -0.006300 -0.012485
25% 1.640177 2.256584 0.628185 0.777881 0.756833 0.572822 0.325999 132.585609 -0.003009 -0.005756
50% 1.660981 2.267233 0.637440 0.780165 0.861632 0.742537 0.331193 134.434034 -0.001383 -0.002745
75% 1.666828 2.296689 0.639975 0.784414 1.093822 1.216215 0.344576 136.850054 0.002217 0.004202
max 1.707217 2.328779 0.656813 0.793939 2.242110 5.027059 0.372859 140.504871 0.007079 0.013396
# Evaluate the models trained on all orders separately on each taxonomic order.
src_dir_model = Path('files/dumps/plsr/train_eval/all/models')
seeds = range(20)

# Every (name, id) pair of the lookup except 'oxisols'.
# NOTE(review): the reason for skipping 'oxisols' is not visible here -- confirm.
orders_to_test = [(name, idx) for name, idx in tax_lookup.items() if name != 'oxisols']
rule = '-' * 80

for k, v in orders_to_test:
    # Banner: rule / title / rule, each on its own line.
    print(rule, f'Test metrics on {k}', rule, sep='\n')
    # A fresh Learners instance is created for every order, as in the other cells.
    learners = Learners(tax_lookup, seeds=seeds)
    perfs_global, _, _ = learners.evaluate(
        (X, y, depth_order[:, -1]),
        order=v,
        src_dir_model=src_dir_model,
    )
    print(perfs_global.describe())
--------------------------------------------------------------------------------
Test metrics on alfisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.488581   2.023110   0.544065   0.727878   0.411130   0.198526   
std     0.077245   0.111888   0.048141   0.028838   0.176213   0.187989   
min     1.335787   1.825327   0.438167   0.673686   0.207121   0.042899   
25%     1.429706   1.971203   0.509535   0.713507   0.282230   0.079724   
50%     1.493973   2.024866   0.550797   0.731100   0.390250   0.152373   
75%     1.533459   2.092083   0.573742   0.749183   0.462969   0.214501   
max     1.625880   2.267384   0.620742   0.769198   0.845685   0.715184   

             mae       mape       bias        stb  
count  20.000000  20.000000  20.000000  20.000000  
mean    0.162636  82.627329  -0.001926  -0.004745  
std     0.020796   4.371945   0.008761   0.023061  
min     0.128730  73.976273  -0.018855  -0.047651  
25%     0.146705  79.772143  -0.009633  -0.025287  
50%     0.159680  83.069107  -0.000834  -0.002193  
75%     0.179517  85.662533   0.003653   0.009967  
max     0.195241  93.340300   0.011908   0.032238  
--------------------------------------------------------------------------------
Test metrics on mollisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.582966   2.090417   0.599952   0.742203   0.769248   0.629664   
std     0.034982   0.078073   0.017652   0.012356   0.199794   0.322337   
min     1.525933   1.972214   0.570089   0.722419   0.526864   0.277585   
25%     1.558545   2.042070   0.587883   0.734876   0.600345   0.360454   
50%     1.579016   2.077462   0.598502   0.738004   0.735055   0.541195   
75%     1.610838   2.141203   0.614204   0.752421   0.917944   0.842758   
max     1.642984   2.250635   0.629145   0.764131   1.123730   1.262768   

             mae       mape       bias        stb  
count  20.000000  20.000000  20.000000  20.000000  
mean    0.307466  94.298259   0.001662   0.003734  
std     0.018656   3.052491   0.008008   0.018876  
min     0.279430  87.394573  -0.013899  -0.033922  
25%     0.289349  92.537801  -0.004175  -0.009866  
50%     0.308846  94.409650   0.001370   0.003153  
75%     0.320209  96.313669   0.008152   0.018857  
max     0.340558  98.688058   0.014374   0.032833  
--------------------------------------------------------------------------------
Test metrics on inceptisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.484931   2.062821   0.543190   0.715217   0.560004   0.327634   
std     0.052383   0.113698   0.032576   0.021247   0.121523   0.141674   
min     1.393604   1.834631   0.483181   0.680782   0.383973   0.147435   
25%     1.443211   1.980935   0.518045   0.696649   0.462424   0.214457   
50%     1.496851   2.072912   0.552023   0.716683   0.563604   0.317676   
75%     1.515227   2.150964   0.562994   0.733597   0.621687   0.386500   
max     1.588288   2.274688   0.602151   0.752080   0.802342   0.643753   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.258099  117.611669  -0.025303  -0.051641  
std     0.030971    6.840199   0.011152   0.023835  
min     0.206618  105.015942  -0.046778  -0.093141  
25%     0.236181  114.370854  -0.033645  -0.068934  
50%     0.253719  116.712206  -0.022784  -0.045203  
75%     0.276829  119.444905  -0.017516  -0.032937  
max     0.332711  134.933979  -0.007504  -0.014204  
--------------------------------------------------------------------------------
Test metrics on entisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.495344   2.102047   0.535041   0.733857   0.442698   0.204343   
std     0.160012   0.348114   0.099822   0.051184   0.093817   0.089328   
min     1.229760   1.616803   0.334548   0.616401   0.310090   0.096156   
25%     1.435865   1.802874   0.511824   0.723841   0.396742   0.157491   
50%     1.505428   2.070569   0.555707   0.748509   0.417861   0.174608   
75%     1.587539   2.257373   0.600692   0.761758   0.506173   0.256319   
max     1.916463   2.839117   0.725927   0.830845   0.663214   0.439853   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.227277  124.306496  -0.013363  -0.027085  
std     0.033105   15.203068   0.022993   0.048705  
min     0.172270  102.336085  -0.045416  -0.111028  
25%     0.209802  113.533998  -0.030312  -0.054147  
50%     0.226905  119.691934  -0.020139  -0.041128  
75%     0.247482  131.740359  -0.003590  -0.006261  
max     0.287580  161.354752   0.034848   0.076445  
--------------------------------------------------------------------------------
Test metrics on spodosols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.884385   2.713797   0.699472   0.824446   0.482583   0.249341   
std     0.238908   0.639696   0.079492   0.043133   0.131607   0.130359   
min     1.390288   1.803467   0.474431   0.721559   0.276331   0.076359   
25%     1.712985   2.360031   0.652982   0.792727   0.385248   0.148728   
50%     1.870642   2.527430   0.709356   0.835642   0.477664   0.228171   
75%     2.015491   3.008323   0.749113   0.844006   0.557888   0.312005   
max     2.398842   4.506760   0.823003   0.895621   0.706393   0.498991   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.262912  157.167552   0.014543   0.020080  
std     0.058003   19.733897   0.028305   0.043673  
min     0.171519  126.395882  -0.019549  -0.047892  
25%     0.219475  142.879484  -0.001157  -0.002089  
50%     0.263831  156.169654   0.007113   0.011952  
75%     0.296229  171.764491   0.019708   0.037011  
max     0.359890  188.821685   0.101276   0.143619  
--------------------------------------------------------------------------------
Test metrics on undefined
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.699452   2.327902   0.653167   0.788484   1.152385   1.582740   
std     0.032766   0.070709   0.013227   0.007203   0.517839   1.675655   
min     1.648992   2.217120   0.632006   0.777364   0.766317   0.587241   
25%     1.677440   2.275579   0.644376   0.781214   0.880365   0.775066   
50%     1.692550   2.328214   0.650700   0.788912   0.935093   0.874408   
75%     1.724165   2.352930   0.663386   0.795110   1.060103   1.132949   
max     1.757505   2.475099   0.676041   0.798610   2.462503   6.063923   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.424512  152.988067   0.004325   0.007633  
std     0.029141    7.248857   0.006910   0.012443  
min     0.362205  141.274022  -0.006812  -0.012853  
25%     0.407466  148.161140  -0.001129  -0.002094  
50%     0.424653  151.755239   0.003296   0.005683  
75%     0.443348  157.515627   0.009594   0.017346  
max     0.475289  168.416540   0.016393   0.029619  
--------------------------------------------------------------------------------
Test metrics on ultisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.201589   1.665164   0.296985   0.581148   0.326529   0.109017   
std     0.067696   0.152486   0.081464   0.056106   0.050224   0.034916   
min     1.079312   1.352102   0.136519   0.489327   0.252949   0.063983   
25%     1.151943   1.560671   0.242226   0.546748   0.295477   0.087307   
50%     1.207474   1.676421   0.310236   0.584230   0.321999   0.103689   
75%     1.237247   1.789812   0.343037   0.616695   0.350845   0.123099   
max     1.329557   1.890884   0.430578   0.701262   0.450793   0.203214   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.181925   80.909836   0.039877   0.094603  
std     0.025022    9.029538   0.014663   0.031388  
min     0.150119   67.915820   0.013417   0.031652  
25%     0.164187   74.465798   0.025832   0.068632  
50%     0.174361   80.450314   0.037480   0.100163  
75%     0.197729   88.393075   0.050413   0.112495  
max     0.249080  101.466523   0.065988   0.147605  
--------------------------------------------------------------------------------
Test metrics on andisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.561501   2.013977   0.584894   0.728846   0.592610   0.428996   
std     0.054676   0.214017   0.028844   0.027988   0.286190   0.451123   
min     1.462280   1.607348   0.528559   0.677337   0.344830   0.118907   
25%     1.522535   1.939558   0.564517   0.712462   0.397968   0.158418   
50%     1.560284   2.031585   0.585664   0.727166   0.479231   0.229689   
75%     1.594774   2.100454   0.603265   0.746108   0.630045   0.397882   
max     1.659770   2.348595   0.633790   0.781835   1.299265   1.688089   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.281092  106.394236   0.015788   0.033443  
std     0.049266   11.345530   0.017747   0.037467  
min     0.205634   84.861332  -0.015052  -0.033935  
25%     0.241596   98.732944   0.005526   0.013816  
50%     0.288964  106.024153   0.013751   0.028963  
75%     0.307314  112.704328   0.029538   0.064234  
max     0.390000  130.842886   0.043541   0.094777  
--------------------------------------------------------------------------------
Test metrics on histosols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.711375   2.500980   0.637454   0.778795   1.165003   1.492735   
std     0.214934   0.635935   0.090172   0.055099   0.377669   1.060323   
min     1.374536   1.609684   0.462316   0.677843   0.634321   0.402363   
25%     1.566681   2.034666   0.586609   0.753988   0.927356   0.860062   
50%     1.681201   2.410043   0.641241   0.776610   1.093495   1.195750   
75%     1.820641   2.907901   0.693132   0.811247   1.278496   1.634970   
max     2.161752   3.628460   0.782561   0.867816   2.175235   4.731648   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.610852  243.155805   0.001554  -0.003005  
std     0.133948   46.102071   0.035374   0.045302  
min     0.380170  172.238410  -0.064465  -0.110639  
25%     0.546008  210.254481  -0.010255  -0.015323  
50%     0.567238  231.883135  -0.000548  -0.000847  
75%     0.674668  284.666143   0.009428   0.010923  
max     0.865779  324.890279   0.077207   0.078856  
--------------------------------------------------------------------------------
Test metrics on vertisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.524148   2.101732   0.547234   0.732481   0.390509   0.161691   
std     0.175537   0.331334   0.101618   0.061666   0.098375   0.090253   
min     1.233878   1.556100   0.333783   0.607804   0.279383   0.078055   
25%     1.428539   1.912700   0.503573   0.695811   0.327822   0.107521   
50%     1.478822   2.143325   0.536402   0.728507   0.361795   0.130898   
75%     1.632835   2.287894   0.619066   0.781692   0.423892   0.179688   
max     1.895009   2.787665   0.717768   0.831755   0.673910   0.454155   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.238076   98.744299  -0.052767  -0.134787  
std     0.035953   12.347859   0.015789   0.043975  
min     0.183381   83.009898  -0.075401  -0.206095  
25%     0.206193   89.681748  -0.061264  -0.162066  
50%     0.231768   94.428492  -0.055714  -0.138102  
75%     0.259825  106.099271  -0.046249  -0.110102  
max     0.309477  127.327009  -0.015200  -0.042732  
--------------------------------------------------------------------------------
Test metrics on aridisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse         mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000   20.000000   
mean    1.378008   1.827588   0.444671   0.686068   2.131294   11.685892   
std     0.164302   0.273668   0.154683   0.067033   2.742162   28.829063   
min     1.015784   1.353244   0.024153   0.533021   0.484385    0.234629   
25%     1.294828   1.616959   0.398816   0.656175   0.657982    0.433422   
50%     1.420750   1.811119   0.501060   0.704963   1.069559    1.148925   
75%     1.471227   2.008619   0.534730   0.727574   1.995471    4.232994   
max     1.643348   2.293749   0.626948   0.771557  10.787938  116.379602   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.500131  145.908265  -0.010957  -0.025541  
std     0.250221   56.170231   0.018981   0.046494  
min     0.294676   96.568375  -0.047555  -0.126001  
25%     0.352105  113.472506  -0.021256  -0.045881  
50%     0.403889  124.989228  -0.007535  -0.013668  
75%     0.562606  154.904255   0.004738   0.008801  
max     1.257594  322.681936   0.015741   0.036257  
--------------------------------------------------------------------------------
Test metrics on gelisols
--------------------------------------------------------------------------------
             rpd       rpiq         r2       lccc       rmse        mse  \
count  20.000000  20.000000  20.000000  20.000000  20.000000  20.000000   
mean    1.692878   2.534223   0.626330   0.766703   0.646289   0.435929   
std     0.200619   0.761655   0.096257   0.059995   0.138561   0.188936   
min     1.274587   1.094882   0.368254   0.619306   0.418552   0.175186   
25%     1.607986   2.005345   0.602193   0.747244   0.552801   0.305750   
50%     1.651799   2.517481   0.624528   0.765263   0.631649   0.398982   
75%     1.849051   2.810388   0.698211   0.819691   0.723203   0.523037   
max     2.033034   3.926091   0.752298   0.864522   0.954032   0.910176   

             mae        mape       bias        stb  
count  20.000000   20.000000  20.000000  20.000000  
mean    0.389866  193.838336  -0.088982  -0.145424  
std     0.084392   31.399886   0.034638   0.082395  
min     0.255306  134.782910  -0.161743  -0.406283  
25%     0.324760  169.096695  -0.107498  -0.178604  
50%     0.390097  191.552850  -0.092811  -0.129146  
75%     0.459532  213.219983  -0.058120  -0.106705  
max     0.542090  257.645578  -0.023931  -0.030365  

Evaluate on Mollisols

# Evaluate the models trained on all orders (loaded from `src_dir_model`)
# on Mollisols only, for every seed.
src_dir_model = Path('files/dumps/plsr/train_eval/all/models')
seeds = range(20)
# 1 is the id of 'mollisols' in the taxonomic order lookup printed after loading.
order = 1
learners = Learners(tax_lookup, seeds=seeds)
# Pass the `order` variable instead of repeating the literal, so that editing
# `order` above actually changes which order is evaluated (same pattern as
# the Gelisols and Vertisols cells).
perfs_global_mollisols, _, _ = learners.evaluate((X, y, depth_order[:, -1]),
                                                 order=order,
                                                 src_dir_model=src_dir_model)
# Summary statistics of the metrics over the 20 seeds.
perfs_global_mollisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 1.582966 2.090417 0.599952 0.742203 0.769248 0.629664 0.307466 94.298259 0.001662 0.003734
std 0.034982 0.078073 0.017652 0.012356 0.199794 0.322337 0.018656 3.052491 0.008008 0.018876
min 1.525933 1.972214 0.570089 0.722419 0.526864 0.277585 0.279430 87.394573 -0.013899 -0.033922
25% 1.558545 2.042070 0.587883 0.734876 0.600345 0.360454 0.289349 92.537801 -0.004175 -0.009866
50% 1.579016 2.077462 0.598502 0.738004 0.735055 0.541195 0.308846 94.409650 0.001370 0.003153
75% 1.610838 2.141203 0.614204 0.752421 0.917944 0.842758 0.320209 96.313669 0.008152 0.018857
max 1.642984 2.250635 0.629145 0.764131 1.123730 1.262768 0.340558 98.688058 0.014374 0.032833

Evaluate on Gelisols

# Evaluate the models trained on all orders on Gelisols only.
# 12 is the id of 'gelisols' in the taxonomic order lookup printed after loading.
seeds, order = range(20), 12
src_dir_model = Path('files/dumps/plsr/train_eval/all/models')

learners = Learners(tax_lookup, seeds=seeds)
eval_data = (X, y, depth_order[:, -1])
perfs_global_gelisols, _, _ = learners.evaluate(
    eval_data,
    order=order,
    src_dir_model=src_dir_model,
)
# Summary statistics of the metrics over the 20 seeds.
perfs_global_gelisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 1.692878 2.534223 0.626330 0.766703 0.646289 0.435929 0.389866 193.838336 -0.088982 -0.145424
std 0.200619 0.761655 0.096257 0.059995 0.138561 0.188936 0.084392 31.399886 0.034638 0.082395
min 1.274587 1.094882 0.368254 0.619306 0.418552 0.175186 0.255306 134.782910 -0.161743 -0.406283
25% 1.607986 2.005345 0.602193 0.747244 0.552801 0.305750 0.324760 169.096695 -0.107498 -0.178604
50% 1.651799 2.517481 0.624528 0.765263 0.631649 0.398982 0.390097 191.552850 -0.092811 -0.129146
75% 1.849051 2.810388 0.698211 0.819691 0.723203 0.523037 0.459532 213.219983 -0.058120 -0.106705
max 2.033034 3.926091 0.752298 0.864522 0.954032 0.910176 0.542090 257.645578 -0.023931 -0.030365

Evaluate on Vertisols

# Evaluate the models trained on all orders on Vertisols only.
# 10 is the id of 'vertisols' in the taxonomic order lookup printed after loading.
seeds, order = range(20), 10
src_dir_model = Path('files/dumps/plsr/train_eval/all/models')

learners = Learners(tax_lookup, seeds=seeds)
eval_data = (X, y, depth_order[:, -1])
perfs_global_vertisols, _, _ = learners.evaluate(
    eval_data,
    order=order,
    src_dir_model=src_dir_model,
)
# Summary statistics of the metrics over the 20 seeds.
perfs_global_vertisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 1.524148 2.101732 0.547234 0.732481 0.390509 0.161691 0.238076 98.744299 -0.052767 -0.134787
std 0.175537 0.331334 0.101618 0.061666 0.098375 0.090253 0.035953 12.347859 0.015789 0.043975
min 1.233878 1.556100 0.333783 0.607804 0.279383 0.078055 0.183381 83.009898 -0.075401 -0.206095
25% 1.428539 1.912700 0.503573 0.695811 0.327822 0.107521 0.206193 89.681748 -0.061264 -0.162066
50% 1.478822 2.143325 0.536402 0.728507 0.361795 0.130898 0.231768 94.428492 -0.055714 -0.138102
75% 1.632835 2.287894 0.619066 0.781692 0.423892 0.179688 0.259825 106.099271 -0.046249 -0.110102
max 1.895009 2.787665 0.717768 0.831755 0.673910 0.454155 0.309477 127.327009 -0.015200 -0.042732

Train and test on Mollisols

# Train on Mollisols only (order id 1 in the lookup), one run per seed.
seeds, order = range(20), 1
dest_dir_model = Path('files/dumps/plsr/train_eval/mollisols/models')

train_data = (X, y, depth_order[:, -1])
train_kwargs = dict(
    order=order,
    n_cpts_range=range(30, 60, 2),  # candidate numbers of components: 30, 32, ..., 58
    delta=2e-3,
    dest_dir_model=dest_dir_model,
)

learners = Learners(tax_lookup, seeds=seeds)
learners.train(train_data, **train_kwargs)
--------------------------------------------------------------------------------
Seed: 0
--------------------------------------------------------------------------------
# of components chosen: 46
--------------------------------------------------------------------------------
Seed: 1
--------------------------------------------------------------------------------
# of components chosen: 52
--------------------------------------------------------------------------------
Seed: 2
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 3
--------------------------------------------------------------------------------
# of components chosen: 40
--------------------------------------------------------------------------------
Seed: 4
--------------------------------------------------------------------------------
# of components chosen: 46
--------------------------------------------------------------------------------
Seed: 5
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 6
--------------------------------------------------------------------------------
# of components chosen: 40
--------------------------------------------------------------------------------
Seed: 7
--------------------------------------------------------------------------------
# of components chosen: 40
--------------------------------------------------------------------------------
Seed: 8
--------------------------------------------------------------------------------
# of components chosen: 40
--------------------------------------------------------------------------------
Seed: 9
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 10
--------------------------------------------------------------------------------
# of components chosen: 54
--------------------------------------------------------------------------------
Seed: 11
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 12
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 13
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 14
--------------------------------------------------------------------------------
# of components chosen: 56
--------------------------------------------------------------------------------
Seed: 15
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 16
--------------------------------------------------------------------------------
# of components chosen: 34
--------------------------------------------------------------------------------
Seed: 17
--------------------------------------------------------------------------------
# of components chosen: 40
--------------------------------------------------------------------------------
Seed: 18
--------------------------------------------------------------------------------
# of components chosen: 50
--------------------------------------------------------------------------------
Seed: 19
--------------------------------------------------------------------------------
# of components chosen: 34
# Evaluate the "local" PLSR learners restricted to Mollisols, once per seed,
# reading the models from `src_dir_model`, then summarise the per-seed metrics.
src_dir_model = Path('files/dumps/plsr/train_eval/mollisols/models')
seeds = range(20)
# Resolve the taxonomic-order code by name rather than hard-coding it (was 1);
# the selection then stays correct if the lookup table is regenerated.
order = tax_lookup['mollisols']
learners = Learners(tax_lookup, seeds=seeds)
# The third element is the last column of `depth_order`
# (presumably the taxonomic-order code per sample -- TODO confirm).
perfs_local_mollisols, _, _ = learners.evaluate((X, y, depth_order[:, -1]),
                                                order=order,
                                                src_dir_model=src_dir_model)
# Last expression of the cell: displays count/mean/std/quantiles per metric.
perfs_local_mollisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 1.729961 2.284160 0.664719 0.803311 0.682758 0.501425 0.271352 102.920252 0.000623 0.001149
std 0.049480 0.085176 0.019570 0.011727 0.192671 0.281706 0.014817 3.289161 0.008043 0.018940
min 1.620689 2.157094 0.618909 0.776560 0.431012 0.185771 0.243688 95.089908 -0.016145 -0.039405
25% 1.704100 2.221866 0.655256 0.801153 0.535298 0.286687 0.261261 100.989098 -0.005376 -0.012943
50% 1.734100 2.282031 0.667110 0.803882 0.609356 0.371682 0.273628 103.076593 -0.001222 -0.002843
75% 1.759684 2.322080 0.676704 0.809385 0.793591 0.630660 0.275906 105.139386 0.007436 0.017265
max 1.827361 2.468705 0.700207 0.826625 1.028421 1.057650 0.303412 108.383273 0.013749 0.033221

Train and test on Gelisols

# Train the "local" PLSR learners restricted to Gelisols, once per seed,
# writing the models under `dest_dir_model`.
dest_dir_model = Path('files/dumps/plsr/train_eval/gelisols/models')
seeds = range(20)
# Resolve the taxonomic-order code by name rather than hard-coding it (was 12);
# the selection then stays correct if the lookup table is regenerated.
order = tax_lookup['gelisols']
learners = Learners(tax_lookup, seeds=seeds)
# NOTE(review): the log printed by this cell shows most seeds choosing 9
# components, i.e. the upper bound of `n_cpts_range` (2..9) -- the search
# range may be too narrow for this subset; confirm before widening.
learners.train((X, y, depth_order[:, -1]),
               order=order,
               n_cpts_range=range(2, 10, 1),
               delta=1e-2,
               dest_dir_model=dest_dir_model)
--------------------------------------------------------------------------------
Seed: 0
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 1
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 2
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 3
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 4
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 5
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 6
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 7
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 8
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 9
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 10
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 11
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 12
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 13
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 14
--------------------------------------------------------------------------------
# of components chosen: 8
--------------------------------------------------------------------------------
Seed: 15
--------------------------------------------------------------------------------
# of components chosen: 7
--------------------------------------------------------------------------------
Seed: 16
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 17
--------------------------------------------------------------------------------
# of components chosen: 7
--------------------------------------------------------------------------------
Seed: 18
--------------------------------------------------------------------------------
# of components chosen: 7
--------------------------------------------------------------------------------
Seed: 19
--------------------------------------------------------------------------------
# of components chosen: 9
# Evaluate the "local" PLSR learners restricted to Gelisols, once per seed,
# reading the models from `src_dir_model`, then summarise the per-seed metrics.
src_dir_model = Path('files/dumps/plsr/train_eval/gelisols/models')
seeds = range(20)
# Resolve the taxonomic-order code by name rather than hard-coding it (was 12);
# the selection then stays correct if the lookup table is regenerated.
order = tax_lookup['gelisols']
learners = Learners(tax_lookup, seeds=seeds)
# The third element is the last column of `depth_order`
# (presumably the taxonomic-order code per sample -- TODO confirm).
perfs_local_gelisols, _, _ = learners.evaluate((X, y, depth_order[:, -1]),
                                               order=order,
                                               src_dir_model=src_dir_model)
# Last expression of the cell: displays count/mean/std/quantiles per metric.
perfs_local_gelisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 2.077886 3.084523 0.746774 0.858587 0.646908 0.452289 0.319878 205.707385 -0.002118 -0.006263
std 0.308216 0.896429 0.076636 0.042218 0.188620 0.248623 0.069856 44.668977 0.040502 0.064437
min 1.547630 1.263899 0.570562 0.773000 0.348828 0.121681 0.208846 122.575289 -0.067213 -0.125266
25% 1.823879 2.564071 0.692319 0.827514 0.530327 0.281372 0.269832 172.185662 -0.025272 -0.038202
50% 2.053352 3.090948 0.756956 0.857239 0.614913 0.378319 0.308076 204.408200 -0.007451 -0.014036
75% 2.299863 3.404348 0.805802 0.899488 0.817420 0.669407 0.361622 227.222367 0.016758 0.020807
max 2.652731 5.047128 0.851434 0.914563 0.913832 0.835089 0.452944 302.505106 0.103336 0.170306

Train and test on Vertisols

# Train the "local" PLSR learners restricted to Vertisols, once per seed,
# writing the models under `dest_dir_model`.
dest_dir_model = Path('files/dumps/plsr/train_eval/vertisols/models')
seeds = range(20)
# Resolve the taxonomic-order code by name rather than hard-coding it (was 10);
# the selection then stays correct if the lookup table is regenerated.
order = tax_lookup['vertisols']
learners = Learners(tax_lookup, seeds=seeds)
# Candidate numbers of components searched: 2..29.
learners.train((X, y, depth_order[:, -1]),
               order=order,
               n_cpts_range=range(2, 30, 1),
               delta=1e-2,
               dest_dir_model=dest_dir_model)
--------------------------------------------------------------------------------
Seed: 0
--------------------------------------------------------------------------------
# of components chosen: 11
--------------------------------------------------------------------------------
Seed: 1
--------------------------------------------------------------------------------
# of components chosen: 11
--------------------------------------------------------------------------------
Seed: 2
--------------------------------------------------------------------------------
# of components chosen: 20
--------------------------------------------------------------------------------
Seed: 3
--------------------------------------------------------------------------------
# of components chosen: 18
--------------------------------------------------------------------------------
Seed: 4
--------------------------------------------------------------------------------
# of components chosen: 12
--------------------------------------------------------------------------------
Seed: 5
--------------------------------------------------------------------------------
# of components chosen: 10
--------------------------------------------------------------------------------
Seed: 6
--------------------------------------------------------------------------------
# of components chosen: 17
--------------------------------------------------------------------------------
Seed: 7
--------------------------------------------------------------------------------
# of components chosen: 8
--------------------------------------------------------------------------------
Seed: 8
--------------------------------------------------------------------------------
# of components chosen: 15
--------------------------------------------------------------------------------
Seed: 9
--------------------------------------------------------------------------------
# of components chosen: 11
--------------------------------------------------------------------------------
Seed: 10
--------------------------------------------------------------------------------
# of components chosen: 9
--------------------------------------------------------------------------------
Seed: 11
--------------------------------------------------------------------------------
# of components chosen: 12
--------------------------------------------------------------------------------
Seed: 12
--------------------------------------------------------------------------------
# of components chosen: 19
--------------------------------------------------------------------------------
Seed: 13
--------------------------------------------------------------------------------
# of components chosen: 11
--------------------------------------------------------------------------------
Seed: 14
--------------------------------------------------------------------------------
# of components chosen: 12
--------------------------------------------------------------------------------
Seed: 15
--------------------------------------------------------------------------------
# of components chosen: 15
--------------------------------------------------------------------------------
Seed: 16
--------------------------------------------------------------------------------
# of components chosen: 14
--------------------------------------------------------------------------------
Seed: 17
--------------------------------------------------------------------------------
# of components chosen: 12
--------------------------------------------------------------------------------
Seed: 18
--------------------------------------------------------------------------------
# of components chosen: 8
--------------------------------------------------------------------------------
Seed: 19
--------------------------------------------------------------------------------
# of components chosen: 11
# Evaluate the "local" PLSR learners restricted to Vertisols, once per seed,
# reading the models from `src_dir_model`, then summarise the per-seed metrics.
src_dir_model = Path('files/dumps/plsr/train_eval/vertisols/models')
seeds = range(20)
# Resolve the taxonomic-order code by name rather than hard-coding it (was 10);
# the selection then stays correct if the lookup table is regenerated.
order = tax_lookup['vertisols']
learners = Learners(tax_lookup, seeds=seeds)
# The third element is the last column of `depth_order`
# (presumably the taxonomic-order code per sample -- TODO confirm).
perfs_local_vertisols, _, _ = learners.evaluate((X, y, depth_order[:, -1]),
                                                order=order,
                                                src_dir_model=src_dir_model)
# Last expression of the cell: displays count/mean/std/quantiles per metric.
perfs_local_vertisols.describe()
rpd rpiq r2 lccc rmse mse mae mape bias stb
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 2.009008 2.772106 0.737918 0.855452 0.285306 0.084297 0.187213 93.891373 -0.004065 -0.011017
std 0.233000 0.461799 0.071342 0.035866 0.055230 0.033674 0.024124 11.393788 0.020619 0.053431
min 1.410521 1.902507 0.490198 0.741338 0.182039 0.033138 0.129494 76.538749 -0.041485 -0.121708
25% 1.894380 2.447912 0.717390 0.849941 0.247126 0.061071 0.173790 85.490789 -0.016195 -0.042687
50% 2.024185 2.805916 0.752705 0.860249 0.283139 0.080169 0.188396 91.373366 -0.002301 -0.006435
75% 2.081399 3.036756 0.766082 0.870251 0.299627 0.089804 0.204240 102.965234 0.007755 0.018465
max 2.549723 3.779666 0.843884 0.916982 0.418275 0.174954 0.223513 113.539148 0.036139 0.086563

Compile metrics for the “local vs. global” comparison (Fig. 6)

def format_metrics(*dfs, metrics=('r2', 'mape')):
    """Collect the mean and standard deviation of selected metrics.

    Args:
        *dfs: DataFrames holding one column per metric; each is summarised
            with `DataFrame.describe()`.
        metrics: Names of the columns to summarise. Defaults to
            `('r2', 'mape')`, the two metrics the original version reported.

    Returns:
        A dict of the form `{metric: {'mean': [...], 'std': [...]}}` where
        each list holds one value per DataFrame, in the order they were given.

    Raises:
        KeyError: If a requested metric is not a column of the summary.
    """
    perfs = {metric: {'mean': [], 'std': []} for metric in metrics}
    for df in dfs:
        # Summarise once per DataFrame instead of once per metric.
        summary = df.describe()
        for metric in metrics:
            perfs[metric]['mean'].append(summary.loc['mean', metric])
            perfs[metric]['std'].append(summary.loc['std', metric])
    return perfs
# Gather mean/std of the reported metrics for the globally- and locally-trained
# models, in the order Mollisols, Gelisols, Vertisols.
perfs = {
    'global': format_metrics(
        perfs_global_mollisols,
        perfs_global_gelisols,
        perfs_global_vertisols,
    ),
    'local': format_metrics(
        perfs_local_mollisols,
        perfs_local_gelisols,
        perfs_local_vertisols,
    ),
}
perfs
{'global': {'r2': {'mean': [0.5999520975577833,
    0.6263296407122689,
    0.5472338139745212],
   'std': [0.01765197454826005, 0.09625659382541236, 0.10161782452206956]},
  'mape': {'mean': [94.29825920533892, 193.83833562261376, 98.74429900717739],
   'std': [3.0524914231457974, 31.399886042493737, 12.347859402802634]}},
 'local': {'r2': {'mean': [0.6647188332987304,
    0.7467735410966299,
    0.737918199125363],
   'std': [0.01957008650036342, 0.07663555355310155, 0.07134199041747523]},
  'mape': {'mean': [102.92025183047231, 205.70738511998024, 93.89137312512031],
   'std': [3.289160840803382, 44.668977271871476, 11.393787965933955]}}}
# Persist the compiled metrics so they can be reloaded later without rerunning
# the experiments.
dest_dir = Path('files/dumps/plsr')
# Create the destination folder if needed; otherwise `open` raises
# FileNotFoundError on a fresh checkout.
dest_dir.mkdir(parents=True, exist_ok=True)
with open(dest_dir/'global_vs_local.pickle', 'wb') as f:
    pickle.dump(perfs, f)