if 'google.colab' in str(get_ipython()):
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
!pip install mirzai
else:2. Select & transform
Select a subset of the data based on the features, the target, and auxiliary data such as the Soil Taxonomy order. Simple transformations, such as a log-10 transform of the target, are also applied and piped together with the selection steps.
from mirzai.data.loading import load_kssl
from mirzai.data.selection import (select_y, select_tax_order, select_X)
from mirzai.data.transform import log_transform_y
from fastcore.transform import compose
import warnings
warnings.filterwarnings('ignore')

2.1 Piping data selection and simple transformation
# Location of the dataset files and the file names handed to `load_kssl`.
src_dir = 'data'
fnames = [
    'spectra-features.npy', 'spectra-wavenumbers.npy',
    'depth-order.npy', 'target.npy',
    'tax-order-lu.pkl', 'spectra-id.npy',
]

# `load_kssl` returns six objects, unpacked here in the order it yields them.
X, X_names, depth_order, y, tax_lookup, X_id = load_kssl(src_dir, fnames=fnames)

# Quick sanity check of what was loaded.
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'Wavenumbers:\n {X_names}')
print(f'depth_order (first 3 rows):\n {depth_order[:3, :]}')
print(f'Taxonomic order lookup:\n {tax_lookup}')

X shape: (50494, 1764)
y shape: (50494,)
Wavenumbers:
[3999 3997 3995 ... 603 601 599]
depth_order (first 3 rows):
[[43. 2.]
[ 0. 0.]
[ 0. 1.]]
Taxonomic order lookup:
{'alfisols': 0, 'mollisols': 1, 'inceptisols': 2, 'entisols': 3, 'spodosols': 4, 'undefined': 5, 'ultisols': 6, 'andisols': 7, 'histosols': 8, 'oxisols': 9, 'vertisols': 10, 'aridisols': 11, 'gelisols': 12}
# Bundle the arrays into the single tuple that every pipeline step receives
# and returns.
data = X, y, X_id, depth_order

# Selection steps followed by the log transform of the target; `compose`
# chains them into one callable (fastcore applies them in list order).
transforms = [select_y, select_tax_order, select_X, log_transform_y]
X, y, X_id, depth_order = compose(*transforms)(data)

# Shapes after selection.
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'depth_order shape: {depth_order.shape}')

X shape: (40132, 1764)
y shape: (40132,)
depth_order shape: (40132, 2)