OSSL datasets

Data loading for the OSSL dataset

The official OSSL documentation provides more details on the dataset and its variables.


source

OSSLData


def OSSLData(
    df:DataFrame, # dataframe containing OSSL data
):

OSSL (Open Soil Spectral Library) data container


source

OSSLData._parse_columns


def _parse_columns(
    
):

Parse columns into visnir, mir and properties


source

get_cache_path


def get_cache_path(
    dest_dir:str='.soilspecdata', # Name of the cache directory
)->Path: # Path to the cache directory (~/dest_dir)

Get cache path for OSSL data

For instance:

get_cache_path()
Path('/Users/franckalbinet/.soilspecdata')

The default gzipped file is downloaded from the following URL: https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz

# def get_ossl(
#     url='https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz', # OSSL data gzipped file URL
#     force_download=False # if True, force download
#     ):
#     "Load OSSL data from cache or download it"
#     cache_path = get_cache_path()/'ossl_v1.2.csv.gz'
#     if not cache_path.exists() or force_download:
#         cache_path.parent.mkdir(exist_ok=True)
#         urlretrieve(url, cache_path)
        
#      # Define date columns
#     date_columns = [
#         'scan.mir.date.begin_iso.8601_yyyy.mm.dd',
#         'scan.mir.date.end_iso.8601_yyyy.mm.dd',
#         'scan.visnir.date.begin_iso.8601_yyyy.mm.dd',
#         'scan.visnir.date.end_iso.8601_yyyy.mm.dd'
#     ]
    
#     # Update dtype dictionary without datetime columns
#     dtype = {
#         # IDs and codes
#         'id.layer_local_c': 'string',
#         'id.location_olc_txt': 'string',
#         'id.dataset.site_ascii_txt': 'string',
#         'id.scan_local_c': 'string',
        
#         # Categorical text fields
#         'layer.texture_usda_txt': 'category',
#         'pedon.taxa_usda_txt': 'category',
#         'horizon.designation_usda_txt': 'category',
#         'location.country_iso.3166_txt': 'category',
#         'surveyor.address_utf8_txt': 'category',
#         'efferv_usda.a479_class': 'category',
        
#         # Text fields
#         'scan.mir.model.name_utf8_txt': 'string',
#         'scan.mir.model.code_any_txt': 'string',
#         'scan.mir.method.optics_any_txt': 'string',
#         'scan.mir.method.preparation_any_txt': 'string',
#         'scan.mir.license.title_ascii_txt': 'string',
#         'scan.mir.license.address_idn_url': 'string',
#         'scan.mir.doi_idf_url': 'string',
#         'scan.mir.contact.name_utf8_txt': 'string',
#         'scan.mir.contact.email_ietf_txt': 'string',
#         'scan.visnir.model.name_utf8_txt': 'string',
#         'scan.visnir.model.code_any_txt': 'string',
#         'scan.visnir.method.optics_any_txt': 'string',
#         'scan.visnir.method.preparation_any_txt': 'string',
#         'scan.visnir.license.title_ascii_txt': 'string',
#         'scan.visnir.license.address_idn_url': 'string',
#         'scan.visnir.doi_idf_url': 'string',
#         'scan.visnir.contact.name_utf8_txt': 'string',
#         'scan.visnir.contact.email_ietf_txt': 'string'
#     }
#     df = pd.read_csv(cache_path, compression='gzip', dtype=dtype,
#                      parse_dates=date_columns)
#     return OSSLData(df)

source

get_ossl


def get_ossl(
    url:str='https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz', # OSSL data gzipped file URL
    force_download:bool=False, # if True, force download
):

Load OSSL data from cache or download it

How to use it:

ossl = get_ossl(force_download=False)
ossl.visnir_cols[:2], ossl.mir_cols[:2], ossl.properties_cols[:2]
(['scan_visnir.350_ref', 'scan_visnir.352_ref'],
 ['scan_mir.600_abs', 'scan_mir.602_abs'],
 ['dataset.code_ascii_txt', 'id.layer_local_c'])
ossl.properties_cols
['dataset.code_ascii_txt',
 'id.layer_local_c',
 'id.layer_uuid_txt',
 'id.project_ascii_txt',
 'id.location_olc_txt',
 'id.dataset.site_ascii_txt',
 'id.scan_local_c',
 'longitude.point_wgs84_dd',
 'latitude.point_wgs84_dd',
 'layer.sequence_usda_uint16',
 'layer.upper.depth_usda_cm',
 'layer.lower.depth_usda_cm',
 'observation.date.begin_iso.8601_yyyy.mm.dd',
 'observation.date.end_iso.8601_yyyy.mm.dd',
 'surveyor.title_utf8_txt',
 'layer.texture_usda_txt',
 'pedon.taxa_usda_txt',
 'horizon.designation_usda_txt',
 'longitude.county_wgs84_dd',
 'latitude.county_wgs84_dd',
 'location.point.error_any_m',
 'location.country_iso.3166_txt',
 'observation.ogc.schema.title_ogc_txt',
 'observation.ogc.schema_idn_url',
 'surveyor.contact_ietf_email',
 'surveyor.address_utf8_txt',
 'dataset.title_utf8_txt',
 'dataset.owner_utf8_txt',
 'dataset.address_idn_url',
 'dataset.doi_idf_url',
 'dataset.license.title_ascii_txt',
 'dataset.license.address_idn_url',
 'dataset.contact.name_utf8_txt',
 'dataset.contact_ietf_email',
 'acidity_usda.a795_cmolc.kg',
 'aggstb_usda.a1_w.pct',
 'al.dith_usda.a65_w.pct',
 'al.ext_aquaregia_g.kg',
 'al.ext_usda.a1056_mg.kg',
 'al.ext_usda.a69_cmolc.kg',
 'al.ox_usda.a59_w.pct',
 'awc.33.1500kPa_usda.c80_w.frac',
 'b.ext_mel3_mg.kg',
 'bd_iso.11272_g.cm3',
 'bd_usda.a21_g.cm3',
 'bd_usda.a4_g.cm3',
 'c.tot_iso.10694_w.pct',
 'c.tot_usda.a622_w.pct',
 'ca.ext_aquaregia_mg.kg',
 'ca.ext_usda.a1059_mg.kg',
 'ca.ext_usda.a722_cmolc.kg',
 'caco3_iso.10693_w.pct',
 'caco3_usda.a54_w.pct',
 'cec_iso.11260_cmolc.kg',
 'cec_usda.a723_cmolc.kg',
 'cf_iso.11464_w.pct',
 'cf_usda.c236_w.pct',
 'clay.tot_iso.11277_w.pct',
 'clay.tot_usda.a334_w.pct',
 'cu.ext_usda.a1063_mg.kg',
 'ec_iso.11265_ds.m',
 'ec_usda.a364_ds.m',
 'efferv_usda.a479_class',
 'fe.dith_usda.a66_w.pct',
 'fe.ext_aquaregia_g.kg',
 'fe.ext_usda.a1064_mg.kg',
 'fe.ox_usda.a60_w.pct',
 'file_sequence',
 'k.ext_aquaregia_mg.kg',
 'k.ext_usda.a1065_mg.kg',
 'k.ext_usda.a725_cmolc.kg',
 'mg.ext_aquaregia_mg.kg',
 'mg.ext_usda.a1066_mg.kg',
 'mg.ext_usda.a724_cmolc.kg',
 'mn.ext_aquaregia_mg.kg',
 'mn.ext_usda.a1067_mg.kg',
 'mn.ext_usda.a70_mg.kg',
 'n.tot_iso.11261_w.pct',
 'n.tot_iso.13878_w.pct',
 'n.tot_usda.a623_w.pct',
 'na.ext_aquaregia_mg.kg',
 'na.ext_usda.a1068_mg.kg',
 'na.ext_usda.a726_cmolc.kg',
 'oc_iso.10694_w.pct',
 'oc_usda.c1059_w.pct',
 'oc_usda.c729_w.pct',
 'p.ext_aquaregia_mg.kg',
 'p.ext_iso.11263_mg.kg',
 'p.ext_usda.a1070_mg.kg',
 'p.ext_usda.a270_mg.kg',
 'p.ext_usda.a274_mg.kg',
 'p.ext_usda.a652_mg.kg',
 'ph.cacl2_iso.10390_index',
 'ph.cacl2_usda.a477_index',
 'ph.cacl2_usda.a481_index',
 'ph.h2o_iso.10390_index',
 'ph.h2o_usda.a268_index',
 's.ext_mel3_mg.kg',
 's.tot_usda.a624_w.pct',
 'sand.tot_iso.11277_w.pct',
 'sand.tot_usda.c405_w.pct',
 'sand.tot_usda.c60_w.pct',
 'silt.tot_iso.11277_w.pct',
 'silt.tot_usda.c407_w.pct',
 'silt.tot_usda.c62_w.pct',
 'wr.10kPa_usda.a414_w.pct',
 'wr.10kPa_usda.a8_w.pct',
 'wr.1500kPa_usda.a417_w.pct',
 'wr.33kPa_usda.a415_w.pct',
 'wr.33kPa_usda.a9_w.pct',
 'zn.ext_usda.a1073_mg.kg',
 'scan.mir.date.begin_iso.8601_yyyy.mm.dd',
 'scan.mir.date.end_iso.8601_yyyy.mm.dd',
 'scan.mir.model.name_utf8_txt',
 'scan.mir.model.code_any_txt',
 'scan.mir.method.optics_any_txt',
 'scan.mir.method.preparation_any_txt',
 'scan.mir.license.title_ascii_txt',
 'scan.mir.license.address_idn_url',
 'scan.mir.doi_idf_url',
 'scan.mir.contact.name_utf8_txt',
 'scan.mir.contact.email_ietf_txt',
 'scan.visnir.date.begin_iso.8601_yyyy.mm.dd',
 'scan.visnir.date.end_iso.8601_yyyy.mm.dd',
 'scan.visnir.model.name_utf8_txt',
 'scan.visnir.model.code_any_txt',
 'scan.visnir.method.optics_any_txt',
 'scan.visnir.method.preparation_any_txt',
 'scan.visnir.license.title_ascii_txt',
 'scan.visnir.license.address_idn_url',
 'scan.visnir.doi_idf_url',
 'scan.visnir.contact.name_utf8_txt',
 'scan.visnir.contact.email_ietf_txt']

source

OSSLData._get_valid_spectra_mask


def _get_valid_spectra_mask(
    spectra_cols:List, # Spectra column names
)->ndarray: # Mask

Return mask for samples with all non-null values in spectra

The OSSL gzip archive is formatted in a wide format (with metadata, soil properties, visnir and mir spectra as columns). Note that not all samples have been scanned with both VisNIR and MIR instruments, depending on the data source/provider.

As a result, when selecting a subset of columns, e.g. ossl.mir_cols, the returned dataframe will have a lot of missing values (NaN). The above function returns a mask for samples with all non-null values in spectra.

ossl.df[ossl.mir_cols]
scan_mir.600_abs scan_mir.602_abs scan_mir.604_abs scan_mir.606_abs scan_mir.608_abs scan_mir.610_abs scan_mir.612_abs scan_mir.614_abs scan_mir.616_abs scan_mir.618_abs ... scan_mir.3982_abs scan_mir.3984_abs scan_mir.3986_abs scan_mir.3988_abs scan_mir.3990_abs scan_mir.3992_abs scan_mir.3994_abs scan_mir.3996_abs scan_mir.3998_abs scan_mir.4000_abs
0 1.527853 1.531908 1.532084 1.530892 1.530645 1.531506 1.531582 1.531413 1.532904 1.535459 ... 0.356776 0.356642 0.355784 0.354743 0.354104 0.353663 0.353237 0.352923 0.352548 0.352053
1 1.538449 1.543622 1.545751 1.546997 1.549450 1.553714 1.557981 1.561652 1.566082 1.571555 ... 0.358399 0.358142 0.357144 0.355980 0.355242 0.354722 0.354217 0.353825 0.353376 0.352798
2 1.619721 1.614226 1.615612 1.620649 1.626406 1.631747 1.636411 1.639527 1.642449 1.646890 ... 0.372522 0.372338 0.371425 0.370337 0.369679 0.369245 0.368808 0.368469 0.368084 0.367563
3 1.570129 1.567954 1.573055 1.580834 1.586880 1.590397 1.595117 1.600492 1.603847 1.606447 ... 0.357992 0.357734 0.356713 0.355480 0.354681 0.354137 0.353619 0.353217 0.352756 0.352158
4 1.484832 1.484367 1.484977 1.486258 1.488400 1.492040 1.495075 1.496595 1.498354 1.501437 ... 0.316249 0.316089 0.315098 0.313910 0.313210 0.312758 0.312312 0.311971 0.311568 0.311044
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
135646 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
135647 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
135648 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
135649 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
135650 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

135651 rows × 1701 columns

mask = ossl.df[ossl.mir_cols].notna().all(axis=1)
print(mask.sum(), 'samples with all non-null values in mir spectra out of the total', len(mask))
85684 samples with all non-null values in mir spectra out of the total 135651

source

OSSLData._extract_wavenumbers


def _extract_wavenumbers(
    cols:List, # column names
):

Extract wavenumbers from spectral column names

For instance, to retrieve the wavenumbers from the MIR columns:

ossl._extract_wavenumbers(ossl.mir_cols)
array([ 600,  602,  604, ..., 3996, 3998, 4000], shape=(1701,))

source

OSSLData._extract_measurement_type


def _extract_measurement_type(
    cols:List, # Spectral column names
)->str: # `abs` (Absorbance) or `ref` (Reflectance)

Extract measurement type from column names

For instance, to retrieve the measurement type from the MIR or VISNIR columns:

ossl._extract_measurement_type(ossl.visnir_cols), ossl._extract_measurement_type(ossl.mir_cols)
('ref', 'abs')

source

OSSLData._filter_wavelength_range


def _filter_wavelength_range(
    wavenumbers:ndarray, # Wavenumbers
    spectra:ndarray, # Spectra
    cols:List, # Column names
    wmin:Optional=None, # Min wavenumber
    wmax:Optional=None, # Max wavenumber
)->Tuple: # Filtered wavenumbers, spectra, columns

Filter spectra based on wavenumber range

wavenumbers, spectra, cols = ossl._filter_wavelength_range(
    wavenumbers=ossl._extract_wavenumbers(ossl.visnir_cols), 
    spectra=ossl.df[ossl.visnir_cols].values, 
    cols=ossl.visnir_cols, 
    wmin=4000, wmax=25000
)

print(f'Original wavenumbers: {ossl._extract_wavenumbers(ossl.visnir_cols).min()} - {ossl._extract_wavenumbers(ossl.visnir_cols).max()}')
print(f'Filtered wavenumbers: {wavenumbers.min()} - {wavenumbers.max()}')
print(f'Spectra shape: {spectra.shape}')
print(f'Filtered columns. From: {cols[0]} to: {cols[-1]}')
Original wavenumbers: 4000 - 28571
Filtered wavenumbers: 4000 - 25000
Spectra shape: (135651, 1051)
Filtered columns. From: scan_visnir.400_ref to: scan_visnir.2500_ref

IMPORTANT: Note that by default, both VISNIR and MIR spectra are converted to wavenumbers.


source

OSSLData.get_visnir


def get_visnir(
    wmin:Optional=4000, # Min wavenumber
    wmax:Optional=25000, # Max wavenumber
)->SpectraData: # VISNIR data

Get VISNIR spectra within specified wavenumber range

For instance, to retrieve the VISNIR spectra between 8000 and 25000 wavenumbers:

visnir_data = ossl.get_visnir(wmin=8000, wmax=25000)
visnir_data.spectra.shape
(64644, 426)

source

OSSLData.get_mir


def get_mir(
    wmin:Optional=600, # Min wavenumber
    wmax:Optional=4000, # Max wavenumber
)->SpectraData: # MIR data

Get MIR spectra within specified wavenumber range

For instance, to retrieve the MIR spectra between 600 and 4000 wavenumbers (default range):

mir_data = ossl.get_mir()
mir_data.spectra.shape, mir_data.wavenumbers.min(), mir_data.wavenumbers.max()
((85684, 1701), np.int64(600), np.int64(4000))

source

OSSLData.get_properties


def get_properties(
    properties:NoneType=None, # Properties
    require_complete:bool=False, # if True, only return samples with no null values
)->DataFrame: # Selected properties data

Get properties data with sample IDs

Get only complete MIR spectra:

mir_data = ossl.get_mir()

Get properties needed as ML targets (must be complete):

targets = ossl.get_properties(['cec_usda.a723_cmolc.kg'], require_complete=True)
targets.shape, targets.head()
((57064, 1),
         cec_usda.a723_cmolc.kg
 id                            
 S40857                6.633217
 S40858                3.822628
 S40859                3.427324
 S40860                1.906545
 S40861               13.403203)

Get optional metadata (can have NaN values):

metadata = ossl.get_properties(['longitude.point_wgs84_dd', 'latitude.point_wgs84_dd'], require_complete=False)
metadata.shape, metadata.head()
((135651, 2),
            longitude.point_wgs84_dd  latitude.point_wgs84_dd
 id                                                          
 icr072246                 15.687492                -7.377750
 icr072247                 15.687492                -7.377750
 icr072266                 15.687817                -7.351243
 icr072267                 15.687817                -7.351243
 icr072286                 15.687965                -7.331673)

source

OSSLData.get_aligned_data


def get_aligned_data(
    spectra_data:SpectraData, # Spectra data
    target_cols:Union, # Target columns
)->Tuple: # Aligned spectra, targets, sample IDs

Get aligned spectra and target data for ML, along with their sample IDs

For instance, to retrieve the MIR spectra and the corresponding CEC values in a form amenable to a Machine/Deep Learning pipeline:

X, y, ids = ossl.get_aligned_data(
    spectra_data=mir_data,
    target_cols='cec_usda.a723_cmolc.kg'
)

X.shape, y.shape, ids.shape
((57064, 1701), (57064, 1), (57064,))

Later, if you need metadata for these samples:

metadata = ossl.get_properties(['longitude.point_wgs84_dd', 'latitude.point_wgs84_dd']).loc[ids]
metadata.head()
longitude.point_wgs84_dd latitude.point_wgs84_dd
id
S40857 174.42 -36.78
S40858 174.42 -36.78
S40859 174.42 -36.78
S40860 174.42 -36.78
S40861 174.55 -36.67