This notebook contains a data pipeline (handler) that converts the master MARIS database dump into NetCDF format. It enables batch encoding of all legacy datasets into NetCDF.

Key functions of this handler:

The result is a set of NetCDF files, one for each unique reference ID in the input data.

 DataLoader (fname:str)

Load specific MARIS dataset through its ref_id.

Type Details
fname str Path to the MARIS global dump file
Exported source
class DataLoader:
    "Load specific MARIS dataset through its ref_id."
    # Maps `samptype` labels of the dump to the keys of the dict returned by `__call__`
    LUT = {
        'Sediment': 'sediment', 'Seawater': 'seawater',
        'Suspended matter': 'suspended-matter', 'Biota': 'biota'}

    def __init__(self, 
                 fname: str # Path to the MARIS global dump file
                 ):
        self.fname = fname
        self.df = None  # Lazy loading: the dump is only read on first call

    def _load_data(self):
        "Read the tab-separated dump into `self.df` once; later calls are no-ops."
        if self.df is None:
            self.df = pd.read_csv(self.fname, sep='\t', encoding='ISO-8859-1')

    def __call__(self, 
                 ref_id: int # Reference ID of interest
                 ) -> dict: # Dictionary of dataframes
        # Trigger the lazy load; without it `self.df` is still None here
        self._load_data()
        filtered_df = self.df[self.df.ref_id == ref_id]
        # One dataframe per sample type; sample types absent from LUT are ignored
        return {
            self.LUT[name]: grp
            for name, grp in filtered_df.groupby('samptype')
            if name in self.LUT
        }



 get_zotero_key (dfs)

Retrieve Zotero key from MARIS dump.

Exported source
def get_zotero_key(dfs):
    "Retrieve Zotero key from MARIS dump."
    # The first dataframe of the dict is used as the source of the URL
    first_df = next(iter(dfs.values()))
    url = first_df['zoterourl'].iloc[0]
    # The key is the last '/'-separated segment of the URL
    segments = url.split('/')
    return segments[-1]



 get_fname (dfs)

Retrieve filename from MARIS dump.

Exported source
def get_fname(dfs):
    "Retrieve filename from MARIS dump."
    # Reference id and display text are read from the first row of the first dataframe
    first_df = next(iter(dfs.values()))
    ref_id, display_text = first_df[['ref_id', 'displaytext']].iloc[0]
    # Strip ',' and '.', turn '-' into a space, then split on single spaces
    cleaned = display_text.translate(str.maketrans({',': None, '.': None, '-': ' '}))
    parts = [str(ref_id), *cleaned.split(' ')]
    return '-'.join(parts) + '.nc'

Load data

Let’s take a quick look at the input MARIS dump:

# Read the whole tab-separated dump (ISO-8859-1 encoded) into a single dataframe
df = pd.read_csv(fname_in, sep='\t', encoding='ISO-8859-1')

# Report the number of distinct references and the available columns
print('# of unique refs: ', len(df.ref_id.unique()))
print('columns: ', df.columns)
# of unique refs:  526
columns:  Index(['ref_id', 'displaytext', 'samptype', 'nuclide_id', 'latitude',
       'longitude', 'begperiod', 'endperiod', 'sampdepth', 'totdepth',
       'uncertaint', 'unit_id', 'detection', 'area_id', 'species_id',
       'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume', 'salinity',
       'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id', 'activity',
ref_id displaytext samptype nuclide_id latitude longitude begperiod endperiod sampdepth totdepth ... bodypar_id sedtype_id volume salinity temperatur sampmet_id prepmet_id counmet_id activity zoterourl
0 182 Urban et al., 2015 Biota 33 -35.140833 117.604444 2014-05-06 00:00:00 NaN -1.0 NaN ... 52 0 NaN NaN NaN 0 6 20 0.387
1 182 Urban et al., 2015 Biota 47 -35.140833 117.604444 2014-05-06 00:00:00 NaN -1.0 NaN ... 52 0 NaN NaN NaN 0 6 5 1.44
2 182 Urban et al., 2015 Biota 31 -16.466944 123.535833 2014-02-27 00:00:00 NaN -1.0 NaN ... 52 0 NaN NaN NaN 0 6 20 0.042
3 182 Urban et al., 2015 Biota 33 -16.466944 123.535833 2014-02-27 00:00:00 NaN -1.0 NaN ... 52 0 NaN NaN NaN 0 6 20 0.075
4 182 Urban et al., 2015 Biota 47 -16.466944 123.535833 2014-02-27 00:00:00 NaN -1.0 NaN ... 52 0 NaN NaN NaN 0 6 5 0.069

5 rows × 26 columns

Let’s check that we retrieve the expected keys (sample types) and their associated dataframes:

# Build the loader from the path of the MARIS dump file
dataloader = DataLoader(fname_in)
ref_id = 100 # Some other ref_id examples: OSPAR: 191, HELCOM: 100, 717 (only seawater)

# Calling the loader with a reference id returns a dict of dataframes
dfs = dataloader(ref_id=ref_id)
print(f'keys: {dfs.keys()}')
keys: dict_keys(['biota', 'seawater', 'sediment'])
ref_id displaytext samptype nuclide_id latitude longitude begperiod endperiod sampdepth totdepth ... bodypar_id sedtype_id volume salinity temperatur sampmet_id prepmet_id counmet_id activity zoterourl
549778 100 HELCOM MORS, 2018 Sediment 17 54.838333 9.9 1989-06-14 00:00:00 NaN -1.0 24.0 ... 0 59 NaN NaN NaN 0 0 0 26.6
549779 100 HELCOM MORS, 2018 Sediment 24 54.838333 9.9 1989-06-14 00:00:00 NaN -1.0 24.0 ... 0 59 NaN NaN NaN 0 0 0 134.0
549780 100 HELCOM MORS, 2018 Sediment 24 54.838333 9.9 1989-06-14 00:00:00 NaN -1.0 24.0 ... 0 59 NaN NaN NaN 0 0 0 18.6
549781 100 HELCOM MORS, 2018 Sediment 31 54.838333 9.9 1989-06-14 00:00:00 NaN -1.0 24.0 ... 0 59 NaN NaN NaN 0 0 0 42.5
549782 100 HELCOM MORS, 2018 Sediment 31 54.838333 9.9 1989-06-14 00:00:00 NaN -1.0 24.0 ... 0 59 NaN NaN NaN 0 0 0 5.9

5 rows × 26 columns

Data transformation pipeline

Normalize nuclide names

Remap nuclide_id to MARIS radionuclide standard names:



 RemapRdnNameCB (fn_lut=<function <lambda>>)

Remap to MARIS radionuclide names.

Exported source
nuclide_id_to_name = lambda: get_lut(lut_path(), 'dbo_nuclide.xlsx', 
                                     key='nc_name', value='nuclide_id',
Exported source
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    def __init__(self, fn_lut=nuclide_id_to_name): fc.store_attr()

    def __call__(self, tfm):
        # The lookup table is built once, then applied to every dataframe
        mapping = self.fn_lut()
        for df in tfm.dfs.values():
            df['nuclide_id'] = df['nuclide_id'].replace(mapping)
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

['ru106' 'sb125' 'cs134' 'cs137' 'k40' 'co60' 'ag110m' 'ra226' 'th232'
 'pb212' 'pb214' 'pu238' 'am241' 'pu239_240_tot' 'zr95' 'mn54' 'ac228'
 'u235' 'tl208' 'be7' 'bi214' 'ra223' 'ru103' 'sr90' 'eu155' 'ba140'
 'co58' 'ra224' 'po210' 'ra228' 'th228' 'ce144' 'cs134_137_tot' 'pb210'
 'pu239' 'cd109' 'bi212' 'pu238_240_tot' 'nb95' 'ir192' 'sb124' 'zn65'
 'th234' 'pu241']

Rename columns

Rename MARIS dump columns to MARIS netCDF standard names:

Index(['ref_id', 'displaytext', 'samptype', 'nuclide_id', 'latitude',
       'longitude', 'begperiod', 'endperiod', 'sampdepth', 'totdepth',
       'uncertaint', 'unit_id', 'detection', 'area_id', 'species_id',
       'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume', 'salinity',
       'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id', 'activity',



 renaming_rules ()

Rename MARIS dump columns to MARIS netCDF standard names.

Exported source
def renaming_rules():
    "Rename MARIS dump columns to MARIS netCDF standard names."
    # Variable definitions come from the CDL configuration; each target name is
    # read from the 'name' entry of the corresponding variable.
    cdl_vars = cdl_cfg()['vars']
    defaults, suffixes = cdl_vars['defaults'], cdl_vars['suffixes']
    bio, sed = cdl_vars['bio'], cdl_vars['sed']
    return {
        'latitude': defaults['lat']['name'],
        'longitude': defaults['lon']['name'],
        'begperiod': defaults['time']['name'],
        'sampdepth': defaults['smp_depth']['name'],
        'totdepth': defaults['tot_depth']['name'],
        'uncertaint': suffixes['uncertainty']['name'],
        'unit_id': suffixes['unit']['name'],
        'detection': suffixes['detection_limit']['name'],
        'area_id': defaults['area']['name'],
        'species_id': bio['species']['name'],
        'biogroup_id': bio['bio_group']['name'],
        'bodypar_id': bio['body_part']['name'],
        'sedtype_id': sed['sed_type']['name'],
        'volume': suffixes['volume']['name'],
        'salinity': suffixes['salinity']['name'],
        'temperatur': suffixes['temperature']['name'],
        'sampmet_id': suffixes['sampling_method']['name'],
        'prepmet_id': suffixes['preparation_method']['name'],
        'counmet_id': suffixes['counting_method']['name'],
        # These two target names are hard-coded rather than read from the configuration
        'activity': 'value',
        'nuclide_id': 'nuclide',
    }



 RenameColumnCB (renaming_rules=<function renaming_rules>)

Renaming variables to MARIS standard names.

Exported source
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    def __init__(self, renaming_rules=renaming_rules): fc.store_attr()

    def __call__(self, tfm):
        # Use the rules stored on the instance by `fc.store_attr()` so that a
        # custom `renaming_rules` passed to the constructor is honoured
        # (calling the module-level function here would silently ignore it).
        lut = self.renaming_rules()
        coi = list(lut)  # columns of interest, in the order given by the rules
        for k in tfm.dfs:
            # Keep only the mapped columns, then rename them
            tfm.dfs[k] = tfm.dfs[k].loc[:, coi].rename(columns=lut)
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

               lat        lon                 time smp_depth tot_depth  \
549778   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549779   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549780   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549781   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549782   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
...            ...        ...                  ...       ...       ...   
1532415  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532416  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532417  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532418  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532419  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   

             _unc _unit _dl  area species  ... body_part sed_type _vol _sal  \
549778       3.99     4   =  2374       0  ...         0       59  NaN  NaN   
549779        NaN     2   =  2374       0  ...         0       59  NaN  NaN   
549780      1.674     4   =  2374       0  ...         0       59  NaN  NaN   
549781        NaN     2   =  2374       0  ...         0       59  NaN  NaN   
549782      1.829     4   =  2374       0  ...         0       59  NaN  NaN   
...           ...   ...  ..   ...     ...  ...       ...      ...  ...  ...   
1532415   86.2836     4   =  2409       0  ...         0       58  NaN  NaN   
1532416       NaN     2   =  2409       0  ...         0       58  NaN  NaN   
1532417  24.45552     4   =  2409       0  ...         0       58  NaN  NaN   
1532418       NaN     2   =  2409       0  ...         0       58  NaN  NaN   
1532419  123.2568     4   =  2409       0  ...         0       58  NaN  NaN   

        _temp _sampmet _prepmet _counmet    value nuclide  
549778    NaN        0        0        0     26.6   ru106  
549779    NaN        0        0        0    134.0   sb125  
549780    NaN        0        0        0     18.6   sb125  
549781    NaN        0        0        0     42.5   cs134  
549782    NaN        0        0        0      5.9   cs134  
...       ...      ...      ...      ...      ...     ...  
1532415   NaN        0        0        0   1106.2     k40  
1532416   NaN        0        0        0  991.023   cs137  
1532417   NaN        0        0        0    550.8   cs137  
1532418   NaN        0        0        0  2461.36     k40  
1532419   NaN        0        0        0   1368.0     k40  

[123196 rows x 21 columns]

Drop NaN only columns



 DropNAColumnsCB (na_value=0)

Drop variable containing only NaN or ‘Not available’ (id=0 in MARIS lookup tables).

Exported source
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    def __init__(self, na_value=0): fc.store_attr()

    def isMarisNA(self, col):
        # A column qualifies when it holds one distinct value and that value is `na_value`
        if len(col.unique()) != 1:
            return False
        return col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        # Collect the names of all columns flagged by `isMarisNA`, then drop them together
        to_drop = []
        for name in df.columns:
            if self.isMarisNA(df[name]):
                to_drop.append(name)
        return df.drop(labels=to_drop, axis=1)

    def __call__(self, tfm):
        for key, df in tfm.dfs.items():
            # First remove all-NaN columns, then the columns holding only `na_value`
            without_nan = df.dropna(axis=1, how='all')
            tfm.dfs[key] = self.dropMarisNA(without_nan)
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

               lat        lon                 time smp_depth tot_depth  \
549778   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549779   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549780   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549781   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549782   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
...            ...        ...                  ...       ...       ...   
1532415  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532416  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532417  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532418  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532419  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   

             _unc _unit _dl  area sed_type _sampmet _prepmet    value nuclide  
549778       3.99     4   =  2374       59        0        0     26.6   ru106  
549779        NaN     2   =  2374       59        0        0    134.0   sb125  
549780      1.674     4   =  2374       59        0        0     18.6   sb125  
549781        NaN     2   =  2374       59        0        0     42.5   cs134  
549782      1.829     4   =  2374       59        0        0      5.9   cs134  
...           ...   ...  ..   ...      ...      ...      ...      ...     ...  
1532415   86.2836     4   =  2409       58        0        0   1106.2     k40  
1532416       NaN     2   =  2409       58        0        0  991.023   cs137  
1532417  24.45552     4   =  2409       58        0        0    550.8   cs137  
1532418       NaN     2   =  2409       58        0        0  2461.36     k40  
1532419  123.2568     4   =  2409       58        0        0   1368.0     k40  

[123196 rows x 14 columns]

Remap detection limit values

Exported source
def dl_name_to_id():
    "Return the detection limit lookup table read from 'dbo_detectlimit.xlsx' (key: 'name', value: 'id')."
    return get_lut(lut_path(), 'dbo_detectlimit.xlsx', key='name', value='id')
{'Not applicable': -1, 'Not Available': 0, '=': 1, '<': 2, 'ND': 3, 'DE': 4}



 SanitizeDetectionLimitCB (fn_lut=<function <lambda>>)

Assign Detection Limit name to its id based on MARIS nomenclature.

Exported source
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    def __init__(self,
                 fn_lut=dl_name_to_id # Function returning the detection limit name -> id lookup table
                 ):
        # Stores `fn_lut` on the instance; `__call__` reads it as `self.fn_lut`
        fc.store_attr()
        # Name of the detection limit column, taken from the CDL configuration
        self.var_name = cdl_cfg()['vars']['suffixes']['detection_limit']['name']

    def __call__(self, tfm):
        # The lookup table is built once, then applied to every dataframe
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k][self.var_name] = tfm.dfs[k][self.var_name].replace(lut)
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

549778     1
549779     1
549780     1
549781     1
549782     1
1532415    1
1532416    1
1532417    1
1532418    1
1532419    1
Name: _dl, Length: 123196, dtype: int64

Parse and encode time

As a reminder, in the netCDF format time needs to be encoded as an integer representing the number of seconds elapsed since a reference time. In our case, we chose 1970-01-01 00:00:00.0, as defined in configs.ipynb.



 ParseTimeCB ()

Parse time column from MARIS dump.

Exported source
class ParseTimeCB(Callback):
    "Parse time column from MARIS dump."
    def __call__(self, tfm):
        # Convert the 'time' column of every dataframe to datetimes (ISO 8601 input)
        for df in tfm.dfs.values():
            df['time'] = pd.to_datetime(df['time'], format='ISO8601')
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

               lat        lon        time smp_depth tot_depth      _unc _unit  \
549778   54.838333        9.9   613785600      -1.0      24.0      3.99     4   
549779   54.838333        9.9   613785600      -1.0      24.0       NaN     2   
549780   54.838333        9.9   613785600      -1.0      24.0     1.674     4   
549781   54.838333        9.9   613785600      -1.0      24.0       NaN     2   
549782   54.838333        9.9   613785600      -1.0      24.0     1.829     4   
...            ...        ...         ...       ...       ...       ...   ...   
1532415  57.619722  23.621389  1133481600      -1.0      55.0   86.2836     4   
1532416  57.619722  23.621389  1133481600      -1.0      55.0       NaN     2   
1532417  57.619722  23.621389  1133481600      -1.0      55.0  24.45552     4   
1532418  57.619722  23.621389  1133481600      -1.0      55.0       NaN     2   
1532419  57.619722  23.621389  1133481600      -1.0      55.0  123.2568     4   

         _dl  area sed_type _sampmet _prepmet    value nuclide  
549778     1  2374       59        0        0     26.6   ru106  
549779     1  2374       59        0        0    134.0   sb125  
549780     1  2374       59        0        0     18.6   sb125  
549781     1  2374       59        0        0     42.5   cs134  
549782     1  2374       59        0        0      5.9   cs134  
...      ...   ...      ...      ...      ...      ...     ...  
1532415    1  2409       58        0        0   1106.2     k40  
1532416    1  2409       58        0        0  991.023   cs137  
1532417    1  2409       58        0        0    550.8   cs137  
1532418    1  2409       58        0        0  2461.36     k40  
1532419    1  2409       58        0        0   1368.0     k40  

[123196 rows x 14 columns]

Reshape: long to wide

dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

                 lon        time  area        lat  smp_depth  tot_depth  \
549834      9.633333   544838400  2374  54.850000       -1.0       16.0   
549835      9.633333   544838400  2374  54.850000       -1.0       16.0   
549836      9.633333   544838400  2374  54.850000       -1.0       16.0   
549837      9.633333   544838400  2374  54.850000       -1.0       16.0   
549838      9.633333   544838400  2374  54.850000       -1.0       16.0   
...              ...         ...   ...        ...        ...        ...   
1518808    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1518809    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1518810    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1528756    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1528757    29.833333  1128211200  2407  59.983333       -1.0        0.0   

           sed_type  ac228_dl  ag110m_dl  am241_dl  ...  sb124  sb125  sr90  \
org_index                                           ...                       
549834           58       NaN        NaN       NaN  ...    NaN    NaN   NaN   
549835           58       NaN        NaN       NaN  ...    NaN    NaN   NaN   
549836           58       NaN        NaN       NaN  ...    NaN    NaN   NaN   
549837           58       NaN        NaN       NaN  ...    NaN    NaN   NaN   
549838           58       NaN        NaN       NaN  ...    NaN    NaN   NaN   
...             ...       ...        ...       ...  ...    ...    ...   ...   
1518808           2       NaN        NaN       NaN  ...    NaN    NaN   NaN   
1518809           2       NaN        NaN       NaN  ...    NaN    NaN   NaN   
1518810           2       NaN        NaN       NaN  ...    NaN    NaN   NaN   
1528756           2       NaN        NaN       NaN  ...    NaN    NaN   NaN   
1528757           2       NaN        NaN       NaN  ...    NaN    NaN   NaN   

           th228  th232  th234  tl208  u235  zn65  zr95  
549834       NaN    NaN    NaN    NaN   NaN   NaN   NaN  
549835       NaN    NaN    NaN    NaN   NaN   NaN   NaN  
549836       NaN    NaN    NaN    NaN   NaN   NaN   NaN  
549837       NaN    NaN    NaN    NaN   NaN   NaN   NaN  
549838       NaN    NaN    NaN    NaN   NaN   NaN   NaN  
...          ...    ...    ...    ...   ...   ...   ...  
1518808      NaN    NaN    NaN    NaN   NaN   NaN   NaN  
1518809      NaN    NaN    NaN    NaN   NaN   NaN   NaN  
1518810      NaN    NaN    NaN    NaN   NaN   NaN   NaN  
1528756      NaN    NaN    NaN    NaN   NaN   NaN   NaN  
1528757      NaN    NaN    NaN    NaN   NaN   NaN   NaN  

[123196 rows x 270 columns]

Sanitize coordinates

dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

lon time area lat smp_depth tot_depth sed_type ac228_dl ag110m_dl am241_dl ... sb124 sb125 sr90 th228 th232 th234 tl208 u235 zn65 zr95
549834 9.633333 544838400 2374 54.850000 -1.0 16.0 58 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
549835 9.633333 544838400 2374 54.850000 -1.0 16.0 58 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
549836 9.633333 544838400 2374 54.850000 -1.0 16.0 58 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
549837 9.633333 544838400 2374 54.850000 -1.0 16.0 58 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
549838 9.633333 544838400 2374 54.850000 -1.0 16.0 58 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1518808 29.833333 1128211200 2407 59.983333 -1.0 0.0 2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1518809 29.833333 1128211200 2407 59.983333 -1.0 0.0 2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1518810 29.833333 1128211200 2407 59.983333 -1.0 0.0 2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1528756 29.833333 1128211200 2407 59.983333 -1.0 0.0 2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1528757 29.833333 1128211200 2407 59.983333 -1.0 0.0 2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

123196 rows × 270 columns

Encode to NetCDF

dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[

dfs_tfm = tfm()
['Remap to MARIS radionuclide names.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator.']



 get_attrs (tfm, zotero_key, kw=['oceanography', 'Earth Science > Oceans >
            Ocean Chemistry> Radionuclides', 'Earth Science > Human
            Dimensions > Environmental Impacts > Nuclear Radiation
            Exposure', 'Earth Science > Oceans > Ocean Chemistry > Ocean
            Tracers, Earth Science > Oceans > Marine Sediments', 'Earth
            Science > Oceans > Ocean Chemistry, Earth Science > Oceans >
            Sea Ice > Isotopes', 'Earth Science > Oceans > Water Quality >
            Ocean Contaminants', 'Earth Science > Biological
            Classification > Animals/Vertebrates > Fish', 'Earth Science >
            Biosphere > Ecosystems > Marine Ecosystems', 'Earth Science >
            Biological Classification > Animals/Invertebrates > Mollusks',
            'Earth Science > Biological Classification >
            Animals/Invertebrates > Arthropods > Crustaceans', 'Earth
            Science > Biological Classification > Plants > Macroalgae

Retrieve global attributes from MARIS dump.

Exported source
# Default keywords; `get_attrs` joins them with ', ' to build the 'keywords' attribute.
# NOTE(review): two entries already contain several comma-separated keywords in one
# string; the joined result is unaffected, but `len(kw)` is not the keyword count.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']
Exported source
def get_attrs(tfm, zotero_key, kw=kw):
    "Retrieve global attributes from MARIS dump."
    return GlobAttrsFeeder(tfm.dfs, cbs=[
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)
{'geospatial_lat_min': '30.435833333333335',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.633333333333333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.633333333333333 53.5, 30.435833333333335 53.5, 30.435833333333335 65.75, 9.633333333333333 65.75, 9.633333333333333 53.5))',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-14T00:00:00',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "A.", "lastName": "McGarry"}, {"creatorType": "author", "firstName": "S.", "lastName": "Lyons"}, {"creatorType": "author", "firstName": "C.", "lastName": "McEnri"}, {"creatorType": "author", "firstName": "T.", "lastName": "Ryan"}, {"creatorType": "author", "firstName": "M.", "lastName": "O\'Colmain"}, {"creatorType": "author", "firstName": "J.D.", "lastName": "Cunningham"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes, Earth Science > Oceans > Water Quality > Ocean Contaminants, Earth Science > Biological Classification > Animals/Vertebrates > Fish, Earth Science > Biosphere > Ecosystems > Marine Ecosystems, Earth Science > Biological Classification > Animals/Invertebrates > Mollusks, Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans, Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)',
 'publisher_postprocess_logs': "Remap to MARIS radionuclide names., Renaming variables to MARIS standard names., Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)., Assign Detection Limit name to its id based on MARIS nomenclature., Encode time as `int` representing seconds since xxx, Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator."}



 enums_xtra (tfm, vars)

Retrieve a subset of the lengthy enum as species_t for instance.

Exported source
def enums_xtra(tfm, vars):
    "Retrieve a subset of the lengthy enum as `species_t` for instance."
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    subset = {}
    for name in vars:
        values = tfm.unique(name)
        # Skip variables whose unique values are all falsy
        if not values.any():
            continue
        enum_name = f'{name}_t'
        subset[enum_name] = enums.filter(enum_name, values)
    return subset



 encode (fname_in, fname_out, nc_tpl_path, **kwargs)
Exported source
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    dataloader = DataLoader(fname_in)
    ref_ids = kwargs.get('ref_ids', df.ref_id.unique())
    print('Encoding ...')
    for ref_id in tqdm(ref_ids, leave=False):
        dfs = dataloader(ref_id=ref_id)
        tfm = Transformer(dfs, cbs=[
        encoder = NetCDFEncoder(tfm.dfs, 
                                dest_fname=Path(fname_out) / get_fname(dfs), 
                                global_attrs=get_attrs(tfm, zotero_key=get_zotero_key(dfs), kw=kw),
                                verbose=kwargs.get('verbose', False),
                                enums_xtra=enums_xtra(tfm, vars=['species', 'body_part'])

Single dataset

tfm = Transformer(dfs, cbs=[

dfs_test = tfm()
ref_id = 100
encode(fname_in, dir_dest, nc_tpl_path(), verbose=True, ref_ids=[ref_id])
All datasets

