Encoders

Handler-curated DataFrames → MARIS NetCDF

The test data below simulates what a handler might produce: two DataFrames (SEAWATER and BIOTA) with a handful of rows each, covering the key column types: identifiers, coordinates, timestamps, measurements, and controlled-vocabulary fields like AREA, NUCLIDE, and SPECIES.

import tempfile, os
from fastcore.test import test_eq

# SEAWATER fixture: three rows. AREA and STATION appear only in this group.
df_seawater = pd.DataFrame({
    'SMP_ID': [0, 1, 2],
    'SMP_ID_PROVIDER': ['1', '2', '3'],
    'LON': [141.0, 142.0, 143.0],
    'LAT': [37.3, 38.3, 39.3],
    'TIME': [1234, 1235, 1236],
    'NUCLIDE': [1, 2, 3],
    'VALUE': [0.1, 1.1, 2.1],
    'AREA': [2374, 2379, 2401],
    'STATION': ['A0', 'A11', 'B234'],
})

# BIOTA fixture: four rows. SPECIES appears only in this group.
df_biota = pd.DataFrame({
    'SMP_ID': [0, 1, 2, 3],
    'SMP_ID_PROVIDER': ['ID1', 'ID2', 'ID3', 'ID4'],
    'LON': [141.0, 142.0, 143.0, 144.0],
    'LAT': [37.3, 38.3, 39.3, 40.3],
    'TIME': [1234, 1235, 1236, 1237],
    'NUCLIDE': [1, 2, 3, 3],
    'VALUE': [0.1, 1.1, 2.1, 3.1],
    'SPECIES': [1, 2, 3, 3],
})

# Encoder inputs: one DataFrame per group key, plus the global attributes.
dfs = {'SEAWATER': df_seawater, 'BIOTA': df_biota}
attrs = {'id': '123', 'title': 'Test title', 'summary': 'Summary test'}

# Build the output path inside a freshly created private directory rather than
# with `tempfile.mktemp`, which is deprecated because another process can claim
# the returned name before it is used. The file itself is not created here.
dest = os.path.join(tempfile.mkdtemp(), 'maris.nc')

The NetCDFEncoder class is the workhorse of this module: it takes a dict of handler-curated DataFrames and writes them out as a single self-contained NetCDF4 file following the MARIS template.


source

NetCDFEncoder


def NetCDFEncoder(
    dfs:Dict, # {NC_GROUPS key → DataFrame}, e.g. {'SEAWATER': df_sw, 'BIOTA': df_bio}
    dest_fname:str, # Name of output file to produce
    global_attrs:Dict, # NetCDF global attributes (id, title, summary, keywords, ...)
    fn_src_fname:Callable=nc_tpl_path, # Callable returning path to the MARIS NetCDF template
    verbose:bool=False, # Print currently written NetCDF group and variable names
):

MARIS NetCDF encoder: transforms handler-curated DataFrames into a self-contained NetCDF4 file.


source

NetCDFEncoder.copy_global_attrs


def copy_global_attrs(
    
):

Update NetCDF template global attributes as specified by global_attrs argument.


source

NetCDFEncoder.copy_dims


def copy_dims(
    grp_dest, # Destination NetCDF group
):

Copy dimensions from template into a group.


source

NetCDFEncoder.process_grps


def process_grps(
    
):

Iterate all groups in dfs and encode each one.


source

NetCDFEncoder.process_grp


def process_grp(
    grp_name:str, # NC_GROUPS key, e.g. `'SEAWATER'`
    df:DataFrame, # Measurements for this group
):

Create a destination group, copy dimensions, then create and populate variables from the DataFrame.


source

NetCDFEncoder.copy_vars


def copy_vars(
    grp_name:str, # NC_GROUPS key
    df:DataFrame, # Measurements for this group
    grp_dest, # Destination NetCDF group
):

Copy variables from template into group, filling from df.


source

NetCDFEncoder.copy_var


def copy_var(
    var_name:str, # NetCDF variable name
    var_src, # Source template variable
    df:DataFrame, # DataFrame with the data
    grp_dest, # Destination NetCDF group
):

Copy a single variable: create, populate, copy attrs.


source

NetCDFEncoder.var_type


def var_type(
    dtype_name:str, # Datatype name from template
    var_src, # Source template variable
):

Pick enum type if available, else template datatype.


source

NetCDFEncoder.create_var


def create_var(
    grp_dest, # Destination NetCDF group
    var_name:str, # NetCDF variable name
    variable_type, # NetCDF type (enum, str, or float)
):

Create a NetCDF variable with zlib compression.


source

NetCDFEncoder.fill_var


def fill_var(
    grp_dest, # Destination NetCDF group
    var_name:str, # NetCDF variable name
    variable_type, # NetCDF type (enum, str, or float)
    df:DataFrame, # DataFrame with the data
):

Populate a NetCDF variable from a DataFrame column.


source

NetCDFEncoder.fillna_enum


def fillna_enum(
    values, # Array of values, possibly with NaN
    fill_value:int=-1, # Sentinel for missing enum values
):

Replace NaN in enum-typed columns with a fill value.

NetCDF enum types store values as plain integers and have no native concept of “missing”. So when an enum column contains NaN values, we replace them with a sentinel value (-1) before writing to disk.

# Use a path inside a private temporary directory; `tempfile.mktemp` is
# deprecated because the name it returns can be claimed by another process.
enc = NetCDFEncoder(dfs, dest_fname=os.path.join(tempfile.mkdtemp(), 'maris.nc'), global_attrs=attrs)

# NaN entries are expected to come back as the default sentinel (-1).
vals = np.array([1.0, np.nan, 3.0, np.nan])
res = enc.fillna_enum(vals)
test_eq(list(res), [1, -1, 3, -1])

source

NetCDFEncoder.copy_var_attrs


def copy_var_attrs(
    var_name:str, # NetCDF variable name
    var_src, # Source template variable
    grp_dest, # Destination NetCDF group
):

Copy attributes from template variable to destination.


source

NetCDFEncoder.all_cols


def all_cols(
    
):

All unique NC columns present across all groups.

Before we create enum types, we need to know which columns we’re dealing with across all groups. all_cols gathers them for us. It picks up SPECIES from BIOTA, AREA and STATION from SEAWATER, and LAT, LON, NUCLIDE, SMP_ID, SMP_ID_PROVIDER, TIME, and VALUE from both groups.

# Use a path inside a private temporary directory; `tempfile.mktemp` is
# deprecated because the name it returns can be claimed by another process.
enc = NetCDFEncoder(dfs, dest_fname=os.path.join(tempfile.mkdtemp(), 'maris.nc'), global_attrs=attrs)

# Union of the column names of both test DataFrames.
test_eq(set(enc.all_cols), {'AREA', 'LAT', 'LON', 'NUCLIDE', 'SMP_ID',
                            'SMP_ID_PROVIDER', 'SPECIES', 'STATION', 'TIME', 'VALUE'})

source

NetCDFEncoder.create_enums


def create_enums(
    
):

Create NetCDF enum types for all columns referenced in the data.


source

NetCDFEncoder.encode


def encode(
    
):

Encode MARIS NetCDF based on template and dataframes.

How it works

Let’s run the encoder on test data and inspect each step to see how the pieces fit together.

Global attributes

We inherit the template’s standard attributes, then layer on the ones unique to this dataset, such as the title, summary, and identifier.

Let’s encode our test dataframes and verify the global attributes made it through:

# Encode the test DataFrames into the NetCDF file at `dest`.
encoder = NetCDFEncoder(dfs=dfs, dest_fname=dest, global_attrs=attrs)
encoder.encode()

# Each global attribute passed to the encoder should be readable from the file.
expected_attrs = {'id': '123', 'title': 'Test title', 'summary': 'Summary test'}
with Dataset(dest, 'r', format='NETCDF4') as nc:
    for attr_name, expected in expected_attrs.items():
        test_eq(getattr(nc, attr_name), expected)

Dimensions

Each group gets an id dimension from the template. It’s an unlimited dimension, so rows can be appended later without restructuring the file. Its length matches the number of rows in the group’s DataFrame.

# For every encoded group, check that an unlimited `id` dimension exists and
# that its length equals the row count of the source DataFrame.
with Dataset(dest, 'r', format='NETCDF4') as nc:
    for key, df in dfs.items():
        dims = nc.groups[key.lower()].dimensions
        test_eq('id' in dims, True)
        id_dim = dims['id']
        test_eq(id_dim.isunlimited(), True)
        test_eq(len(id_dim), len(df))

Groups and variables

Each key in dfs becomes a NetCDF group. Within each group, only the variables matching the DataFrame columns are created. So biota gets a species variable while seawater doesn’t.

with Dataset(dest, 'r', format='NETCDF4') as nc:
    # One group per key in `dfs`, in insertion order.
    test_eq(list(nc.groups), ['seawater', 'biota'])

    # `species` is only a column of the BIOTA DataFrame, so only the biota
    # group should carry the variable.
    bio_vars = list(nc['biota'].variables)
    sw_vars = list(nc['seawater'].variables)
    test_eq('species' in bio_vars, True)
    test_eq('species' in sw_vars, False)

Variable values

Each DataFrame column becomes the corresponding NetCDF variable. The mapping goes through NC_VARS, so VALUE becomes value, LAT becomes lat, and so on.

with Dataset(dest, 'r', format='NETCDF4') as nc:
    sw = nc['seawater']
    # Integer identifiers are compared exactly.
    test_eq(list(sw['id'][:]), [0, 1, 2])

    # Floating-point variables may be stored at lower precision than the
    # float64 literals below (e.g. `lon` is float32 in this file), and exact
    # equality between the two depends on NumPy's scalar promotion rules.
    # Compare with a tolerance instead.
    expected = {
        'lon': [141.0, 142.0, 143.0],
        'lat': [37.3, 38.3, 39.3],
        'value': [0.1, 1.1, 2.1],
    }
    for var_name, want in expected.items():
        got = sw[var_name][:]
        test_eq(len(got), len(want))
        test_eq(np.allclose(got, want), True)

Enum types

Columns backed by a controlled vocabulary (area, nuclide, species, …) use NetCDF enum types rather than plain integers.

NetCDF enums store values as plain integers (here, int64) on disk, with a human-readable label mapping attached as metadata. The file stores [1, 2, 3, 3] as int64; tools such as ncdump display them as species names, while netCDF4-python returns the raw integers and exposes the label mapping through the variable’s datatype.enum_dict. You can inspect this directly:

# Show the enum type name, the stored dtype, the raw values, and the first
# five label/code pairs of the enum attached to the `species` variable.
with Dataset(dest, 'r', format='NETCDF4') as nc:
    species = nc['biota']['species']
    enum_type = species.datatype
    print("Enum name:    ", enum_type.name)
    print("On-disk type: ", species.dtype)
    print("Raw values:   ", species[:])
    mapping = enum_type.enum_dict
    first_five = list(mapping.items())[:5]
    for label, code in first_five:
        print(f"  {code} → '{label}'")
Enum name:     species_t
On-disk type:  int64
Raw values:    [1 2 3 3]
  0 → 'NOT AVAILABLE'
  1 → 'Aristeus antennatus'
  2 → 'Apostichopus'
  3 → 'Saccharina japonica var religiosa'
  4 → 'Siganus fuscescens'

The test confirms that species is a proper NetCDF enum type, coordinates like lon are stored as plain float32, and free-text fields like station use the NetCDF VLType (variable-length string).

with Dataset(dest, 'r', format='NETCDF4') as nc:
    biota, sw = nc['biota'], nc['seawater']

    # Controlled-vocabulary column: datatype is an enum type.
    species_type = type(biota['species'].datatype)
    test_eq('EnumType' in str(species_type), True)

    # Coordinate column: plain float32.
    test_eq(biota['lon'].dtype, np.float32)

    # Free-text column: variable-length type.
    station_type = sw['station'].datatype
    test_eq(station_type.__class__.__name__, 'VLType')