import tempfile, os
from fastcore.test import test_eq
# Seawater fixture: three samples. AREA and STATION appear only in this group.
df_seawater = pd.DataFrame(
    data={
        'SMP_ID': [0, 1, 2],
        'SMP_ID_PROVIDER': ['1', '2', '3'],
        'LON': [141.0, 142.0, 143.0],
        'LAT': [37.3, 38.3, 39.3],
        'TIME': [1234, 1235, 1236],
        'NUCLIDE': [1, 2, 3],
        'VALUE': [0.1, 1.1, 2.1],
        'AREA': [2374, 2379, 2401],
        'STATION': ['A0', 'A11', 'B234'],
    },
)

# Biota fixture: four samples. SPECIES appears only in this group.
df_biota = pd.DataFrame(
    data={
        'SMP_ID': [0, 1, 2, 3],
        'SMP_ID_PROVIDER': ['ID1', 'ID2', 'ID3', 'ID4'],
        'LON': [141.0, 142.0, 143.0, 144.0],
        'LAT': [37.3, 38.3, 39.3, 40.3],
        'TIME': [1234, 1235, 1236, 1237],
        'NUCLIDE': [1, 2, 3, 3],
        'VALUE': [0.1, 1.1, 2.1, 3.1],
        'SPECIES': [1, 2, 3, 3],
    },
)

# Group name -> DataFrame mapping handed to the encoder.
dfs = {
    'SEAWATER': df_seawater,
    'BIOTA': df_biota,
}

# Global attributes to set on the encoded file.
attrs = {
    'id': '123',
    'title': 'Test title',
    'summary': 'Summary test',
}
dest = tempfile.mktemp(suffix='.nc')
Encoders
The test data (df_seawater and df_biota) simulates what a handler might produce: two DataFrames (SEAWATER and BIOTA) with a handful of rows each. Together they cover the key column types: identifiers, coordinates, timestamps, measurements, and controlled-vocabulary fields like AREA, NUCLIDE, and SPECIES.
The NetCDFEncoder class is the workhorse of this module: it takes a dict of handler-curated DataFrames and writes them out as a single self-contained NetCDF4 file following the MARIS template.
NetCDFEncoder
def NetCDFEncoder(
dfs:Dict, # {NC_GROUPS key → DataFrame}, e.g. {'SEAWATER': df_sw, 'BIOTA': df_bio}
dest_fname:str, # Name of output file to produce
global_attrs:Dict, # NetCDF global attributes (id, title, summary, keywords, ...)
fn_src_fname:Callable=nc_tpl_path, # Callable returning path to the MARIS NetCDF template
verbose:bool=False, # Print currently written NetCDF group and variable names
):
MARIS NetCDF encoder: transforms handler-curated DataFrames into a self-contained NetCDF4 file.
NetCDFEncoder.copy_global_attrs
def copy_global_attrs(
):
Update NetCDF template global attributes as specified by global_attrs argument.
NetCDFEncoder.copy_dims
def copy_dims(
grp_dest, # Destination NetCDF group
):
Copy dimensions from template into a group.
NetCDFEncoder.process_grps
def process_grps(
):
Iterate all groups in dfs and encode each one.
NetCDFEncoder.process_grp
def process_grp(
grp_name:str, # NC_GROUPS key, e.g. `'SEAWATER'`
df:DataFrame, # Measurements for this group
):
Create a destination group, copy dimensions, then create and populate variables from the DataFrame.
NetCDFEncoder.copy_vars
def copy_vars(
grp_name:str, # NC_GROUPS key
df:DataFrame, # Measurements for this group
grp_dest, # Destination NetCDF group
):
Copy variables from template into group, filling from df.
NetCDFEncoder.copy_var
def copy_var(
var_name:str, # NetCDF variable name
var_src, # Source template variable
df:DataFrame, # DataFrame with the data
grp_dest, # Destination NetCDF group
):
Copy a single variable: create, populate, copy attrs.
NetCDFEncoder.var_type
def var_type(
dtype_name:str, # Datatype name from template
var_src, # Source template variable
):
Pick enum type if available, else template datatype.
NetCDFEncoder.create_var
def create_var(
grp_dest, # Destination NetCDF group
var_name:str, # NetCDF variable name
variable_type, # NetCDF type (enum, str, or float)
):
Create a NetCDF variable with zlib compression.
NetCDFEncoder.fill_var
def fill_var(
grp_dest, # Destination NetCDF group
var_name:str, # NetCDF variable name
variable_type, # NetCDF type (enum, str, or float)
df:DataFrame, # DataFrame with the data
):
Populate a NetCDF variable from a DataFrame column.
NetCDFEncoder.fillna_enum
def fillna_enum(
values, # Array of values, possibly with NaN
fill_value:int=-1, # Sentinel for missing enum values
):
Replace NaN in enum-typed columns with a fill value.
NetCDF enum types store values as plain integers and have no native concept of “missing.” So when an enum column contains NaN values, we replace each of them with a sentinel value (-1) before writing to disk.
# Build an encoder over the test DataFrames so `fillna_enum` can be called on it.
# NOTE(review): `tempfile.mktemp` is deprecated in the standard library; it is
# used here only to obtain a file name.
enc = NetCDFEncoder(dfs, dest_fname=tempfile.mktemp(suffix='.nc'), global_attrs=attrs)
# Float array with two missing entries, standing in for an enum-typed column.
vals = np.array([1.0, np.nan, 3.0, np.nan])
# `fill_value` is left at its default of -1, so each NaN should come back as -1.
res = enc.fillna_enum(vals)
test_eq(list(res), [1, -1, 3, -1])
NetCDFEncoder.copy_var_attrs
def copy_var_attrs(
var_name:str, # NetCDF variable name
var_src, # Source template variable
grp_dest, # Destination NetCDF group
):
Copy attributes from template variable to destination.
NetCDFEncoder.all_cols
def all_cols(
):
All unique NC columns present across all groups.
Before we create enum types, we need to know which columns we’re dealing with across all groups. all_cols gathers them for us. It picks up SPECIES from BIOTA only, AREA and STATION from SEAWATER only, and LAT, LON, NUCLIDE, SMP_ID, SMP_ID_PROVIDER, TIME, and VALUE from both groups.
enc = NetCDFEncoder(dfs, dest_fname=tempfile.mktemp(suffix='.nc'), global_attrs=attrs)
test_eq(set(enc.all_cols), {'AREA', 'LAT', 'LON', 'NUCLIDE', 'SMP_ID',
'SMP_ID_PROVIDER', 'SPECIES', 'STATION', 'TIME', 'VALUE'})
NetCDFEncoder.create_enums
def create_enums(
):
Create NetCDF enum types for all columns referenced in the data.
NetCDFEncoder.encode
def encode(
):
Encode MARIS NetCDF based on template and dataframes.
How it works
Let’s run the encoder on test data and inspect each step to see how the pieces fit together.
Global attributes
We inherit the template’s standard attributes, then layer on the ones unique to this dataset, such as the title, summary, and identifier.
Let’s encode our test dataframes and verify the global attributes made it through:
encoder = NetCDFEncoder(dfs, dest_fname=dest, global_attrs=attrs)
encoder.encode()
with Dataset(dest, 'r', format='NETCDF4') as nc:
test_eq(nc.id, '123')
test_eq(nc.title, 'Test title')
test_eq(nc.summary, 'Summary test')
Dimensions
Each group gets an id dimension from the template. It’s an unlimited dimension, so rows can be appended later without restructuring the file. Its length matches the number of rows in the group’s DataFrame.
with Dataset(dest, 'r', format='NETCDF4') as nc:
for grp_name in ('seawater', 'biota'):
grp = nc.groups[grp_name]
test_eq('id' in grp.dimensions, True)
test_eq(grp.dimensions['id'].isunlimited(), True)
test_eq(len(grp.dimensions['id']), len(dfs[grp_name.upper()]))
Groups and variables
Each key in dfs becomes a NetCDF group. Within each group, only the variables matching the DataFrame columns are created. So biota gets a species variable while seawater doesn’t.
with Dataset(dest, 'r', format='NETCDF4') as nc:
test_eq(list(nc.groups.keys()), ['seawater', 'biota'])
sw_vars = list(nc['seawater'].variables.keys())
bio_vars = list(nc['biota'].variables.keys())
test_eq('species' in bio_vars, True)
test_eq('species' in sw_vars, False)
Variable values
Each DataFrame column becomes the corresponding NetCDF variable. The mapping goes through NC_VARS, so VALUE becomes value, LAT becomes lat, and so on.
with Dataset(dest, 'r', format='NETCDF4') as nc:
sw = nc['seawater']
test_eq(list(sw['id'][:]), [0, 1, 2])
test_eq(list(sw['lon'][:]), [141.0, 142.0, 143.0])
test_eq(list(sw['lat'][:]), [37.3, 38.3, 39.3])
test_eq(list(sw['value'][:]), [0.1, 1.1, 2.1])
Enum types
Columns backed by a controlled vocabulary (area, nuclide, species, …) use NetCDF enum types rather than plain integers.
NetCDF enums store values as plain integers on disk (int64 in this file), with a human-readable label mapping attached as metadata. The file stores [1, 2, 3, 3] as int64; tools such as ncdump can display them as species names, while netCDF4-python returns the raw integers and exposes the label mapping through datatype.enum_dict. You can inspect this directly:
with Dataset(dest, 'r', format='NETCDF4') as nc:
species = nc['biota']['species']
print("Enum name: ", species.datatype.name)
print("On-disk type: ", species.dtype)
print("Raw values: ", species[:])
mapping = species.datatype.enum_dict
for k, v in list(mapping.items())[:5]:
print(f" {v} → '{k}'")
Enum name: species_t
On-disk type: int64
Raw values: [1 2 3 3]
0 → 'NOT AVAILABLE'
1 → 'Aristeus antennatus'
2 → 'Apostichopus'
3 → 'Saccharina japonica var religiosa'
4 → 'Siganus fuscescens'
The test confirms that species is a proper NetCDF enum type, coordinates like lon are stored as plain float32, and free-text fields like station use the NetCDF VLType (variable-length string).
# Re-open the encoded file read-only and check how each kind of column is typed.
with Dataset(dest, 'r', format='NETCDF4') as nc:
    biota = nc['biota']
    # Controlled-vocabulary column: its datatype's class name mentions EnumType.
    test_eq('EnumType' in str(biota['species'].datatype.__class__), True)
    # Coordinate column: plain 32-bit float.
    test_eq(biota['lon'].dtype, np.float32)
    sw = nc['seawater']
    # Free-text column: variable-length type.
    test_eq(type(sw['station'].datatype).__name__, 'VLType')