Various utilities

Below we define constants used throughout the package.

Exported source
NA = 'Not available'

Core

Abstracting some common operations.


source

get_unique_across_dfs

 get_unique_across_dfs (dfs:dict, col_name:str='NUCLIDE',
                        as_df:bool=False, include_nchars:bool=False)

Get a list of unique column values across dataframes.

Type Default Details
dfs dict Dictionary of dataframes
col_name str NUCLIDE Column name to extract unique values from
as_df bool False Return a DataFrame of unique values
include_nchars bool False Add a column with the number of characters in the value
Returns list Returns a list of unique column values across dataframes
Exported source
def get_unique_across_dfs(dfs:dict,  # Dictionary of dataframes
                          col_name:str='NUCLIDE', # Column name to extract unique values from
                          as_df:bool=False, # Return a DataFrame of unique values
                          include_nchars:bool=False # Add a column with the number of characters in the value
                          ) -> list: # Returns a list of unique column values across dataframes
    "Get a list of unique column values across dataframes."
    unique_values = list(set().union(*(df[col_name].unique() for df in dfs.values() if col_name in df.columns)))
    if not as_df:
        return unique_values
    else:
        df_uniques = pd.DataFrame(unique_values, columns=['value']).reset_index()
        if include_nchars: df_uniques['n_chars'] = df_uniques['value'].str.len()
        return df_uniques

Example of use:

dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'sediment': pd.DataFrame({'NUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))

What if the column name is missing from one of the dataframes? The function simply skips that dataframe:

dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'sediment': pd.DataFrame({'NONUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))
get_unique_across_dfs(dfs_test, col_name='NUCLIDE', as_df=True, include_nchars=True)
index value n_chars
0 0 cs134 5
1 1 cs137 5
2 2 cs134_137_tot 13

source

Remapper

 Remapper (provider_lut_df:pandas.core.frame.DataFrame,
           maris_lut_fn:callable, maris_col_id:str,
           maris_col_name:str, provider_col_to_match:str,
           provider_col_key, fname_cache)

Remap a data provider lookup table to a MARIS lookup table using fuzzy matching.

Type Details
provider_lut_df DataFrame Data provider lookup table to be remapped
maris_lut_fn callable Function that returns the MARIS lookup table path
maris_col_id str MARIS lookup table column name for the id
maris_col_name str MARIS lookup table column name for the name
provider_col_to_match str Data provider lookup table column name for the name to match
provider_col_key Data provider lookup table column name for the key
fname_cache Cache file name
Exported source
class Remapper():
    "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching."
    def __init__(self,
                 provider_lut_df:pd.DataFrame, # Data provider lookup table to be remapped
                 maris_lut_fn:callable, # Function that returns the MARIS lookup table path
                 maris_col_id:str, # MARIS lookup table column name for the id
                 maris_col_name:str, # MARIS lookup table column name for the name
                 provider_col_to_match:str, # Data provider lookup table column name for the name to match
                 provider_col_key, # Data provider lookup table column name for the key
                 fname_cache  # Cache file name
                 ):
        fc.store_attr()
        self.cache_file = cache_path() / fname_cache
        self.maris_lut = maris_lut_fn()
        self.lut = {}

    def generate_lookup_table(self, 
                              fixes={}, # Lookup table fixes
                              as_df=True, # Whether to return a DataFrame
                              overwrite=True # Whether to overwrite the cache file
                              ):
        "Generate a lookup table from a data provider lookup table to a MARIS lookup table using fuzzy matching."
        self.fixes = fixes
        self.as_df = as_df
        if overwrite or not self.cache_file.exists():
            self._create_lookup_table()
            fc.save_pickle(self.cache_file, self.lut)
        else:
            self.lut = fc.load_pickle(self.cache_file)

        return self._format_output()

    def _create_lookup_table(self):
        df = self.provider_lut_df
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"): 
            self._process_row(row)

    def _process_row(self, row):
        value_to_match = row[self.provider_col_to_match]
        if isinstance(value_to_match, str):  # Only process if value is a string
            # If value is in fixes, use the fixed value
            name_to_match = self.fixes.get(value_to_match, value_to_match)
            
            result = match_maris_lut(self.maris_lut, name_to_match, self.maris_col_id, self.maris_col_name).iloc[0]
            match = Match(result[self.maris_col_id], result[self.maris_col_name], 
                          value_to_match, result['score'])
            self.lut[row[self.provider_col_key]] = match
        else:
            # Handle non-string values (e.g., NaN)
            self.lut[row[self.provider_col_key]] = Match(-1, "Unknown", value_to_match, 0)
            
    def select_match(self, match_score_threshold:int=1):
        "Keep only matches whose score is >= `match_score_threshold` (i.e. imperfect matches needing review)."
        self.lut = {k: v for k, v in self.lut.items() if v.match_score >= match_score_threshold}
        return self._format_output()

    def _format_output(self):
        if not self.as_df: return self.lut
        df_lut = pd.DataFrame.from_dict(self.lut, orient='index', 
                                        columns=['matched_maris_name', 'source_name', 'match_score'])
        df_lut.index.name = 'source_key'
        return df_lut.sort_values(by='match_score', ascending=False)
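
A minimal usage sketch (the provider table, cache file name, and threshold below are made up for illustration; dbo_nuclide.xlsx is the MARIS lookup table used later in this section):

# Made-up provider lookup table with nuclide names to remap
df_provider = pd.DataFrame({'NUCLIDE': ['CS-137', 'CS-134', 'K-40']})

remapper = Remapper(provider_lut_df=df_provider,
                    maris_lut_fn=lambda: '../files/lut/dbo_nuclide.xlsx',
                    maris_col_id='nuclide_id',
                    maris_col_name='nc_name',
                    provider_col_to_match='NUCLIDE',
                    provider_col_key='NUCLIDE',
                    fname_cache='nuclides_demo.pkl')  # written under the package cache directory

lut = remapper.generate_lookup_table(as_df=False)  # dict of provider key -> Match
remapper.select_match(match_score_threshold=1)     # keep only imperfect matches (score >= 1)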

Validation


source

has_valid_varname

 has_valid_varname (var_names:list, cdl_path:str, group=None)

Check that proposed variable names are in MARIS CDL

Type Default Details
var_names list variable names
cdl_path str Path to MARIS CDL file (point of truth)
group NoneType None Check if the variable names are contained in the group
Exported source
def has_valid_varname(
    var_names:list, # variable names
    cdl_path:str, # Path to MARIS CDL file (point of truth)
    group = None, # Check if the variable names are contained in the group
):
    "Check that proposed variable names are in MARIS CDL"
    has_valid = True
    with Dataset(cdl_path) as nc:
        cdl_vars={}
        all_vars=[]
        # get variable names in CDL 
        for grp in nc.groups.values():
            # Create a list of variables for each group
            grp_vars = list(grp.variables.keys())
            cdl_vars[grp.name] = grp_vars
            all_vars.extend(grp_vars)
        
    if group is not None:
        allowed_vars = cdl_vars[group]
    else:
        # Get unique variable names across all groups
        allowed_vars = list(set(all_vars))
        
    for name in var_names:
        if name not in allowed_vars:
            has_valid = False
            if group is not None:
                print(f'"{name}" variable name not found in group "{group}" of MARIS CDL')
            else:
                print(f'"{name}" variable name not found in MARIS CDL')
    return has_valid
VARNAMES = ['lat', 'lon']
test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), True)
VARNAMES = ['ba140_invalid', 'ba140_dl']
test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), False)
"ba140_invalid" variable name not found in MARIS CDL

Geoprocessing


source

get_bbox

 get_bbox (df, coord_cols=('lon', 'lat'))

Get the bounding box of a DataFrame.

Exported source
def get_bbox(df,
             coord_cols=('lon', 'lat')
            ):
    "Get the bounding box of a DataFrame."
    x, y = coord_cols        
    arr = [(row[x], row[y]) for _, row in df.iterrows()]
    return MultiPoint(arr).envelope
df = pd.DataFrame({'lon': np.linspace(-10, 5, 20), 'lat':  np.linspace(40, 50, 20)})
bbox = get_bbox(df);
# To get `lon_min`, `lat_min`, `lon_max`, `lat_max`
bbox.bounds
(-10.0, 40.0, 5.0, 50.0)
# And its Well-Known Text representation
bbox.wkt
'POLYGON ((-10 40, 5 40, 5 50, -10 50, -10 40))'
# With a single unique (lon, lat) pair, the envelope degenerates to a point
df = pd.DataFrame({'lon': [0, 0], 'lat':  [1, 1]})
bbox = get_bbox(df);
bbox.bounds
(0.0, 1.0, 0.0, 1.0)

source

ddmm_to_dd

 ddmm_to_dd (ddmmmm:float)

Convert coordinates from degrees-minutes decimal format (DDMM.MM) to decimal degrees.

Type Details
ddmmmm float Coordinates in degrees/minutes decimal format
Returns float Coordinates in degrees decimal format
Exported source
def ddmm_to_dd(
    ddmmmm:float # Coordinates in degrees/minutes decimal format
    ) -> float: # Coordinates in degrees decimal format
    "Convert degrees/minutes decimal to degrees decimal."
    mins, degs = modf(ddmmmm)
    mins = mins * 100
    return round(int(degs) + (mins / 60), 6)
fc.test_close(ddmm_to_dd(45.34), 45.566667)
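
To make the convention explicit: 45.34 encodes 45° 34.00', i.e. 45 + 34/60 ≈ 45.566667. A couple more checks (values chosen for illustration):

fc.test_close(ddmm_to_dd(12.30), 12.5)  # 12° 30.00' -> 12.5
fc.test_close(ddmm_to_dd(0.45), 0.75)   # 0° 45.00'  -> 0.75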

Downloaders


source

download_file

 download_file (owner, repo, src_dir, dest_dir, fname)

Download a single file from a GitHub repository via the raw contents URL.
Exported source
def download_files_in_folder(owner:str, 
                             repo:str, 
                             src_dir:str, 
                             dest_dir:str
                             ):
    "Make a GET request to the GitHub API to get the contents of the folder."
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    response = requests.get(url)

    if response.status_code == 200:
        contents = response.json()

        # Iterate over the files and download them
        for item in contents:
            if item["type"] == "file":
                fname = item["name"]
                download_file(owner, repo, src_dir, dest_dir, fname)
    else:
        print(f"Error: {response.status_code}")

def download_file(owner, repo, src_dir, dest_dir, fname):
    "Download a single file from a GitHub repository via the raw contents URL."
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{src_dir}/{fname}"
    response = requests.get(url)

    if response.status_code == 200:
        # Save the file locally
        with open(Path(dest_dir) / fname, "wb") as file:
            file.write(response.content)
        print(f"{fname} downloaded successfully.")
    else:
        print(f"Error: {response.status_code}")

source

download_files_in_folder

 download_files_in_folder (owner:str, repo:str, src_dir:str, dest_dir:str)

Download all files in a GitHub repository folder via the GitHub contents API.
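
A hypothetical usage sketch (owner, repo, and directory names are placeholders; dest_dir must already exist since files are written into it directly):

# Placeholders only: point these at a real repository folder.
download_files_in_folder(owner='my-org', repo='my-repo',
                         src_dir='files/lut', dest_dir='./lut')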

WoRMS

The World Register of Marine Species (WoRMS) is an authoritative classification and catalogue of marine names. It provides a REST API (among others) that lets you “fuzzy” match any species name you might encounter in marine data sources against their database. There are several types of matches as described here.


source

match_worms

 match_worms (name:str)

Lookup name in WoRMS (fuzzy match).

Type Details
name str Name of species to look up in WoRMS
Exported source
def match_worms(
    name:str # Name of species to look up in WoRMS
    ):
    "Lookup `name` in WoRMS (fuzzy match)."
    url = 'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames'
    params = {
        'scientificnames[]': [name],
        'marine_only': 'true'
    }
    headers = {
        'accept': 'application/json'
    }
    
    response = requests.get(url, params=params, headers=headers)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        return -1

For instance:

match_worms('Aristeus antennatus')
[[{'AphiaID': 107083,
   'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083',
   'scientificname': 'Aristeus antennatus',
   'authority': '(Risso, 1816)',
   'status': 'accepted',
   'unacceptreason': None,
   'taxonRankID': 220,
   'rank': 'Species',
   'valid_AphiaID': 107083,
   'valid_name': 'Aristeus antennatus',
   'valid_authority': '(Risso, 1816)',
   'parentNameUsageID': 106807,
   'kingdom': 'Animalia',
   'phylum': 'Arthropoda',
   'class': 'Malacostraca',
   'order': 'Decapoda',
   'family': 'Aristeidae',
   'genus': 'Aristeus',
   'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',
   'lsid': 'urn:lsid:marinespecies.org:taxname:107083',
   'isMarine': 1,
   'isBrackish': 0,
   'isFreshwater': 0,
   'isTerrestrial': 0,
   'isExtinct': 0,
   'match_type': 'exact',
   'modified': '2022-08-24T09:48:14.813Z'}]]

Fuzzy matching for MARIS Lookup Tables

We use the fuzzy matching distance metrics provided by the jellyfish library (https://jamesturk.github.io/jellyfish).


source

Match

 Match (matched_id:int, matched_maris_name:str, source_name:str,
        match_score:int)

Match between a data provider name and a MARIS lookup table.

Exported source
@dataclass
class Match:
    "Match between a data provider name and a MARIS lookup table."
    matched_id: int
    matched_maris_name: str
    source_name: str
    match_score: int
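
For instance, matching the provider name "CS-137" to the MARIS name "cs137" (see the nuclide example below) gives a Levenshtein distance of 1, the hyphen being the only edit:

Match(matched_id=33, matched_maris_name='cs137', source_name='CS-137', match_score=1)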

source

match_maris_lut

 match_maris_lut (lut_path:str, data_provider_name:str, maris_id:str,
                  maris_name:str, dist_fn:Callable=levenshtein_distance,
                  nresults:int=10)

Fuzzy matching data provider and MARIS lookup tables (e.g. biota species, sediments, …).

Type Default Details
lut_path str Path to authoritative MARIS look-up table
data_provider_name str Name of data provider nomenclature item to look up
maris_id str Id of MARIS lookup table nomenclature item to match
maris_name str Name of MARIS lookup table nomenclature item to match
dist_fn Callable levenshtein_distance Distance function
nresults int 10 Maximum number of results to return
Returns DataFrame
Exported source
def match_maris_lut(
    lut_path: str, # Path to authoritative MARIS look-up table
    data_provider_name: str, # Name of data provider nomenclature item to look up 
    maris_id: str, # Id of MARIS lookup table nomenclature item to match
    maris_name: str, # Name of MARIS lookup table nomenclature item to match
    dist_fn: Callable = jf.levenshtein_distance, # Distance function
    nresults: int = 10 # Maximum number of results to return
) -> pd.DataFrame:
    "Fuzzy matching data provider and MARIS lookup tables (e.g biota species, sediments, ...)."
    df = pd.read_excel(lut_path)
    df = df.dropna(subset=[maris_name])
    df = df.astype({maris_id: 'int'})
    df['score'] = df[maris_name].str.lower().apply(lambda x: dist_fn(data_provider_name.lower(), x))
    df = df.sort_values(by='score', ascending=True)[:nresults]
    return df[[maris_id, maris_name, 'score']]

Below is an example matching the name “PLANKTON” against the dbo_species_cleaned.xlsx MARIS biota species lookup table:

lut_fname = '../files/lut/dbo_species_cleaned.xlsx'
match_maris_lut(lut_fname, data_provider_name='PLANKTON', 
                maris_id='species_id', maris_name='species')
species_id species score
281 280 Plankton 0
696 695 Zooplankton 3
633 632 Palaemon 4
697 696 Phytoplankton 5
812 811 Chanos 5
160 159 Neuston 5
234 233 Penaeus 6
1458 1457 Lamnidae 6
1438 1437 Labrus 6
1527 1526 Favites 6

Below is an example matching the name “GLACIAL” against the dbo_sedtype.xlsx MARIS sediment lookup table:

lut_fname = '../files/lut/dbo_sedtype.xlsx'
match_maris_lut(lut_fname, data_provider_name='GLACIAL', 
                maris_id='sedtype_id', maris_name='sedtype')
sedtype_id sedtype score
26 25 Glacial 0
3 2 Gravel 4
2 1 Clay 5
51 50 Glacial clay 5
4 3 Marsh 6
7 6 Sand 6
13 12 Silt 6
15 14 Sludge 6
27 26 Soft 7
52 51 Soft clay 7

And matching the name “CS-137” against the dbo_nuclide.xlsx MARIS nuclide lookup table:

lut_fname = '../files/lut/dbo_nuclide.xlsx'
match_maris_lut(lut_fname, data_provider_name='CS-137', 
                maris_id='nuclide_id', maris_name='nc_name')
nuclide_id nc_name score
31 33 cs137 1
30 31 cs134 2
99 102 cs136 2
29 30 cs127 2
111 114 ce139 3
109 112 sb127 3
8 7 co57 4
28 29 i131 4
71 74 cm243 4
90 93 sn117m 4
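
Any distance function with the signature (str, str) -> number can be plugged in via dist_fn; for instance, a sketch using another jellyfish metric:

match_maris_lut(lut_fname, data_provider_name='CS-137',
                maris_id='nuclide_id', maris_name='nc_name',
                dist_fn=jf.damerau_levenshtein_distance)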

Test


source

test_dfs

 test_dfs (dfs1:dict, dfs2:dict)

Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order).

Type Details
dfs1 dict First dictionary of DataFrames to compare
dfs2 dict Second dictionary of DataFrames to compare
Returns None It raises an AssertionError if the DataFrames are not equal
Exported source
def test_dfs(
    dfs1:dict, # First dictionary of DataFrames to compare 
    dfs2:dict # Second dictionary of DataFrames to compare
    ) -> None: # It raises an `AssertionError` if the DataFrames are not equal
    "Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order)."
    for grp in dfs1.keys():
        df1, df2 = (df.sort_index() for df in (dfs1[grp], dfs2[grp]))
        fc.test_eq(df1, df2.reindex(columns=df1.columns))
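
For example, two dictionaries holding the same DataFrames with columns in a different order compare equal (data invented for illustration):

dfs_a = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137'], 'VALUE': [1.0]})}
dfs_b = {'seawater': pd.DataFrame({'VALUE': [1.0], 'NUCLIDE': ['cs137']})}
test_dfs(dfs_a, dfs_b)  # passes: columns are reordered before comparison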