Various utilities

Below we define constants used throughout the package.

Exported source
NA = 'Not available'

Core

Abstracting some common operations.


source

get_unique_across_dfs

 get_unique_across_dfs (dfs:dict, col_name:str='NUCLIDE',
                        as_df:bool=False, include_nchars:bool=False)

Get a list of unique column values across dataframes.

Type Default Details
dfs dict Dictionary of dataframes
col_name str NUCLIDE Column name to extract unique values from
as_df bool False Return a DataFrame of unique values
include_nchars bool False Add a column with the number of characters in the value
Returns list Returns a list of unique column values across dataframes
Exported source
def get_unique_across_dfs(dfs:dict,  # Dictionary of dataframes
                          col_name:str='NUCLIDE', # Column name to extract unique values from
                          as_df:bool=False, # Return a DataFrame of unique values
                          include_nchars:bool=False # Add a column with the number of characters in the value
                          ) -> list: # Returns a list of unique column values across dataframes
    "Get a list of unique column values across dataframes."
    unique_values = list(set().union(*(df[col_name].unique() for df in dfs.values() if col_name in df.columns)))
    if not as_df:
        return unique_values
    else:
        df_uniques = pd.DataFrame(unique_values, columns=['value']).reset_index()
        if include_nchars: df_uniques['n_chars'] = df_uniques['value'].str.len()
        return df_uniques

Example of use:

dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'sediment': pd.DataFrame({'NUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))

What if the column name is missing from one of the dataframes? The function simply skips that dataframe:

dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'sediment': pd.DataFrame({'NONUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))
get_unique_across_dfs(dfs_test, col_name='NUCLIDE', as_df=True, include_nchars=True)
index value n_chars
0 0 cs134 5
1 1 cs137 5
2 2 cs134_137_tot 13

source

Remapper

 Remapper (provider_lut_df:pandas.core.frame.DataFrame,
           maris_lut_fn:callable, maris_col_id:str,
           maris_col_name:str, provider_col_to_match:str,
           provider_col_key, fname_cache)

Remap a data provider lookup table to a MARIS lookup table using fuzzy matching.

Type Details
provider_lut_df DataFrame Data provider lookup table to be remapped
maris_lut_fn callable Function that returns the MARIS lookup table path
maris_col_id str MARIS lookup table column name for the id
maris_col_name str MARIS lookup table column name for the name
provider_col_to_match str Data provider lookup table column name for the name to match
provider_col_key Data provider lookup table column name for the key
fname_cache Cache file name
Exported source
class Remapper():
    "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching."
    def __init__(self,
                 provider_lut_df:pd.DataFrame, # Data provider lookup table to be remapped
                 maris_lut_fn:callable, # Function that returns the MARIS lookup table path
                 maris_col_id:str, # MARIS lookup table column name for the id
                 maris_col_name:str, # MARIS lookup table column name for the name
                 provider_col_to_match:str, # Data provider lookup table column name for the name to match
                 provider_col_key, # Data provider lookup table column name for the key
                 fname_cache  # Cache file name
                 ):
        fc.store_attr()
        self.cache_file = cache_path() / fname_cache
        self.maris_lut = maris_lut_fn()
        self.lut = {}

    def generate_lookup_table(self, 
                              fixes={}, # Lookup table fixes
                              as_df=True, # Whether to return a DataFrame
                              overwrite=True # Whether to overwrite the cache file
                              ):
        "Generate a lookup table from a data provider lookup table to a MARIS lookup table using fuzzy matching."
        self.fixes = fixes
        self.as_df = as_df
        if overwrite or not self.cache_file.exists():
            self._create_lookup_table()
            fc.save_pickle(self.cache_file, self.lut)
        else:
            self.lut = fc.load_pickle(self.cache_file)

        return self._format_output()

    def _create_lookup_table(self):
        df = self.provider_lut_df
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"): 
            self._process_row(row)

    def _process_row(self, row):
        value_to_match = row[self.provider_col_to_match]
        if isinstance(value_to_match, str):  # Only process if value is a string
            # If value is in fixes, use the fixed value
            name_to_match = self.fixes.get(value_to_match, value_to_match)
            
            result = match_maris_lut(self.maris_lut, name_to_match, self.maris_col_id, self.maris_col_name).iloc[0]
            match = Match(result[self.maris_col_id], result[self.maris_col_name], 
                          value_to_match, result['score'])
            self.lut[row[self.provider_col_key]] = match
        else:
            # Handle non-string values (e.g., NaN)
            self.lut[row[self.provider_col_key]] = Match(-1, "Unknown", value_to_match, 0)
            
    def select_match(self, match_score_threshold:int=1):
        "Keep only matches whose score is >= `match_score_threshold` (i.e. imperfect matches needing review)."
        self.lut = {k: v for k, v in self.lut.items() if v.match_score >= match_score_threshold}
        return self._format_output()

    def _format_output(self):
        if not self.as_df: return self.lut
        df_lut = pd.DataFrame.from_dict(self.lut, orient='index', 
                                        columns=['matched_maris_name', 'source_name', 'match_score'])
        df_lut.index.name = 'source_key'
        return df_lut.sort_values(by='match_score', ascending=False)
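
A minimal usage sketch (the provider table, cache file name, and threshold below are made up for illustration; dbo_nuclide.xlsx is the MARIS lookup table used later in this section):

# Made-up provider lookup table with nuclide names to remap
df_provider = pd.DataFrame({'NUCLIDE': ['CS-137', 'CS-134', 'K-40']})

remapper = Remapper(provider_lut_df=df_provider,
                    maris_lut_fn=lambda: '../files/lut/dbo_nuclide.xlsx',
                    maris_col_id='nuclide_id',
                    maris_col_name='nc_name',
                    provider_col_to_match='NUCLIDE',
                    provider_col_key='NUCLIDE',
                    fname_cache='nuclides_demo.pkl')  # written under the package cache directory

lut = remapper.generate_lookup_table(as_df=False)  # dict of provider key -> Match
remapper.select_match(match_score_threshold=1)     # keep only imperfect matches (score >= 1)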

Validation


source

has_valid_varname

 has_valid_varname (var_names:list, cdl_path:str, group=None)

Check that proposed variable names are in MARIS CDL

Type Default Details
var_names list variable names
cdl_path str Path to MARIS CDL file (point of truth)
group NoneType None Check if the variable names are contained in the group
Exported source
def has_valid_varname(
    var_names:list, # variable names
    cdl_path:str, # Path to MARIS CDL file (point of truth)
    group = None, # Check if the variable names are contained in the group
):
    "Check that proposed variable names are in MARIS CDL"
    has_valid = True
    with Dataset(cdl_path) as nc:
        cdl_vars={}
        all_vars=[]
        # get variable names in CDL 
        for grp in nc.groups.values():
            # Create a list of variables for each group
            grp_vars = list(grp.variables.keys())
            cdl_vars[grp.name] = grp_vars
            all_vars.extend(grp_vars)
        
    if group is not None:
        allowed_vars = cdl_vars[group]
    else:
        # Get unique variable names across all groups
        allowed_vars = list(set(all_vars))
        
    for name in var_names:
        if name not in allowed_vars:
            has_valid = False
            if group is not None:
                print(f'"{name}" variable name not found in group "{group}" of MARIS CDL')
            else:
                print(f'"{name}" variable name not found in MARIS CDL')
    return has_valid
VARNAMES = ['lat', 'lon']
test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), True)
VARNAMES = ['ba140_invalid', 'ba140_dl']
test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), False)
"ba140_invalid" variable name not found in MARIS CDL

Geoprocessing


source

get_bbox

 get_bbox (df, coord_cols=('lon', 'lat'))

Get the bounding box of a DataFrame.

Exported source
def get_bbox(df,
             coord_cols=('lon', 'lat')
            ):
    "Get the bounding box of a DataFrame."
    x, y = coord_cols        
    arr = [(row[x], row[y]) for _, row in df.iterrows()]
    return MultiPoint(arr).envelope
df = pd.DataFrame({'lon': np.linspace(-10, 5, 20), 'lat':  np.linspace(40, 50, 20)})
bbox = get_bbox(df);
# To get `lon_min`, `lat_min`, `lon_max`, `lat_max`
bbox.bounds
(-10.0, 40.0, 5.0, 50.0)
# And its Well-Known Text representation
bbox.wkt
'POLYGON ((-10 40, 5 40, 5 50, -10 50, -10 40))'
# With a single unique (lon, lat) pair, the envelope degenerates to a point
df = pd.DataFrame({'lon': [0, 0], 'lat':  [1, 1]})
bbox = get_bbox(df);
bbox.bounds
(0.0, 1.0, 0.0, 1.0)

source

ddmm_to_dd

 ddmm_to_dd (ddmmmm:float)

Convert coordinates from degrees-minutes decimal format (DDMM.MM) to decimal degrees.

Type Details
ddmmmm float Coordinates in degrees/minutes decimal format
Returns float Coordinates in degrees decimal format
Exported source
def ddmm_to_dd(
    ddmmmm:float # Coordinates in degrees/minutes decimal format
    ) -> float: # Coordinates in degrees decimal format
    "Convert degrees/minutes decimal to degrees decimal."
    mins, degs = modf(ddmmmm)
    mins = mins * 100
    return round(int(degs) + (mins / 60), 6)
fc.test_close(ddmm_to_dd(45.34), 45.566667)
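
To make the convention explicit: 45.34 encodes 45° 34.00', i.e. 45 + 34/60 ≈ 45.566667. A couple more checks (values chosen for illustration):

fc.test_close(ddmm_to_dd(12.30), 12.5)  # 12° 30.00' -> 12.5
fc.test_close(ddmm_to_dd(0.45), 0.75)   # 0° 45.00'  -> 0.75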

Downloaders


source

download_file

 download_file (owner, repo, src_dir, dest_dir, fname)

Download a single file from a GitHub repository via the raw contents URL.
Exported source
def download_files_in_folder(owner:str, 
                             repo:str, 
                             src_dir:str, 
                             dest_dir:str
                             ):
    "Make a GET request to the GitHub API to get the contents of the folder."
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    response = requests.get(url)

    if response.status_code == 200:
        contents = response.json()

        # Iterate over the files and download them
        for item in contents:
            if item["type"] == "file":
                fname = item["name"]
                download_file(owner, repo, src_dir, dest_dir, fname)
    else:
        print(f"Error: {response.status_code}")

def download_file(owner, repo, src_dir, dest_dir, fname):
    "Download a single file from a GitHub repository via the raw contents URL."
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{src_dir}/{fname}"
    response = requests.get(url)

    if response.status_code == 200:
        # Save the file locally
        with open(Path(dest_dir) / fname, "wb") as file:
            file.write(response.content)
        print(f"{fname} downloaded successfully.")
    else:
        print(f"Error: {response.status_code}")

source

download_files_in_folder

 download_files_in_folder (owner:str, repo:str, src_dir:str, dest_dir:str)

Download all files in a GitHub repository folder via the GitHub contents API.
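
A hypothetical usage sketch (owner, repo, and directory names are placeholders; dest_dir must already exist since files are written into it directly):

# Placeholders only: point these at a real repository folder.
download_files_in_folder(owner='my-org', repo='my-repo',
                         src_dir='files/lut', dest_dir='./lut')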

WoRMS

The World Register of Marine Species (WoRMS) is an authoritative classification and catalogue of marine names. It provides a REST API (among others) that lets you “fuzzy” match any species name you might encounter in marine data sources against their database. There are several types of matches as described here.


source

match_worms

 match_worms (name:str)

Lookup name in WoRMS (fuzzy match).

Type Details
name str Name of species to look up in WoRMS
Exported source
def match_worms(
    name:str # Name of species to look up in WoRMS
    ):
    "Lookup `name` in WoRMS (fuzzy match)."
    url = 'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames'
    params = {
        'scientificnames[]': [name],
        'marine_only': 'true'
    }
    headers = {
        'accept': 'application/json'
    }
    
    response = requests.get(url, params=params, headers=headers)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        return -1

For instance:

match_worms('Aristeus antennatus')
[[{'AphiaID': 107083,
   'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083',
   'scientificname': 'Aristeus antennatus',
   'authority': '(Risso, 1816)',
   'status': 'accepted',
   'unacceptreason': None,
   'taxonRankID': 220,
   'rank': 'Species',
   'valid_AphiaID': 107083,
   'valid_name': 'Aristeus antennatus',
   'valid_authority': '(Risso, 1816)',
   'parentNameUsageID': 106807,
   'kingdom': 'Animalia',
   'phylum': 'Arthropoda',
   'class': 'Malacostraca',
   'order': 'Decapoda',
   'family': 'Aristeidae',
   'genus': 'Aristeus',
   'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',
   'lsid': 'urn:lsid:marinespecies.org:taxname:107083',
   'isMarine': 1,
   'isBrackish': 0,
   'isFreshwater': 0,
   'isTerrestrial': 0,
   'isExtinct': 0,
   'match_type': 'exact',
   'modified': '2022-08-24T09:48:14.813Z'}]]

Fuzzy matching for MARIS Lookup Tables

We use the fuzzy matching distance metrics provided by the jellyfish library (https://jamesturk.github.io/jellyfish).


source

Match

 Match (matched_id:int, matched_maris_name:str, source_name:str,
        match_score:int)

Match between a data provider name and a MARIS lookup table.

Exported source
@dataclass
class Match:
    "Match between a data provider name and a MARIS lookup table."
    matched_id: int
    matched_maris_name: str
    source_name: str
    match_score: int
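
For instance, matching the provider name "CS-137" to the MARIS name "cs137" (see the nuclide example below) gives a Levenshtein distance of 1, the hyphen being the only edit:

Match(matched_id=33, matched_maris_name='cs137', source_name='CS-137', match_score=1)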

source

match_maris_lut

 match_maris_lut (lut_path:str, data_provider_name:str, maris_id:str,
                  maris_name:str, dist_fn:Callable=levenshtein_distance,
                  nresults:int=10)

Fuzzy matching data provider and MARIS lookup tables (e.g. biota species, sediments, …).

Type Default Details
lut_path str Path to authoritative MARIS look-up table
data_provider_name str Name of data provider nomenclature item to look up
maris_id str Id of MARIS lookup table nomenclature item to match
maris_name str Name of MARIS lookup table nomenclature item to match
dist_fn Callable levenshtein_distance Distance function
nresults int 10 Maximum number of results to return
Returns DataFrame
Exported source
def match_maris_lut(
    lut_path: str, # Path to authoritative MARIS look-up table
    data_provider_name: str, # Name of data provider nomenclature item to look up 
    maris_id: str, # Id of MARIS lookup table nomenclature item to match
    maris_name: str, # Name of MARIS lookup table nomenclature item to match
    dist_fn: Callable = jf.levenshtein_distance, # Distance function
    nresults: int = 10 # Maximum number of results to return
) -> pd.DataFrame:
    "Fuzzy matching data provider and MARIS lookup tables (e.g biota species, sediments, ...)."
    df = pd.read_excel(lut_path)
    df = df.dropna(subset=[maris_name])
    df = df.astype({maris_id: 'int'})
    df['score'] = df[maris_name].str.lower().apply(lambda x: dist_fn(data_provider_name.lower(), x))
    df = df.sort_values(by='score', ascending=True)[:nresults]
    return df[[maris_id, maris_name, 'score']]

Below is an example matching the name “PLANKTON” against the dbo_species_cleaned.xlsx MARIS biota species lookup table:

lut_fname = '../files/lut/dbo_species_cleaned.xlsx'
match_maris_lut(lut_fname, data_provider_name='PLANKTON', 
                maris_id='species_id', maris_name='species')
species_id species score
281 280 Plankton 0
696 695 Zooplankton 3
633 632 Palaemon 4
697 696 Phytoplankton 5
812 811 Chanos 5
160 159 Neuston 5
234 233 Penaeus 6
1458 1457 Lamnidae 6
1438 1437 Labrus 6
1527 1526 Favites 6

Below is an example matching the name “GLACIAL” against the dbo_sedtype.xlsx MARIS sediment lookup table:

lut_fname = '../files/lut/dbo_sedtype.xlsx'
match_maris_lut(lut_fname, data_provider_name='GLACIAL', 
                maris_id='sedtype_id', maris_name='sedtype')
sedtype_id sedtype score
26 25 Glacial 0
3 2 Gravel 4
2 1 Clay 5
51 50 Glacial clay 5
4 3 Marsh 6
7 6 Sand 6
13 12 Silt 6
15 14 Sludge 6
27 26 Soft 7
52 51 Soft clay 7

And matching the name “CS-137” against the dbo_nuclide.xlsx MARIS nuclide lookup table:

lut_fname = '../files/lut/dbo_nuclide.xlsx'
match_maris_lut(lut_fname, data_provider_name='CS-137', 
                maris_id='nuclide_id', maris_name='nc_name')
nuclide_id nc_name score
31 33 cs137 1
30 31 cs134 2
99 102 cs136 2
29 30 cs127 2
111 114 ce139 3
109 112 sb127 3
8 7 co57 4
28 29 i131 4
71 74 cm243 4
90 93 sn117m 4
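
Any distance function with the signature (str, str) -> number can be plugged in via dist_fn; for instance, a sketch using another jellyfish metric:

match_maris_lut(lut_fname, data_provider_name='CS-137',
                maris_id='nuclide_id', maris_name='nc_name',
                dist_fn=jf.damerau_levenshtein_distance)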

Test


source

test_dfs

 test_dfs (dfs1:dict, dfs2:dict)

Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order).

Type Details
dfs1 dict First dictionary of DataFrames to compare
dfs2 dict Second dictionary of DataFrames to compare
Returns None It raises an AssertionError if the DataFrames are not equal
Exported source
def test_dfs(
    dfs1:dict, # First dictionary of DataFrames to compare 
    dfs2:dict # Second dictionary of DataFrames to compare
    ) -> None: # It raises an `AssertionError` if the DataFrames are not equal
    "Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order)."
    for grp in dfs1.keys():
        df1, df2 = (df.sort_index() for df in (dfs1[grp], dfs2[grp]))
        fc.test_eq(df1, df2.reindex(columns=df1.columns))
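
For example, two dictionaries holding the same DataFrames with columns in a different order compare equal (data invented for illustration):

dfs_a = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137'], 'VALUE': [1.0]})}
dfs_b = {'seawater': pd.DataFrame({'VALUE': [1.0], 'NUCLIDE': ['cs137']})}
test_dfs(dfs_a, dfs_b)  # passes: columns are reordered before comparison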