Exported source
NA = 'Not available'
We define below useful constants throughout the package.
Abstracting some common operations.
get_unique_across_dfs (dfs:dict, col_name:str='NUCLIDE', as_df:bool=False, include_nchars:bool=False)
Get a list of unique column values across dataframes.
Type | Default | Details | |
---|---|---|---|
dfs | dict | Dictionary of dataframes | |
col_name | str | NUCLIDE | Column name to extract unique values from |
as_df | bool | False | Return a DataFrame of unique values |
include_nchars | bool | False | Add a column with the number of characters in the value |
Returns | list | Returns a list of unique column values across dataframes |
def get_unique_across_dfs(dfs:dict, # Dictionary of dataframes
                          col_name:str='NUCLIDE', # Column name to extract unique values from
                          as_df:bool=False, # Return a DataFrame of unique values
                          include_nchars:bool=False # Add a column with the number of characters in the value
                          ) -> list: # Returns a list of unique column values across dataframes
    "Get a list of unique column values across dataframes."
    # Accumulate uniques from every dataframe that actually has the column;
    # dataframes missing `col_name` are simply skipped.
    collected = set()
    for df in dfs.values():
        if col_name in df.columns:
            collected.update(df[col_name].unique())
    uniques = list(collected)
    if not as_df:
        return uniques
    df_uniques = pd.DataFrame(uniques, columns=['value']).reset_index()
    if include_nchars:
        df_uniques['n_chars'] = df_uniques['value'].str.len()
    return df_uniques
Example of use:
dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
'sediment': pd.DataFrame({'NUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}
fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')),
set(['cs134', 'cs137', 'cs134_137_tot']))
What if the column name is not in one of the dataframes?
dfs_test = {'seawater': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
'biota': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
'sediment': pd.DataFrame({'NONUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}
fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')),
set(['cs134', 'cs137', 'cs134_137_tot']))
index | value | n_chars | |
---|---|---|---|
0 | 0 | cs134 | 5 |
1 | 1 | cs137 | 5 |
2 | 2 | cs134_137_tot | 13 |
Remapper (provider_lut_df:pandas.core.frame.DataFrame, maris_lut_fn:Callable, maris_col_id:str, maris_col_name:str, provider_col_to_match:str, provider_col_key, fname_cache)
Remap a data provider lookup table to a MARIS lookup table using fuzzy matching.
Type | Details | |
---|---|---|
provider_lut_df | DataFrame | Data provider lookup table to be remapped |
maris_lut_fn | callable | Function that returns the MARIS lookup table path |
maris_col_id | str | MARIS lookup table column name for the id |
maris_col_name | str | MARIS lookup table column name for the name |
provider_col_to_match | str | Data provider lookup table column name for the name to match |
provider_col_key | Data provider lookup table column name for the key | |
fname_cache | Cache file name |
class Remapper():
    "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching."
    def __init__(self,
                 provider_lut_df:pd.DataFrame, # Data provider lookup table to be remapped
                 maris_lut_fn:callable, # Function that returns the MARIS lookup table path
                 maris_col_id:str, # MARIS lookup table column name for the id
                 maris_col_name:str, # MARIS lookup table column name for the name
                 provider_col_to_match:str, # Data provider lookup table column name for the name to match
                 provider_col_key, # Data provider lookup table column name for the key
                 fname_cache # Cache file name
                 ):
        fc.store_attr()
        # Pickled results live under the package cache directory.
        self.cache_file = cache_path() / fname_cache
        self.maris_lut = maris_lut_fn()
        self.lut = {}

    def generate_lookup_table(self,
                              fixes={}, # Lookup table fixes
                              as_df=True, # Whether to return a DataFrame
                              overwrite=True):
        "Generate a lookup table from a data provider lookup table to a MARIS lookup table using fuzzy matching."
        self.fixes = fixes
        self.as_df = as_df
        # Reuse the pickled table only when allowed and present.
        use_cache = self.cache_file.exists() and not overwrite
        if use_cache:
            self.lut = fc.load_pickle(self.cache_file)
        else:
            self._create_lookup_table()
            fc.save_pickle(self.cache_file, self.lut)
        return self._format_output()

    def _create_lookup_table(self):
        # Fuzzy-match every provider row, recording results in self.lut.
        rows = self.provider_lut_df
        for _, row in tqdm(rows.iterrows(), total=len(rows), desc="Processing"):
            self._process_row(row)

    def _process_row(self, row):
        # Match one provider row against the MARIS lookup table.
        value_to_match = row[self.provider_col_to_match]
        key = row[self.provider_col_key]
        if not isinstance(value_to_match, str):
            # Non-string values (e.g. NaN) cannot be fuzzy-matched.
            self.lut[key] = Match(-1, "Unknown", value_to_match, 0)
            return
        # Manual fixes take precedence over the raw provider value.
        name_to_match = self.fixes.get(value_to_match, value_to_match)
        best = match_maris_lut(self.maris_lut, name_to_match,
                               self.maris_col_id, self.maris_col_name).iloc[0]
        self.lut[key] = Match(best[self.maris_col_id], best[self.maris_col_name],
                              value_to_match, best['score'])

    def select_match(self, match_score_threshold:int=1):
        # Keep only entries whose score meets the threshold.
        self.lut = {key: match for key, match in self.lut.items()
                    if match.match_score >= match_score_threshold}
        return self._format_output()

    def _format_output(self):
        # Return the raw dict, or a DataFrame sorted by descending score.
        if not self.as_df:
            return self.lut
        df_lut = pd.DataFrame.from_dict(self.lut, orient='index',
                                        columns=['matched_maris_name', 'source_name', 'match_score'])
        df_lut.index.name = 'source_key'
        return df_lut.sort_values(by='match_score', ascending=False)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 2 1 #| exports ----> 2 class Remapper(): 3 "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching." 4 def __init__(self, 5 provider_lut_df:pd.DataFrame, # Data provider lookup table to be remapped 6 maris_lut_fn:callable, # Function that returns the MARIS lookup table path (...) 11 fname_cache # Cache file name 12 ): Cell In[2], line 5, in Remapper() 2 class Remapper(): 3 "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching." 4 def __init__(self, ----> 5 provider_lut_df:pd.DataFrame, # Data provider lookup table to be remapped 6 maris_lut_fn:callable, # Function that returns the MARIS lookup table path 7 maris_col_id:str, # MARIS lookup table column name for the id 8 maris_col_name:str, # MARIS lookup table column name for the name 9 provider_col_to_match:str, # Data provider lookup table column name for the name to match 10 provider_col_key, # Data provider lookup table column name for the key 11 fname_cache # Cache file name 12 ): 13 fc.store_attr() 14 self.cache_file = cache_path() / fname_cache NameError: name 'pd' is not defined
has_valid_varname (var_names:list, cdl_path:str, group=None)
Check that proposed variable names are in MARIS CDL
Type | Default | Details | |
---|---|---|---|
var_names | list | variable names | |
cdl_path | str | Path to MARIS CDL file (point of truth) | |
group | NoneType | None | Check if the variable names is contained in the group |
def has_valid_varname(
    var_names:list, # variable names
    cdl_path:str, # Path to MARIS CDL file (point of truth)
    group = None, # Check if the variable names is contained in the group
):
    "Check that proposed variable names are in MARIS CDL"
    has_valid = True
    with Dataset(cdl_path) as nc:
        # Collect the variable names declared in each group of the CDL.
        cdl_vars = {}
        all_vars = []
        for grp in nc.groups.values():
            grp_vars = list(grp.variables.keys())  # avoid shadowing builtin `vars`
            cdl_vars[grp.name] = grp_vars
            all_vars.extend(grp_vars)
    if group is not None:
        # NOTE(review): raises KeyError for an unknown group name — confirm
        # whether callers expect that or a friendlier message.
        allowed_vars = cdl_vars[group]
    else:
        # Set gives O(1) membership tests and de-duplicates across groups.
        allowed_vars = set(all_vars)
    for name in var_names:
        if name not in allowed_vars:
            has_valid = False
            if group is not None:
                print(f'"{name}" variable name not found in group "{group}" of MARIS CDL')
            else:
                print(f'"{name}" variable name not found in MARIS CDL')
    return has_valid
get_bbox (df, coord_cols=('lon', 'lat'))
Get the bounding box of a DataFrame.
ddmm_to_dd (ddmmmm:float)
Type | Details | |
---|---|---|
ddmmmm | float | Coordinates in degrees/minutes decimal format |
Returns | float | Coordinates in degrees decimal format |
download_file (owner, repo, src_dir, dest_dir, fname)
def download_files_in_folder(owner:str,
                             repo:str,
                             src_dir:str,
                             dest_dir:str
                             ):
    "Make a GET request to the GitHub API to get the contents of the folder."
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    # Download each entry of type "file"; sub-directories are ignored.
    for item in response.json():
        if item["type"] == "file":
            download_file(owner, repo, src_dir, dest_dir, item["name"])
def download_file(owner, repo, src_dir, dest_dir, fname):
    # Fetch the raw file contents from GitHub and save them under dest_dir.
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{src_dir}/{fname}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    destination = Path(dest_dir) / fname
    with open(destination, "wb") as file:
        file.write(response.content)
    print(f"{fname} downloaded successfully.")
download_files_in_folder (owner:str, repo:str, src_dir:str, dest_dir:str)
Make a GET request to the GitHub API to get the contents of the folder.
The World Register of Marine Species (WoRMS) is an authoritative classification and catalogue of marine names. It provides a REST API (among others) allowing to “fuzzy” match any species name you might encounter in marine data source names against their own database. There are several types of matches as described here.
match_worms (name:str)
Lookup name
in WoRMS (fuzzy match).
Type | Details | |
---|---|---|
name | str | Name of species to look up in WoRMS |
def match_worms(
    name:str # Name of species to look up in WoRMS
):
    "Lookup `name` in WoRMS (fuzzy match)."
    response = requests.get(
        'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames',
        params={
            'scientificnames[]': [name],
            'marine_only': 'true'
        },
        headers={'accept': 'application/json'})
    # -1 signals any non-200 response (lookup failed).
    if response.status_code != 200:
        return -1
    return response.json()
For instance:
[[{'AphiaID': 107083,
'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083',
'scientificname': 'Aristeus antennatus',
'authority': '(Risso, 1816)',
'status': 'accepted',
'unacceptreason': None,
'taxonRankID': 220,
'rank': 'Species',
'valid_AphiaID': 107083,
'valid_name': 'Aristeus antennatus',
'valid_authority': '(Risso, 1816)',
'parentNameUsageID': 106807,
'kingdom': 'Animalia',
'phylum': 'Arthropoda',
'class': 'Malacostraca',
'order': 'Decapoda',
'family': 'Aristeidae',
'genus': 'Aristeus',
'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',
'lsid': 'urn:lsid:marinespecies.org:taxname:107083',
'isMarine': 1,
'isBrackish': 0,
'isFreshwater': 0,
'isTerrestrial': 0,
'isExtinct': 0,
'match_type': 'exact',
'modified': '2022-08-24T09:48:14.813Z'}]]
Using https://jamesturk.github.io/jellyfish fuzzy matching distance metrics.
Match (matched_id:int, matched_maris_name:str, source_name:str, match_score:int)
Match between a data provider name and a MARIS lookup table.
match_maris_lut (lut_path:str, data_provider_name:str, maris_id:str, maris_name:str, dist_fn:collections.abc.Callable=levenshtein_distance, nresults:int=10)
Fuzzy matching data provider and MARIS lookup tables (e.g biota species, sediments, …).
Type | Default | Details | |
---|---|---|---|
lut_path | str | Path to MARIS species authoritative species look-up table | |
data_provider_name | str | Name of data provider nomenclature item to look up | |
maris_id | str | Id of MARIS lookup table nomenclature item to match | |
maris_name | str | Name of MARIS lookup table nomenclature item to match | |
dist_fn | Callable | levenshtein_distance | Distance function |
nresults | int | 10 | Maximum number of results to return |
Returns | DataFrame |
def match_maris_lut(
    lut_path: str, # Path to MARIS species authoritative species look-up table
    data_provider_name: str, # Name of data provider nomenclature item to look up
    maris_id: str, # Id of MARIS lookup table nomenclature item to match
    maris_name: str, # Name of MARIS lookup table nomenclature item to match
    dist_fn: Callable = jf.levenshtein_distance, # Distance function
    nresults: int = 10 # Maximum number of results to return
) -> pd.DataFrame:
    "Fuzzy matching data provider and MARIS lookup tables (e.g biota species, sediments, ...)."
    df = pd.read_excel(lut_path)
    # Rows without a name cannot be matched; ids are coerced to int for output.
    df = df.dropna(subset=[maris_name])
    df = df.astype({maris_id: 'int'})
    # Hoist the loop-invariant lowercase query out of the per-row lambda.
    query = data_provider_name.lower()
    df['score'] = df[maris_name].str.lower().apply(lambda candidate: dist_fn(query, candidate))
    # Lower score = better match; keep the nresults best.
    df = df.sort_values(by='score', ascending=True).head(nresults)
    return df[[maris_id, maris_name, 'score']]
Below an example trying to match the name “PLANKTON” with dbo_species_cleaned.xlsx
MARIS biota species lookup table:
lut_fname = '../files/lut/dbo_species_cleaned.xlsx'
match_maris_lut(lut_fname, data_provider_name='PLANKTON',
maris_id='species_id', maris_name='species')
species_id | species | score | |
---|---|---|---|
281 | 280 | Plankton | 0 |
696 | 695 | Zooplankton | 3 |
633 | 632 | Palaemon | 4 |
697 | 696 | Phytoplankton | 5 |
812 | 811 | Chanos | 5 |
160 | 159 | Neuston | 5 |
234 | 233 | Penaeus | 6 |
1458 | 1457 | Lamnidae | 6 |
1438 | 1437 | Labrus | 6 |
1527 | 1526 | Favites | 6 |
Below an example trying to match the name “GLACIAL” with dbo_sedtype.xlsx MARIS sediment lookup table:
lut_fname = '../files/lut/dbo_sedtype.xlsx'
match_maris_lut(lut_fname, data_provider_name='GLACIAL',
maris_id='sedtype_id', maris_name='sedtype')
sedtype_id | sedtype | score | |
---|---|---|---|
26 | 25 | Glacial | 0 |
3 | 2 | Gravel | 4 |
2 | 1 | Clay | 5 |
51 | 50 | Glacial clay | 5 |
4 | 3 | Marsh | 6 |
7 | 6 | Sand | 6 |
13 | 12 | Silt | 6 |
15 | 14 | Sludge | 6 |
27 | 26 | Soft | 7 |
52 | 51 | Soft clay | 7 |
lut_fname = '../files/lut/dbo_nuclide.xlsx'
match_maris_lut(lut_fname, data_provider_name='CS-137',
maris_id='nuclide_id', maris_name='nc_name')
nuclide_id | nc_name | score | |
---|---|---|---|
31 | 33 | cs137 | 1 |
30 | 31 | cs134 | 2 |
99 | 102 | cs136 | 2 |
29 | 30 | cs127 | 2 |
111 | 114 | ce139 | 3 |
109 | 112 | sb127 | 3 |
8 | 7 | co57 | 4 |
28 | 29 | i131 | 4 |
71 | 74 | cm243 | 4 |
90 | 93 | sn117m | 4 |
get_bbox (df, coord_cols=('lon', 'lat'))
download_file (owner, repo, src_dir, dest_dir, fname)
def download_files_in_folder(owner:str,
                             repo:str,
                             src_dir:str,
                             dest_dir:str
                             ):
    "Make a GET request to the GitHub API to get the contents of the folder"
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    response = requests.get(api_url)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    # Only entries of type "file" are fetched; directories are skipped.
    for entry in response.json():
        if entry["type"] == "file":
            download_file(owner, repo, src_dir, dest_dir, entry["name"])
def download_file(owner, repo, src_dir, dest_dir, fname):
    # GET the raw file from GitHub's CDN and write it into dest_dir.
    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{src_dir}/{fname}"
    response = requests.get(raw_url)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    with open(Path(dest_dir) / fname, "wb") as file:
        file.write(response.content)
    print(f"{fname} downloaded successfully.")
download_files_in_folder (owner:str, repo:str, src_dir:str, dest_dir:str)
Make a GET request to the GitHub API to get the contents of the folder
The World Register of Marine Species (WoRMS) is an authoritative classification and catalogue of marine names. It provides a REST API (among others) allowing to “fuzzy” match any species name you might encounter in marine data source names against their own database. There are several types of matches as described here.
match_worms (name:str)
Lookup name
in WoRMS (fuzzy match)
Type | Details | |
---|---|---|
name | str | Name of species to look up in WoRMS |
def match_worms(
    name:str # Name of species to look up in WoRMS
):
    "Lookup `name` in WoRMS (fuzzy match)"
    endpoint = 'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames'
    query = {
        'scientificnames[]': [name],
        'marine_only': 'true'
    }
    response = requests.get(endpoint, params=query,
                            headers={'accept': 'application/json'})
    # A non-200 status is reported to the caller as -1.
    return response.json() if response.status_code == 200 else -1
For instance:
[[{'AphiaID': 107083,
'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083',
'scientificname': 'Aristeus antennatus',
'authority': '(Risso, 1816)',
'status': 'accepted',
'unacceptreason': None,
'taxonRankID': 220,
'rank': 'Species',
'valid_AphiaID': 107083,
'valid_name': 'Aristeus antennatus',
'valid_authority': '(Risso, 1816)',
'parentNameUsageID': 106807,
'kingdom': 'Animalia',
'phylum': 'Arthropoda',
'class': 'Malacostraca',
'order': 'Decapoda',
'family': 'Aristeidae',
'genus': 'Aristeus',
'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',
'lsid': 'urn:lsid:marinespecies.org:taxname:107083',
'isMarine': 1,
'isBrackish': 0,
'isFreshwater': 0,
'isTerrestrial': 0,
'isExtinct': 0,
'match_type': 'exact',
'modified': '2022-08-24T09:48:14.813Z'}]]
test_dfs (dfs1:dict, dfs2:dict)
Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order).
Type | Details | |
---|---|---|
dfs1 | dict | First dictionary of DataFrames to compare |
dfs2 | dict | Second dictionary of DataFrames to compare |
Returns | None | It raises an AssertionError if the DataFrames are not equal |
def test_dfs(
    dfs1:dict, # First dictionary of DataFrames to compare
    dfs2:dict # Second dictionary of DataFrames to compare
) -> None: # It raises an `AssertionError` if the DataFrames are not equal
    "Compare two dictionaries of DataFrames for equality (also ensuring that columns are in the same order)."
    # Fail fast if the two dictionaries don't cover the same groups:
    # iterating dfs1 alone would silently ignore extra groups in dfs2 and
    # raise a KeyError (not an AssertionError) for groups missing from dfs2.
    fc.test_eq(set(dfs1.keys()), set(dfs2.keys()))
    for grp in dfs1:
        # Sort rows by index and align df2's columns to df1's order so the
        # comparison is insensitive to row/column ordering differences.
        df1, df2 = (df.sort_index() for df in (dfs1[grp], dfs2[grp]))
        fc.test_eq(df1, df2.reindex(columns=df1.columns))