import pandas as pd
from pathlib import Path
from sklearn.metrics import r2_score
from uhina.loading import LoaderFactory, plot_spectra
from uhina.preprocessing import TakeDerivative, SNV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
import warnings
# Notebook-wide settings: silence all warnings and let pandas display
# up to 100 rows before truncating.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
import plotly
import plotly.express as px
import numpy as np
from astartes import train_val_test_split
# OSSL vs Ringtrial
# Exploratory Data Analysis of OSSL vs Ringtrial
# Loading data
# Load the ring-trial spectra with potassium as the analyte.
src = Path.home() / 'pro/data/woodwell-ringtrial/drive-download-20231013T123706Z-001'
loader = LoaderFactory.get_loader(src, 'ringtrial')
data_rt = loader.load_data(analytes='potassium_cmolkg')

# Tag every sample with the text preceding the first '-rt' in its sample index
# (used further down to select one ring-trial dataset at a time).
data_rt.ds = np.array([s.split('-rt')[0] for s in data_rt.sample_indices])

print(f'X shape: {data_rt.X.shape}')
# Trailing semicolon keeps the notebook from echoing the returned value.
plot_spectra(data_rt, n_spectra=100, snv=True, alpha=0.2, figsize=(12, 3));
X shape: (1400, 1676)
# Load the OSSL MIR spectra with extractable potassium as the analyte.
src = Path.home() / '.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz'
loader = LoaderFactory.get_loader(src, 'ossl', spectra_type='mir')
data_ossl = loader.load_data(analytes='k.ext_usda.a725_cmolc.kg')
print(f'X shape: {data_ossl.X.shape}')
Loading data from /Users/franckalbinet/.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz ...
X shape: (57674, 1676)
# Plot 100 OSSL spectra with the same display settings as the ring-trial plot.
plot_spectra(data_ossl, n_spectra=100, snv=True, alpha=0.2, figsize=(12, 3))
(<Figure size 1200x300 with 1 Axes>,
<Axes: xlabel='Wavenumber ($cm^{-1}$)', ylabel='Absorbance'>)
def pca_rt_ossl(data_rt, data_ossl, n_components=3):
    """Project ring-trial and OSSL spectra into one shared PCA space.

    The spectra of both datasets are stacked (ring-trial rows first), run
    through a single SNV -> derivative -> standard-scaling -> PCA pipeline
    fitted on the stacked matrix, and the resulting scores are split back
    and stored on each data object as ``X_pca``.

    Args:
        data_rt: Ring-trial data object exposing a 2D array ``X``.
        data_ossl: OSSL data object exposing a 2D array ``X`` with the same
            number of columns as ``data_rt.X`` (required by the stacking).
        n_components (int, optional): Number of principal components to
            keep. Defaults to 3.

    Returns:
        tuple: ``(data_rt, data_ossl)`` — the same objects that were passed
        in, each with a new ``X_pca`` attribute (modified in place).
    """
    n_rt = data_rt.X.shape[0]

    # Combine the spectra from both datasets so they share one projection
    X_combined = np.vstack((data_rt.X, data_ossl.X))

    # Create the pipeline
    pipe = Pipeline([
        ('SNV', SNV()),
        ('Derivative', TakeDerivative()),
        ('Scaler', StandardScaler()),
        ('PCA', PCA(n_components=n_components)),
    ])

    # Fit and transform the combined data
    X_pca_combined = pipe.fit_transform(X_combined)

    # Split the results back into rt and ossl (rt rows were stacked first)
    data_rt.X_pca = X_pca_combined[:n_rt]
    data_ossl.X_pca = X_pca_combined[n_rt:]

    return data_rt, data_ossl
# Attach the shared-space PCA scores (X_pca) to both datasets.
data_rt, data_ossl = pca_rt_ossl(data_rt, data_ossl)

from matplotlib import pyplot as plt

# Quick look at the OSSL scores on the first two components.
plt.scatter(data_ossl.X_pca[:, 0], data_ossl.X_pca[:, 1], s=5, alpha=0.1)

# Draw a random subset of OSSL scores, without replacement.
n_samples = 100
idx = np.random.choice(data_ossl.X_pca.shape[0],
                       size=n_samples, replace=False)

X_ossl_subset = data_ossl.X_pca[idx]
# Lookup table: position in data_ossl.dataset_labels -> dataset label.
lut_ossl_ds = dict(enumerate(data_ossl.dataset_labels))
data_ossl.dataset_names
array([0, 0, 0, ..., 3, 3, 3])
np.vectorize(lut_ossl_ds.get)(data_ossl.dataset_names)
array(['GARRETT.SSL', 'GARRETT.SSL', 'GARRETT.SSL', ...,
'LUCAS.WOODWELL.SSL', 'LUCAS.WOODWELL.SSL', 'LUCAS.WOODWELL.SSL'],
dtype='<U18')
data_rt.X_pca
array([[-37.57209416, 10.0194439 , 1.41901894],
[ 24.05922823, 10.83135662, 12.43538485],
[ 8.09090194, -26.63507173, 7.36763935],
...,
[ 8.44862862, -4.47373073, 2.51161567],
[ 15.9657049 , 8.60238863, -19.86332133],
[ 2.45832757, 6.53856188, 3.22457499]])
# Boolean mask selecting the ring-trial samples tagged 'kssl'.
mask_rt_ds = data_rt.ds == 'kssl'
data_ossl.X_pca
array([[ 2.45822505, 16.4823169 , 9.00422903],
[ -2.36297831, 10.87565108, 7.6014877 ],
[ -4.66533181, 11.81300838, 7.77196598],
...,
[ 33.2311886 , -6.53547903, 8.0031913 ],
[ 8.18561655, 24.05135848, 14.03594385],
[-11.41377595, 8.67332126, -6.78131327]])
def data_to_df(data_ossl, data_rt,
               n_samples_ossl=100, rt_ds='kssl',
               cols=('PC1', 'PC2', 'PC3'),
               random_state=None):
    """Build one tidy DataFrame of PCA scores for OSSL and a ring-trial dataset.

    A random subset of OSSL rows (drawn without replacement) is combined with
    every ring-trial row whose ``data_rt.ds`` equals ``rt_ds``.

    Args:
        data_ossl: Object exposing ``X_pca`` (2D scores), ``dataset_names``
            (integer codes, one per row) and ``dataset_labels`` (label for
            each code, indexed by code).
        data_rt: Object exposing ``X_pca`` (2D scores) and ``ds`` (array of
            dataset names, one per row).
        n_samples_ossl (int, optional): Number of OSSL rows to draw. Clamped
            to the number of available rows. Defaults to 100.
        rt_ds (str, optional): Ring-trial dataset to keep. Defaults to 'kssl'.
        cols (sequence of str, optional): Column names for the score columns;
            must match the number of columns in ``X_pca``.
        random_state (int, optional): Seed for a reproducible OSSL subset.
            When None (default) NumPy's global random state is used.

    Returns:
        pd.DataFrame: Score columns named after ``cols`` plus a ``'ds'``
        column holding the OSSL dataset label, or ``rt_ds + '-rt'`` for
        ring-trial rows. OSSL rows come first; the index is reset.
    """
    cols = list(cols)

    # Keep using the global generator by default so that callers who seed it
    # with np.random.seed() get the same draws as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sampling without replacement cannot return more rows than exist.
    n_ossl = data_ossl.X_pca.shape[0]
    n_draw = min(n_samples_ossl, n_ossl)
    idx = rng.choice(n_ossl, size=n_draw, replace=False)

    # Translate integer dataset codes to labels by direct indexing; unlike
    # np.vectorize this also works when the selection is empty.
    labels = np.asarray(data_ossl.dataset_labels)
    codes = np.asarray(data_ossl.dataset_names)[idx]

    df_ossl = pd.DataFrame(data_ossl.X_pca[idx, :], columns=cols)
    df_ossl['ds'] = labels[codes]

    mask = data_rt.ds == rt_ds
    df_rt = pd.DataFrame(data_rt.X_pca[mask], columns=cols)
    # Suffix distinguishes ring-trial rows from an OSSL dataset of similar name.
    df_rt['ds'] = rt_ds + '-rt'

    return pd.concat([df_ossl, df_rt], axis=0, ignore_index=True)
# Combined score table: 200 random OSSL rows plus the default ring-trial dataset.
df = data_to_df(data_ossl, data_rt, n_samples_ossl=200)
def plot_scatter3d(df, size_dict=None, default_opacity=0.7):
    """
    Generates a nicely formatted 3D scatter plot of the data with different symbols, colors, and sizes for each dataset.

    The figure is displayed with ``fig.show()``; nothing is returned and the
    input DataFrame is left unmodified.

    Args:
        df (pd.DataFrame): DataFrame containing the data with columns 'PC1', 'PC2', 'PC3', 'ds'
        size_dict (dict, optional): Dictionary mapping dataset names to dot sizes. Defaults to None.
        default_opacity (float, optional): Default opacity for all points. Defaults to 0.7.
    """
    # Size used for any dataset that has no entry in size_dict
    default_size = 20

    # If size_dict is not provided, initialize it with default values
    if size_dict is None:
        size_dict = {}

    # Build the dot_size column on a copy (assign) so the caller's DataFrame
    # does not silently gain a helper column.
    plot_df = df.assign(
        dot_size=df['ds'].map(lambda name: size_dict.get(name, default_size))
    )

    fig = px.scatter_3d(
        plot_df,
        x='PC1',
        y='PC2',
        z='PC3',
        color='ds',
        symbol='ds',
        size='dot_size',
        opacity=default_opacity,
        hover_data=['ds'],
        color_discrete_sequence=px.colors.qualitative.Set1,
    )

    # Zero-width marker outline
    fig.update_traces(marker=dict(line=dict(width=0)))

    fig.update_layout(
        scene=dict(
            xaxis_title='PC1',
            yaxis_title='PC2',
            zaxis_title='PC3',
        ),
        legend_title='Dataset',
        height=800,
        width=1000,
    )

    fig.show()
data_ossl.dataset_labels
array(['GARRETT.SSL', 'ICRAF.ISRIC', 'KSSL.SSL', 'LUCAS.WOODWELL.SSL'],
dtype=object)
np.unique(data_rt.ds)
array(['agrocares', 'argonne', 'csu-il', 'eth-alpha-1', 'eth-alpha-2',
'eth-vertex', 'iaea-aug2022', 'kssl', 'landcare', 'lesotho', 'msu',
'osu', 'rothamsted', 'scion', 'ughent', 'uiuc', 'usp',
'uwisc-fine', 'woodwell-alpha', 'woodwell-vertex'], dtype='<U15')
# Dot sizes per OSSL dataset; datasets not listed here (e.g. the ring-trial
# one) fall back to plot_scatter3d's default size.
size_dict = {
    'KSSL.SSL': 1,
    'GARRETT.SSL': 3,
    'ICRAF.ISRIC': 3,
    'LUCAS.WOODWELL.SSL': 3,
}

plot_scatter3d(
    data_to_df(data_ossl, data_rt, n_samples_ossl=5000, rt_ds='iaea-aug2022'),
    size_dict=size_dict)
Unable to display output for mime type(s): application/vnd.plotly.v1+json