Example Notebook - Chemoecology Tools documentation

# # Run if on Colab
# !mkdir -p ../example_data/
# !curl -o ../example_data/lbal23_metadata.csv https://raw.githubusercontent.com/Wolfffff/chemoecology-tools/main/docs/example_data/lbal23_metadata.csv
# !curl -o ../example_data/lbal23_relative_abundance.csv https://raw.githubusercontent.com/Wolfffff/chemoecology-tools/main/docs/example_data/lbal23_relative_abundance.csv
# !curl -o ../example_data/user_provided_chemical_metadata.yaml https://raw.githubusercontent.com/Wolfffff/chemoecology-tools/main/docs/example_data/user_provided_chemical_metadata.yaml

# # Run if you have not already installed chemoeocology-tools
# !pip install git+https://github.com/Wolfffff/chemoecology-tools.git

from chemoecology_tools.core import GCMSExperiment
from chemoecology_tools.analysis import (
    calculate_enrichment_table,
    perform_nmds,
    perform_lda,
    perform_pca,
    perform_random_forest,
    calculate_compositional_stats,
)
from chemoecology_tools.visualization import (
    plot_nmds,
    plot_lda,
    plot_pca,
    plot_rf_importance,
)

USER_PROVIDED_CHEMICAL_METADATA_PATH = "../example_data/user_provided_chemical_metadata.yaml"
ABUNDANCE_DATA_PATH = "../example_data/lbal23_relative_abundance.csv"
USER_PROVIDED_SAMPLE_METADATA_PATH = "../example_data/lbal23_metadata.csv"
ID_COLUMN = "Bee #"
FILTER_DICT={
    "CHC File Located": ["NO", "Bleed"],
    "Quality": ["poor", "contaminated"],
    "Nest Code": ["E-JS-030", "E-JS-033", "E-JS-037", "E-KO-140", "E-KO-143"]
}

# Load experimental data with metadata
experiment = GCMSExperiment.from_files(
    abundance_path=ABUNDANCE_DATA_PATH,
    metadata_path=USER_PROVIDED_SAMPLE_METADATA_PATH,
    id_col=ID_COLUMN,
    filter_dict=FILTER_DICT,
    fetch_pubchem=True,
    user_chemical_metadata=USER_PROVIDED_CHEMICAL_METADATA_PATH,
)

Error fetching PubChem data for Unknown RI2496: 'PUGREST.ServerBusy'
Error fetching PubChem data for 11/13-methylpentacosane?: 'PUGREST.ServerBusy'
Error fetching PubChem data for Docosene: 'PUGREST.ServerBusy'
Error fetching PubChem data for Octacosane: 'PUGREST.ServerBusy'
Error fetching PubChem data for Tetracosanolide: 'PUGREST.ServerBusy'
Error fetching PubChem data for Nonacosene: 'PUGREST.ServerBusy'

print(experiment)

Unnamed experiment: 84 samples, 48 chemicals measured

# Filter and process data
filtered = experiment.filter_trace_compounds(threshold=0.005)
relative = filtered.calculate_relative_abundance()

filtered.metadata_df.head()

	Bee #	Nest Code	CHC File Located	All Nestmates?	Social Phenotype	Date	Reproductivity	Caste	Notes	Brood Notes	...	Ovariole Length	Oocyte Index	DF Width	DF Length (mm)	Behavior Score	Behavior Score Subordinate Classifier	Behavioral Notes	Bead Assay	Dissection Type	CHC Notes
0	129	A-KO-053	yes	Yes	Solitary	6/26/23	Some_rep	Sol_rep	NaN	NaN	...	2.103	0.230623	0.531	3.237	No contact	NaN	pilot assay	Pentane	RNAlater ICE	NaN
1	135	A-KO-055	yes	Yes	Solitary	6/26/23	Some_rep	Sol_nr	NaN	1 pupa collected, squished 2 pupa	...	3.148	0.307814	0.476	2.917	NaN	NaN	No assay	No assay	RNAlater ICE	NaN
2	137	F-KO-056	yes	Yes	Solitary	6/26/23	Some_rep	Sol_nr	NaN	Brood squished	...	2.279	0.237385	0.490	3.177	No contact	NaN	pilot assay	Pentane	RNAlater ICE	NaN
3	138	E-SW-001	yes	Yes	Solitary	6/26/23	Some_rep	Sol_rep	NaN	No brood	...	2.070	0.397101	0.479	3.480	NaN	NaN	No assay	No assay	RNAlater ICE	NaN
4	139	E-SW-002	yes	Yes	Solitary	6/26/23	No_rep	Sol_nr	NaN	No brood	...	2.133	0.157525	0.522	3.293	Pass	Neutral	pilot assay, agitated with tapping and q tip	Mono alkenes	RNAlater ICE	NaN

5 rows × 23 columns

import pandas as pd

meta_mask = filtered.metadata_df["Caste"].isin(["Sol_rep", "Sol_nr"])
chem_mask = pd.Series([
    filtered.get_chemical_property(c, "class") == "Alkane"
    for c in filtered.chemical_cols
], index=filtered.chemical_cols)

filtered_exp = experiment.filter(
    metadata_mask=meta_mask,
    chemical_mask=chem_mask
)
print(filtered_exp)

Unnamed experiment: 21 samples, 9 chemicals measured

# Combine abundance data by chemical metadata...
# combined = filtered_exp.combine_chemicals_by_feature("class")

nmds_df = perform_nmds(filtered)
nmds_df.head()

	NMDS1	NMDS2
0	0.670753	2.710467
1	-15.641079	-1.063457
2	-10.074518	4.100717
3	11.310078	3.628408
4	9.600724	7.657540

# Create plots with different groupings
# By caste
fig1 = plot_nmds(
    experiment=filtered,
    nmds_coords=nmds_df,
    group_col="Caste",
    title="Chemical Profiles by Caste"
)

../_images/382965a998edc7b75acbfc12d1f8fe172eb50b10a2a2c9909b039a7fb17b0226.png

clr_output = calculate_compositional_stats(filtered)

results = perform_pca(clr_output) 

pca_coords = results["coords"]
explained_var = results["explained_variance"]

print(f"PCA explained variance ratios: {explained_var}")
# Create PCA plot
fig2 = plot_pca(
    pca_results=results,
    experiment=filtered,
    group_col="Caste",
    title="Chemical Profiles by Caste (PCA)"
)

PCA explained variance ratios: [3.53162722e-01 1.78285414e-01 8.77623197e-02 6.43034507e-02
81054684e-02 3.78831753e-02 2.82389037e-02 2.60710224e-02
17973502e-02 1.83375154e-02 1.61620581e-02 1.37813931e-02
16338884e-02 9.94672142e-03 9.10746467e-03 8.10708761e-03
70011676e-03 7.28735491e-03 6.03808246e-03 5.19693220e-03
75670924e-03 4.60940463e-03 4.10461136e-03 3.90091043e-03
58724810e-03 2.92467841e-03 2.66686918e-03 2.26478208e-03
90540880e-03 1.84722874e-03 1.53303962e-03 1.36000268e-03
71077172e-04 9.04469882e-04 7.52151016e-04 6.42434388e-04
64373939e-04 4.13706697e-04 3.78429755e-04 2.82229042e-04
56582144e-04 1.50006025e-04 1.29181746e-04 8.19861429e-05
06956306e-05 2.92965155e-05 1.40176067e-05 1.00276624e-05
60168581e-31]

../_images/a4609f1645f2a2e488076d3d3b969e3990342a14d6e374add8a70bc622d0194b.png

# Initialize group column and title prefix
group_col = "Caste"
title_prefix = "Chemical Profiles"

# LDA Analysis
transformed_data = calculate_compositional_stats(filtered)
lda_results = perform_lda(
    transformed_data=transformed_data,
    experiment=filtered,
    group_col=group_col
)

fig_lda = plot_lda(
    lda_results=lda_results,
    experiment=filtered,
    group_col=group_col,
    title=f"{title_prefix} by {group_col} (LDA)"
)

pca_results = perform_pca(transformed_data)

fig_pca = plot_pca(
    pca_results=pca_results,
    experiment=filtered,
    group_col=group_col,
    title=f"{title_prefix} by {group_col} (PCA)"
)

../_images/cd5438ea45314e125cf1d15bf9f5f84c81ae9a080a3f3a547196bc298eda9b28.png

# Generate enrichment table
enrichment_table = calculate_enrichment_table(
    experiment=filtered,
    group_column='Caste',
    class_column='class',
    alpha=0.05
)

enrichment_table.head()

	Chemical Class	Compound	KW_pvalue	Group Bias	Sol_rep	Sol_nr	W	Q	W_dev
7	Aldehyde	Eicosanal	0.249934		0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00
2	Aldehyde	Octadecanal	0.390423		0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00	0.00 ± 0.00
3	Alkane	Heneicosane	0.000008	Sol_nr, Sol_rep	1.87 ± 0.58	1.72 ± 0.61	0.05 ± 0.00	0.09 ± 0.02	0.08 ± 0.02
26	Alkane	Heptacosane	0.043065	W_dev	2.29 ± 0.35	2.15 ± 0.33	2.06 ± 0.18	2.84 ± 0.26	3.35 ± 0.33
33	Alkane	Nonacosane	0.001116	Q, Sol_nr, Sol_rep, W_dev	0.24 ± 0.04	0.22 ± 0.04	0.08 ± 0.01	0.25 ± 0.09	0.22 ± 0.03

rf_results = perform_random_forest(
    transformed_data=transformed_data,
    experiment=filtered,
    group_col="Caste",
    test_size=0.3
)

rf_fig = plot_rf_importance(rf_results, title="Random Forest Feature Importance")

/Users/wolf/git/chemoecology-tools/src/chemoecology_tools/visualization/plotting.py:270: UserWarning: The figure layout has changed to tight
  plt.tight_layout()

../_images/3851af91ffd4122d05df8942d3dc9be5abea92800ffe8a7a65a0f0704eda2c99.png

Example Notebook¶