Source code for chemoecology_tools.core.gcms_experiment

"""Container for GCMS experimental data and metadata."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import yaml

from ..utils.pubchem_utils import fetch_pubchem_data


[docs] class GCMSExperiment: """Gas Chromatography-Mass Spectrometry (GCMS) experimental data container. Manages GCMS abundance data, experimental metadata, and chemical properties. Attributes: abundance_df: DataFrame containing GCMS chemical abundance measurements metadata_df: DataFrame containing sample and experimental metadata id_col: Column name used to join abundance and metadata experiment_name: Optional identifier for the experiment chemical_metadata: Dictionary of chemical properties from config """ def __init__( self, abundance_df: pd.DataFrame, metadata_df: pd.DataFrame, id_col: str = "ID", experiment_name: str | None = None, chemical_metadata: dict[str, dict[str, Any]] | None = None, ) -> None: """Initialize GCMS experiment container. Args: abundance_df: DataFrame with GCMS abundance measurements metadata_df: DataFrame with sample metadata id_col: Column name to join abundance and metadata experiment_name: Optional experiment identifier chemical_metadata: Optional dict mapping chemical names to properties """ self.abundance_df = abundance_df self.metadata_df = metadata_df self.id_col = id_col self.experiment_name = experiment_name self.chemical_metadata = chemical_metadata or {} self._validate() self.chemical_cols = self._get_chemical_cols()
[docs] @classmethod def from_files( cls, abundance_path: str | Path, metadata_path: str | Path, user_chemical_metadata: str | Path | None = None, fetch_pubchem: bool = True, id_col: str = "ID", filter_dict: dict[str, list[str]] | None = None, experiment_name: str | None = None, ) -> GCMSExperiment: """Create experiment from data files. Args: abundance_path: Path to abundance data file metadata_path: Path to metadata file user_chemical_metadata: Optional path to chemical properties YAML fetch_pubchem: Whether to fetch PubChem data for chemicals id_col: Column name to join on filter_dict: Optional filtering criteria {column: values_to_exclude} experiment_name: Optional experiment identifier Returns: New GCMSExperiment instance """ abundance_df = pd.read_csv(abundance_path) metadata_df = pd.read_csv(metadata_path) chemical_cols = [col for col in abundance_df.columns if col != id_col] # Initialize chemical metadata chemical_metadata: dict[str, dict[str, Any]] = {} # Load user metadata if provided if user_chemical_metadata: with open(user_chemical_metadata, encoding="utf-8") as f: chemical_metadata = yaml.safe_load(f) # Fetch PubChem data if requested if fetch_pubchem: for chemical in chemical_cols: if chemical not in chemical_metadata: chemical_metadata[chemical] = {} pubchem_data = fetch_pubchem_data(chemical) chemical_metadata[chemical].update(pubchem_data) if filter_dict: for col, values in filter_dict.items(): if col in metadata_df.columns: metadata_df = metadata_df[~metadata_df[col].isin(values)] abundance_df = abundance_df[abundance_df[id_col].isin(metadata_df[id_col])] return cls( abundance_df, metadata_df, id_col, experiment_name, chemical_metadata )
def _validate(self) -> None: """Validate data consistency. Raises: ValueError: If validation fails """ if self.id_col not in self.abundance_df.columns: raise ValueError(f"ID column {self.id_col!r} not found in abundance data") if self.id_col not in self.metadata_df.columns: raise ValueError(f"ID column {self.id_col!r} not found in metadata") # Validate chemical metadata if provided if self.chemical_metadata: unknown_chemicals = set(self._get_chemical_cols()) - set( self.chemical_metadata.keys() ) if unknown_chemicals: print(f"WARNING: Unknown chemicals in metadata: {unknown_chemicals}") def _get_chemical_cols(self) -> list[str]: """Get chemical measurement columns from abundance data. Returns: List of column names excluding the ID column """ return [col for col in self.abundance_df.columns if col != self.id_col]
[docs] def merge(self) -> pd.DataFrame: """Merge abundance and metadata. Returns: DataFrame with joined abundance and metadata """ return pd.merge( self.metadata_df, self.abundance_df, on=self.id_col, how="inner" )
[docs] def filter_samples(self, criteria: dict[str, list[str]]) -> GCMSExperiment: """Filter samples based on metadata criteria. Args: criteria: Filtering criteria {column: values_to_exclude} Returns: New GCMSExperiment with filtered data """ filtered_meta = self.metadata_df.copy() for col, values in criteria.items(): if col in filtered_meta.columns: filtered_meta = filtered_meta[~filtered_meta[col].isin(values)] valid_ids = filtered_meta[self.id_col] filtered_abundance = self.abundance_df[ self.abundance_df[self.id_col].isin(valid_ids) ] return GCMSExperiment( filtered_abundance, filtered_meta, self.id_col, self.experiment_name, self.chemical_metadata, )
[docs] def get_abundance_matrix(self) -> pd.DataFrame: """Get chemical abundance matrix. Returns: DataFrame containing only chemical abundance measurements """ return self.abundance_df[self.chemical_cols]
[docs] def get_metadata(self, columns: list[str] | None = None) -> pd.DataFrame: """Get metadata columns. Args: columns: Optional list of column names to return Returns: DataFrame containing requested metadata columns """ if columns is None: return self.metadata_df return self.metadata_df[columns]
[docs] def get_chemical_property( self, chemical: str, property_name: str, default: Any = None ) -> Any: """Get property value for a chemical. Args: chemical: Name of the chemical property_name: Name of the property to retrieve default: Value to return if property not found Returns: Property value or default if not found """ return self.chemical_metadata.get(chemical, {}).get(property_name, default)
[docs] def get_chemicals_by_property(self, property_name: str, value: Any) -> list[str]: """Get chemicals that have a specific property value. Args: property_name: Name of the property to match value: Value to match Returns: List of chemical names with matching property """ return [ chem for chem, props in self.chemical_metadata.items() if props.get(property_name) == value ]
def __len__(self) -> int: """Get number of samples. Returns: Integer count of samples in experiment """ return len(self.abundance_df) def __str__(self) -> str: """Get string representation. Returns: String describing experiment contents """ name = self.experiment_name or "Unnamed experiment" return ( f"{name}: {len(self)} samples, " f"{len(self.chemical_cols)} chemicals measured" )
[docs] def filter_trace_compounds(self, threshold: float = 0.005) -> GCMSExperiment: """Filter out trace chemical amounts below threshold. Args: threshold: Minimum abundance value to keep (lower values set to 0) Returns: GCMSExperiment with filtered abundance values Raises: ValueError: If threshold is not between 0 and 1 """ if not 0 <= threshold <= 1: raise ValueError("Threshold must be between 0 and 1") filtered_abundance = self.abundance_df.copy() filtered_abundance[self.chemical_cols] = filtered_abundance[ self.chemical_cols ].apply(lambda x: np.where(x < threshold, 0, x)) return GCMSExperiment( filtered_abundance, self.metadata_df, self.id_col, self.experiment_name, self.chemical_metadata, )
[docs] def calculate_relative_abundance(self) -> GCMSExperiment: """Calculate relative abundance of chemical compounds. Returns: GCMSExperiment with relative abundance values """ relative_abundance = self.abundance_df.copy() row_sums = relative_abundance[self.chemical_cols].sum(axis=1) relative_abundance[self.chemical_cols] = relative_abundance[ self.chemical_cols ].div(row_sums, axis=0) return GCMSExperiment( relative_abundance, self.metadata_df, self.id_col, self.experiment_name, self.chemical_metadata, )
[docs] def filter( self, metadata_mask: pd.Series[bool] | None = None, chemical_mask: pd.Series[bool] | None = None, ) -> GCMSExperiment: """Filter experiment using boolean masks for metadata and/or chemicals. Args: metadata_mask: Boolean Series for filtering metadata rows chemical_mask: Boolean Series for filtering chemical columns Returns: New GCMSExperiment with filtered data Example: # Filter based on both metadata and chemicals meta_mask = exp.metadata_df["Species"].isin(["ant", "bee"]) chem_mask = pd.Series([ exp.get_chemical_property(c, "class") == "terpene" for c in exp.chemical_cols ], index=exp.chemical_cols) filtered_exp = exp.filter( metadata_mask=meta_mask, chemical_mask=chem_mask ) """ filtered_meta = self.metadata_df filtered_abundance = self.abundance_df # Apply metadata filtering if metadata_mask is not None: filtered_meta = filtered_meta[metadata_mask] filtered_abundance = filtered_abundance[ filtered_abundance[self.id_col].isin(filtered_meta[self.id_col]) ] # Apply chemical filtering if chemical_mask is not None: selected_chemicals = [ col for col, include in chemical_mask.items() if include and col in self.chemical_cols ] filtered_abundance = filtered_abundance[[self.id_col] + selected_chemicals] return GCMSExperiment( filtered_abundance, filtered_meta, self.id_col, self.experiment_name, self.chemical_metadata, )