From e43d8fff20d013ca86e29822c7bffa957badf1da Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 4 Jan 2025 16:12:18 +0100 Subject: [PATCH] Switch to formulaic-contrasts (#682) * Switch to formulaic-contrasts * Cleanup * removing design matrix workaround (#691) Co-authored-by: Emma Dann * Fix PyDESeq2 * Update tests * fix typo in gitignore * Remove contrast dataclass, which isnt used anywhere * Fix edgeR rpy2 tests (#692) * fix broken rpy2 edger tests * updated edger tests * Fix tests (scipy) Signed-off-by: zethson * submodule Signed-off-by: zethson * Remove unused code Signed-off-by: zethson * type hints Signed-off-by: zethson --------- Signed-off-by: zethson Co-authored-by: Emma Dann <32264060+emdann@users.noreply.github.com> Co-authored-by: Emma Dann Co-authored-by: zethson --- .gitignore | 6 + docs/tutorials/index.md | 3 + docs/tutorials/notebooks | 2 +- .../_differential_gene_expression/__init__.py | 3 +- .../_differential_gene_expression/_base.py | 128 +++-------- .../_differential_gene_expression/_edger.py | 4 +- .../_formulaic.py | 189 ---------------- .../_pydeseq2.py | 44 ++-- .../_statsmodels.py | 11 - pertpy/tools/_distances/_distances.py | 10 +- pertpy/tools/_milo.py | 6 +- pertpy/tools/_mixscape.py | 14 +- pyproject.toml | 3 +- .../test_edger.py | 35 ++- .../test_formulaic.py | 208 ------------------ .../test_pydeseq2.py | 40 +++- 16 files changed, 146 insertions(+), 560 deletions(-) delete mode 100644 pertpy/tools/_differential_gene_expression/_formulaic.py delete mode 100644 tests/tools/_differential_gene_expression/test_formulaic.py diff --git a/.gitignore b/.gitignore index 025b3030..d02681ff 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,9 @@ dmypy.json # Jetbrains IDE .idea/ +# VSCode +.vscode + # Coala *.orig @@ -160,3 +163,6 @@ node_modules test.ipynb test-perturbation test-bug + +# uv +uv.lock diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 7960b8d2..13e4f0b0 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -14,6 +14,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc ## Quick start: Tool specific tutorials ### Data transformation + ```{eval-rst} .. nbgallery:: @@ -25,6 +26,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc ``` ### Knowledge inference + ```{eval-rst} .. nbgallery:: @@ -43,6 +45,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc ``` ## Use cases + Our use cases showcase a variety of pertpy tools applied to one dataset. They are designed to give you a sense of how to use pertpy in a real-world scenario. The use cases featured here are those we present in the pertpy [preprint](https://www.biorxiv.org/content/10.1101/2024.08.04.606516v1). diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 894e14e9..bd5237a5 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 894e14e92b7901e218c435a908946dc016fc4412 +Subproject commit bd5237a5d46bdff2983cc3bb27b3a3413656f74b diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py index 35ccef51..6cc925dd 100644 --- a/pertpy/tools/_differential_gene_expression/__init__.py +++ b/pertpy/tools/_differential_gene_expression/__init__.py @@ -1,4 +1,4 @@ -from ._base import ContrastType, LinearModelBase, MethodBase +from ._base import LinearModelBase, MethodBase from ._dge_comparison import DGEEVAL from ._edger import EdgeR from ._pydeseq2 import PyDESeq2 @@ -14,7 +14,6 @@ "SimpleComparisonBase", "WilcoxonTest", "TTest", - "ContrastType", ] AVAILABLE_METHODS = [Statsmodels, EdgeR, PyDESeq2, WilcoxonTest, TTest] diff --git a/pertpy/tools/_differential_gene_expression/_base.py b/pertpy/tools/_differential_gene_expression/_base.py index 4c86f02a..88573801 100644 --- a/pertpy/tools/_differential_gene_expression/_base.py +++ b/pertpy/tools/_differential_gene_expression/_base.py @@ -1,9 +1,7 @@ import math -import os from abc import ABC, abstractmethod -from collections.abc import Sequence -from dataclasses import dataclass -from itertools import chain, zip_longest +from collections.abc import Iterable, Mapping, Sequence +from itertools import zip_longest from types import MappingProxyType import adjustText @@ -12,9 +10,8 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import scipy import seaborn as sns -import statsmodels +from formulaic_contrasts import FormulaicContrasts from lamin_utils import logger from matplotlib.pyplot import Figure from matplotlib.ticker import MaxNLocator @@ -22,24 +19,6 @@ from pertpy._doc import _doc_params, doc_common_plot_args from pertpy.tools import PseudobulkSpace from pertpy.tools._differential_gene_expression._checks import check_is_numeric_matrix -from pertpy.tools._differential_gene_expression._formulaic import ( - AmbiguousAttributeError, - Factor, - get_factor_storage_and_materializer, - resolve_ambiguous, -) - - -@dataclass -class Contrast: - """Simple contrast for comparison between groups""" - - column: str - baseline: str - group_to_compare: str - - -ContrastType = Contrast | tuple[str, str, str] class MethodBase(ABC): @@ -910,8 +889,7 @@ def _get_significance(p_val): class LinearModelBase(MethodBase): def __init__(self, adata, design, *, mask=None, layer=None, **kwargs): - """ - Initialize the method. + """Initialize the method. Args: adata: AnnData object, usually pseudobulked. @@ -923,26 +901,24 @@ def __init__(self, adata, design, *, mask=None, layer=None, **kwargs): super().__init__(adata, mask=mask, layer=layer) self._check_counts() - self.factor_storage = None - self.variable_to_factors = None - + self.formulaic_contrasts = None if isinstance(design, str): - self.factor_storage, self.variable_to_factors, materializer_class = get_factor_storage_and_materializer() - self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design) + self.formulaic_contrasts = FormulaicContrasts(adata.obs, design) + self.design = self.formulaic_contrasts.design_matrix else: self.design = design @classmethod def compare_groups( cls, - adata, - column, - baseline, - groups_to_compare, + adata: ad.AnnData, + column: str, + baseline: str, + groups_to_compare: str | Iterable[str], *, - paired_by=None, - mask=None, - layer=None, + paired_by: str | None = None, + mask: pd.Series | None = None, + layer: str | None = None, fit_kwargs=MappingProxyType({}), test_kwargs=MappingProxyType({}), ): @@ -968,17 +944,16 @@ def compare_groups( @property def variables(self): """Get the names of the variables used in the model definition.""" - try: - return self.design.model_spec.variables_by_source["data"] - except AttributeError: + if self.formulaic_contrasts is None: raise ValueError( "Retrieving variables is only possible if the model was initialized using a formula." ) from None + else: + return self.formulaic_contrasts.variables @abstractmethod def _check_counts(self): - """ - Check that counts are valid for the specific method. + """Check that counts are valid for the specific method. Raises: ValueError: if the data matrix does not comply with the expectations. @@ -987,8 +962,7 @@ def _check_counts(self): @abstractmethod def fit(self, **kwargs): - """ - Fit the model. + """Fit the model. Args: **kwargs: Additional arguments for fitting the specific method. @@ -998,9 +972,8 @@ def fit(self, **kwargs): @abstractmethod def _test_single_contrast(self, contrast, **kwargs): ... - def test_contrasts(self, contrasts, **kwargs): - """ - Perform a comparison as specified in a contrast vector. + def test_contrasts(self, contrasts: np.ndarray | Mapping[str | None, np.ndarray], **kwargs): + """Perform a comparison as specified in a contrast vector. Args: contrasts: Either a numeric contrast vector, or a dictionary of numeric contrast vectors. @@ -1016,11 +989,11 @@ def test_contrasts(self, contrasts, **kwargs): results.append(self._test_single_contrast(contrast, **kwargs).assign(contrast=name)) results_df = pd.concat(results) + return results_df def test_reduced(self, modelB): - """ - Test against a reduced model. + """Test against a reduced model. Args: modelB: the reduced model against which to test. @@ -1034,8 +1007,7 @@ def test_reduced(self, modelB): raise NotImplementedError def cond(self, **kwargs): - """ - Get a contrast vector representing a specific condition. + """Get a contrast vector representing a specific condition. Args: **kwargs: column/value pairs. @@ -1043,52 +1015,14 @@ def cond(self, **kwargs): Returns: A contrast vector that aligns to the columns of the design matrix. """ - if self.factor_storage is None: + if self.formulaic_contrasts is None: raise RuntimeError( "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector." ) - cond_dict = kwargs - if not set(cond_dict.keys()).issubset(self.variables): - raise ValueError( - "You specified a variable that is not part of the model. Available variables: " - + ",".join(self.variables) - ) - for var in self.variables: - if var in cond_dict: - self._check_category(var, cond_dict[var]) - else: - cond_dict[var] = self._get_default_value(var) - df = pd.DataFrame([kwargs]) - return self.design.model_spec.get_model_matrix(df).iloc[0] - - def _get_factor_metadata_for_variable(self, var): - factors = self.variable_to_factors[var] - return list(chain.from_iterable(self.factor_storage[f] for f in factors)) - - def _get_default_value(self, var): - factor_metadata = self._get_factor_metadata_for_variable(var) - if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL: - try: - tmp_base = resolve_ambiguous(factor_metadata, "base") - except AmbiguousAttributeError as e: - raise ValueError( - f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`." - ) from e - return tmp_base if tmp_base is not None else "\0" - else: - return 0 + return self.formulaic_contrasts.cond(**kwargs) - def _check_category(self, var, value): - factor_metadata = self._get_factor_metadata_for_variable(var) - tmp_categories = resolve_ambiguous(factor_metadata, "categories") - if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL and value not in tmp_categories: - raise ValueError( - f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}" - ) - - def contrast(self, column, baseline, group_to_compare): - """ - Build a simple contrast for pairwise comparisons. + def contrast(self, *args, **kwargs): + """Build a simple contrast for pairwise comparisons. Args: column: column in adata.obs to test on. @@ -1098,4 +1032,8 @@ def contrast(self, column, baseline, group_to_compare): Returns: Numeric contrast vector. """ - return self.cond(**{column: group_to_compare}) - self.cond(**{column: baseline}) + if self.formulaic_contrasts is None: + raise RuntimeError( + "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector." + ) + return self.formulaic_contrasts.contrast(*args, **kwargs) diff --git a/pertpy/tools/_differential_gene_expression/_edger.py b/pertpy/tools/_differential_gene_expression/_edger.py index bf72ee9f..15afab6a 100644 --- a/pertpy/tools/_differential_gene_expression/_edger.py +++ b/pertpy/tools/_differential_gene_expression/_edger.py @@ -60,8 +60,8 @@ def fit(self, **kwargs): # adata, design, mask, layer logger.info("Calculating NormFactors") dge = edger.calcNormFactors(dge) - with localconverter(get_conversion() + pandas2ri.converter): - design_r = ro.conversion.py2rpy(pd.DataFrame(self.design)) + with localconverter(get_conversion() + numpy2ri.converter): + design_r = ro.conversion.py2rpy(self.design.values) logger.info("Estimating Dispersions") dge = edger.estimateDisp(dge, design=design_r) diff --git a/pertpy/tools/_differential_gene_expression/_formulaic.py b/pertpy/tools/_differential_gene_expression/_formulaic.py deleted file mode 100644 index b6647491..00000000 --- a/pertpy/tools/_differential_gene_expression/_formulaic.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Helpers to interact with Formulaic Formulas - -Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`): - * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`. - * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`. - * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank. -""" - -from collections import defaultdict -from collections.abc import Mapping, Sequence -from dataclasses import dataclass -from typing import Any - -from formulaic import FactorValues, ModelSpec -from formulaic.materializers import PandasMaterializer -from formulaic.materializers.types import EvaluatedFactor -from formulaic.parser.types import Factor -from interface_meta import override - - -@dataclass -class FactorMetadata: - """Store (relevant) metadata for a factor of a formula.""" - - name: str - """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`""" - - reduced_rank: bool - """Whether a column will be dropped because it is redundant""" - - custom_encoder: bool - """Whether or not a custom encoder (e.g. `C(...)`) was used.""" - - categories: Sequence[str] - """The unique categories in this factor (after applying `drop_rows`)""" - - kind: Factor.Kind - """Type of the factor""" - - drop_field: str = None - """The category that is dropped. - - Note that - * this may also be populated if `reduced_rank = False` - * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`. - """ - - column_names: Sequence[str] = None - """The column names for this factor included in the design matrix. - - This may be the same as `categories` if the default encoder is used, or - categories without the base level if a custom encoder (e.g. `C(...)`) is used. - """ - - colname_format: str = None - """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`""" - - @property - def base(self) -> str | None: - """ - The base category for this categorical factor. - - This is derived from `drop_field` (for default encoding) or by comparing the column names in - the design matrix with all categories (for custom encoding, e.g. `C(...)`). - """ - if not self.reduced_rank: - return None - else: - if self.custom_encoder: - tmp_base = set(self.categories) - set(self.column_names) - assert len(tmp_base) == 1 - return tmp_base.pop() - else: - assert self.drop_field is not None - return self.drop_field - - -def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]: - """Keeps track of categorical factors used in a model specification by generating a custom materializer. - - This materializer reports back metadata upon materialization of the model matrix. - - Returns: - - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`. - - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms - but maps to factors rather than terms, named `variable_to_factors`. - - A materializer class tied to the specific instance of `factor_storage`. - """ - # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction - # term, it generates the factor with both full rank and reduced rank. - factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list) - variable_to_factors: dict[str, set[str]] = defaultdict(set) - - class CustomPandasMaterializer(PandasMaterializer): - """An extension of the PandasMaterializer that records all categorical variables and their (base) categories.""" - - REGISTER_NAME = "custom_pandas" - REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) - REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") - - def __init__( - self, - data: Any, - context: Mapping[str, Any] | None = None, - record_factor_metadata: bool = False, - **params: Any, - ): - """Initialize the Materializer. - - Args: - data: Passed to PandasMaterializer. - context: Passed to PandasMaterializer - record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class - is supposed to record factor metadata. Only the instance that is used for building the design - matrix should record the metadata. All other instances (e.g. used to generate contrast vectors) - should not record metadata to not overwrite the specifications from the design matrix. - **params: Passed to PandasMaterializer - """ - self.factor_metadata_storage = factor_storage if record_factor_metadata else None - self.variable_to_factors = variable_to_factors if record_factor_metadata else None - # temporary pointer to metadata of factor that is currently evaluated - self._current_factor: FactorMetadata = None - super().__init__(data, context, **params) - - @override - def _encode_evaled_factor( - self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False - ) -> dict[str, Any]: - """Function is called just before the factor is evaluated. - - We can record some metadata, before we call the original function. - """ - assert ( - self._current_factor is None - ), "_current_factor should always be None when we start recording metadata" - if self.factor_metadata_storage is not None: - # Don't store if the factor is cached (then we should already have recorded it) - if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache: - assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached" - else: - for var in factor.variables: - self.variable_to_factors[var].add(factor.expr) - self._current_factor = FactorMetadata( - name=factor.expr, - reduced_rank=reduced_rank, - categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), - custom_encoder=factor.metadata.encoder is not None, - kind=factor.metadata.kind, - ) - return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) - - @override - def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]: - """ - Function is called at the end, before the design matrix gets materialized. - - Here we have access to additional metadata, such as `drop_field`. - """ - if self._current_factor is not None: - assert self._current_factor.name == name - self._current_factor.drop_field = values.__formulaic_metadata__.drop_field - self._current_factor.column_names = values.__formulaic_metadata__.column_names - self._current_factor.colname_format = values.__formulaic_metadata__.format - self.factor_metadata_storage[name].append(self._current_factor) - self._current_factor = None - - return super()._flatten_encoded_evaled_factor(name, values) - - return factor_storage, variable_to_factors, CustomPandasMaterializer - - -class AmbiguousAttributeError(ValueError): - pass - - -def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any: - """Given a list of objects, return an attribute if it is the same between all object. Otherwise, raise an error.""" - if not objs: - raise ValueError("Collection is empty") - - first_obj_attr = getattr(objs[0], attr) - - # Check if the attribute is the same for all objects - for obj in objs[1:]: - if getattr(obj, attr) != first_obj_attr: - raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects") - - # If attribute is the same for all objects, return it - return first_obj_attr diff --git a/pertpy/tools/_differential_gene_expression/_pydeseq2.py b/pertpy/tools/_differential_gene_expression/_pydeseq2.py index c82b6a49..d4360e28 100644 --- a/pertpy/tools/_differential_gene_expression/_pydeseq2.py +++ b/pertpy/tools/_differential_gene_expression/_pydeseq2.py @@ -2,6 +2,7 @@ import re import warnings +import numpy as np import pandas as pd from anndata import AnnData from numpy import ndarray @@ -40,33 +41,25 @@ def fit(self, **kwargs) -> pd.DataFrame: Args: **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed. """ - inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", os.cpu_count() - 1)) - covars = self.design.columns.tolist() - if "Intercept" not in covars: - warnings.warn( - "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2 - ) - processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"}) + try: + usable_cpus = len(os.sched_getaffinity(0)) + except AttributeError: + usable_cpus = os.cpu_count() + + inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", usable_cpus)) + dds = DeseqDataSet( - adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs + adata=self.adata, + design=self.design, # initialize using design matrix, not formula + refit_cooks=True, + inference=inference, + **kwargs, ) - # workaround code to insert design array - des_mtx_cols = dds.obsm["design_matrix"].columns - dds.obsm["design_matrix"] = self.design - if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols): - dds.obsm["design_matrix"].columns = des_mtx_cols.copy() dds.deseq2() self.dds = dds - # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet - # see https://github.com/owkin/PyDESeq2/issues/213 - - # Therefore these functions are overridden in a way to make it work with PyDESeq2, - # ingoring the inconsistency of function signatures. Once arbitrary design - # matrices and contrasts are supported by PyDEseq2, we can fully support the - # Linear model interface. - def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame: # type: ignore + def _test_single_contrast(self, contrast, alpha=0.05, **kwargs) -> pd.DataFrame: """Conduct a specific test and returns a Pandas DataFrame. Args: @@ -74,6 +67,7 @@ def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd alpha: p value threshold used for controlling fdr with independent hypothesis weighting **kwargs: extra arguments to pass to DeseqStats() """ + contrast = np.array(contrast) stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs) # Calling `.summary()` is required to fill the `results_df` data frame stat_res.summary() @@ -85,11 +79,3 @@ def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd res_df.index.name = "variable" res_df = res_df.reset_index() return res_df - - def cond(self, **kwargs) -> ndarray: - raise NotImplementedError( - "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213" - ) - - def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]: # type: ignore - return (column, group_to_compare, baseline) diff --git a/pertpy/tools/_differential_gene_expression/_statsmodels.py b/pertpy/tools/_differential_gene_expression/_statsmodels.py index 02bfad98..cd8bc5fa 100644 --- a/pertpy/tools/_differential_gene_expression/_statsmodels.py +++ b/pertpy/tools/_differential_gene_expression/_statsmodels.py @@ -59,14 +59,3 @@ def _test_single_contrast(self, contrast, **kwargs) -> pd.DataFrame: } ) return pd.DataFrame(res).sort_values("p_value") - - def contrast(self, column: str, baseline: str, group_to_compare: str) -> np.ndarray: - """Build a simple contrast for pairwise comparisons. - - This is equivalent to - - ``` - model.cond( = baseline) - model.cond( = group_to_compare) - ``` - """ - return self.cond(**{column: baseline}) - self.cond(**{column: group_to_compare}) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index f2442bd1..ec9001d4 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -344,9 +344,9 @@ def pairwise( else: embedding = adata.obsm[self.obsm_key].copy() for index_x, group_x in enumerate(fct(groups)): - cells_x = embedding[grouping == group_x].copy() + cells_x = embedding[np.asarray(grouping == group_x)].copy() for group_y in groups[index_x:]: # type: ignore - cells_y = embedding[grouping == group_y].copy() + cells_y = embedding[np.asarray(grouping == group_y)].copy() if not bootstrap: # By distance axiom, the distance between a group and itself is 0 dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs) @@ -478,9 +478,9 @@ def onesided_distances( else: embedding = adata.obsm[self.obsm_key].copy() for group_x in fct(groups): - cells_x = embedding[grouping == group_x].copy() + cells_x = embedding[np.asarray(grouping == group_x)].copy() group_y = selected_group - cells_y = embedding[grouping == group_y].copy() + cells_y = embedding[np.asarray(grouping == group_y)].copy() if not bootstrap: # By distance axiom, the distance between a group and itself is 0 dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs) @@ -988,7 +988,7 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: try: nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params return _compute_nll(y, nb_params, epsilon) - except np.linalg.linalg.LinAlgError: + except np.linalg.LinAlgError: if x.mean() < 10 and y.mean() < 10: return 0.0 else: diff --git a/pertpy/tools/_milo.py b/pertpy/tools/_milo.py index 53bda6b9..afef5292 100644 --- a/pertpy/tools/_milo.py +++ b/pertpy/tools/_milo.py @@ -183,7 +183,7 @@ def make_nhoods( knn_dists = adata.obsp[neighbors_key + "_distances"] nhood_ixs = adata.obs["nhood_ixs_refined"] == 1 - dist_mat = knn_dists[nhood_ixs, :] + dist_mat = knn_dists[np.asarray(nhood_ixs), :] k_distances = dist_mat.max(1).toarray().ravel() adata.obs["nhood_kth_distance"] = 0 adata.obs["nhood_kth_distance"] = adata.obs["nhood_kth_distance"].astype(float) @@ -704,8 +704,8 @@ def _graph_spatial_fdr( pvalues = sample_adata.var["PValue"] keep_nhoods = ~pvalues.isna() # Filtering in case of test on subset of nhoods o = pvalues[keep_nhoods].argsort() - pvalues = pvalues[keep_nhoods][o] - w = w[keep_nhoods][o] + pvalues = pvalues.loc[keep_nhoods].iloc[o] + w = w.loc[keep_nhoods].iloc[o] adjp = np.zeros(shape=len(o)) adjp[o] = (sum(w) * pvalues / np.cumsum(w))[::-1].cummin()[::-1] diff --git a/pertpy/tools/_mixscape.py b/pertpy/tools/_mixscape.py index fddc54c2..c015eaaa 100644 --- a/pertpy/tools/_mixscape.py +++ b/pertpy/tools/_mixscape.py @@ -104,7 +104,7 @@ def perturbation_signature( control_mask_split = control_mask & split_mask R_split = representation[split_mask] - R_control = representation[control_mask_split] + R_control = representation[np.asarray(control_mask_split)] from pynndescent import NNDescent @@ -112,7 +112,7 @@ def perturbation_signature( nn_index = NNDescent(R_control, **kwargs) indices, _ = nn_index.query(R_split, k=n_neighbors, epsilon=eps) - X_control = np.expm1(adata.X[control_mask_split]) + X_control = np.expm1(adata.X[np.asarray(control_mask_split)]) n_split = split_mask.sum() n_control = X_control.shape[0] @@ -256,7 +256,7 @@ def mixscape( else: de_genes = perturbation_markers[(category, gene)] de_genes_indices = self._get_column_indices(adata, list(de_genes)) - dat = X[all_cells][:, de_genes_indices] + dat = X[np.asarray(all_cells)][:, de_genes_indices] converged = False n_iter = 0 old_classes = adata.obs[labels][all_cells] @@ -266,8 +266,8 @@ def mixscape( # get average value for each gene over all selected cells # all cells in current split&Gene minus all NT cells in current split # Each row is for each cell, each column is for each gene, get mean for each column - vec = np.mean(X[guide_cells][:, de_genes_indices], axis=0) - np.mean( - X[nt_cells][:, de_genes_indices], axis=0 + vec = np.mean(X[np.asarray(guide_cells)][:, de_genes_indices], axis=0) - np.mean( + X[np.asarray(nt_cells)][:, de_genes_indices], axis=0 ) # project cells onto the perturbation vector if isinstance(dat, spmatrix): @@ -596,7 +596,7 @@ def plot_barplot( # pragma: no cover ax.set_title(gene, bbox={"facecolor": "white", "edgecolor": "black", "pad": 1}, fontsize=axis_title_size) ax.set(xlabel="sgRNA", ylabel="% of cells") sns.despine(ax=ax, top=True, right=True, left=False, bottom=False) - ax.set_xticks(ax.get_xticks(),ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size) + ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size) ax.set_yticks(ax.get_yticks(), ax.get_yticklabels(), rotation=0, fontsize=axis_text_y_size) ax.legend( title="Mixscape Class", @@ -614,7 +614,7 @@ def plot_barplot( # pragma: no cover if show: plt.show() if return_fig: - return fig + return fig return None @_doc_params(common_plot_args=doc_common_plot_args) diff --git a/pyproject.toml b/pyproject.toml index 31673b01..3c1a6b5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,9 @@ coda = [ "pyqt5" # remove this after ete4 got released ] de = [ + "formulaic-contrasts>=0.2.0", "formulaic", - "pydeseq2", + "pydeseq2>=v0.5.0pre1", ] dev = [ "pre-commit", diff --git a/tests/tools/_differential_gene_expression/test_edger.py b/tests/tools/_differential_gene_expression/test_edger.py index 27b4584a..96cbe132 100644 --- a/tests/tools/_differential_gene_expression/test_edger.py +++ b/tests/tools/_differential_gene_expression/test_edger.py @@ -1,4 +1,5 @@ -from pertpy.tools._differential_gene_expression import EdgeR +import numpy.testing as npt +from pertpy.tools._differential_gene_expression import EdgeR, PyDESeq2 def test_edger_simple(test_adata): @@ -13,6 +14,28 @@ def test_edger_simple(test_adata): res_df = method.test_contrasts(method.contrast("condition", "A", "B")) assert len(res_df) == test_adata.n_vars + # Compare against snapshot + npt.assert_almost_equal( + res_df.p_value.values, + [ + 8.0000e-05, + 1.8000e-04, + 5.3000e-04, + 1.1800e-03, + 3.3800e-02, + 3.3820e-02, + 7.7980e-02, + 1.3715e-01, + 2.5052e-01, + 9.2485e-01, + ], + decimal=4, + ) + npt.assert_almost_equal( + res_df.log_fc.values, + [0.61208, -0.39374, 0.57944, 0.7343, -0.58675, 0.42575, -0.23951, -0.20761, 0.17489, 0.0247], + decimal=4, + ) def test_edger_complex(test_adata): @@ -28,6 +51,12 @@ def test_edger_complex(test_adata): # Check that the index of the result matches the var_names of the AnnData object assert set(test_adata.var_names) == set(res_df["variable"]) + # Compare ranking of genes from a different method (without design matrix handling) + down_gene = res_df.set_index("variable").loc["gene3", "log_fc"] + up_gene = res_df.set_index("variable").loc["gene1", "log_fc"] + assert down_gene < up_gene -# TODO: there should be a test checking if, for a concrete example, the output p-values and effect sizes are what -# we expect (-> frozen snapshot, that way we also get a heads-up if something changes upstream) + method = PyDESeq2(adata=test_adata, design="~condition1+group") + method.fit() + deseq_res_df = method.test_contrasts(method.contrast("condition1", "A", "B")) + assert all(res_df.sort_values("log_fc")["variable"].values == deseq_res_df.sort_values("log_fc")["variable"].values) diff --git a/tests/tools/_differential_gene_expression/test_formulaic.py b/tests/tools/_differential_gene_expression/test_formulaic.py deleted file mode 100644 index f5305c5a..00000000 --- a/tests/tools/_differential_gene_expression/test_formulaic.py +++ /dev/null @@ -1,208 +0,0 @@ -import pandas as pd -import pytest -from formulaic.parser.types import Factor -from pertpy.tools._differential_gene_expression._formulaic import ( - AmbiguousAttributeError, - FactorMetadata, - get_factor_storage_and_materializer, - resolve_ambiguous, -) - - -@pytest.mark.parametrize( - "formula,reorder_categorical,expected_factor_metadata", - [ - [ - "~ donor", - None, - {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}}, - ], - [ - "~ donor", - {"donor": ["D2", "D1", "D0", "D3"]}, - {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D2"}}, - ], - [ - "~ C(donor)", - None, - {"C(donor)": {"reduced_rank": True, "custom_encoder": True, "base": "D0"}}, - ], - [ - "~ C(donor, contr.treatment(base='D2'))", - None, - {"C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}}, - ], - [ - "~ C(donor, contr.sum)", - None, - {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D3"}}, - ], - [ - "~ C(donor, contr.sum)", - {"donor": ["D1", "D0", "D3", "D2"]}, - {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}}, - ], - [ - "~ condition", - None, - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}}, - ], - [ - "~ C(condition)", - None, - {"C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"}}, - ], - [ - "~ C(condition, contr.treatment(base='B'))", - None, - {"C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"}}, - ], - [ - "~ C(condition, contr.sum)", - None, - {"C(condition, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "B"}}, - ], - [ - "~ 0 + condition", - None, - {"condition": {"reduced_rank": False, "custom_encoder": False, "base": None}}, - ], - [ - "~ condition + donor", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, - }, - ], - [ - "~ 0 + condition + donor", - None, - { - "condition": {"reduced_rank": False, "custom_encoder": False, "base": None}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, - }, - ], - [ - "~ condition * donor", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, - }, - ], - [ - "~ condition * C(donor, contr.treatment(base='D2'))", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}, - }, - ], - [ - "~ condition + C(condition) + C(condition, contr.treatment(base='B'))", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"}, - "C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"}, - }, - ], - [ - "~ condition + continuous + np.log(continuous)", - None, - { - "condition": { - "reduced_rank": True, - "custom_encoder": False, - "base": "A", - "kind": Factor.Kind.CATEGORICAL, - }, - "continuous": { - "reduced_rank": False, - "custom_encoder": False, - "base": None, - "kind": Factor.Kind.NUMERICAL, - }, - "np.log(continuous)": { - "reduced_rank": False, - "custom_encoder": False, - "base": None, - "kind": Factor.Kind.NUMERICAL, - }, - }, - ], - [ - "~ condition * donor + continuous", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, - "continuous": { - "reduced_rank": False, - "custom_encoder": False, - "base": None, - "kind": Factor.Kind.NUMERICAL, - }, - }, - ], - [ - "~ condition:donor", - None, - { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "donor": { - "custom_encoder": False, - "drop_field": "D0", - }, # `reduced_rank` and `base` will be ambigous here because Formulaic generates both version of the factor internally - }, - ], - ], -) -def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, expected_factor_metadata): - """Test that the custom materializer correctly stores the baseline category. - - Parameters - ---------- - test_adata_minimal - adata fixture - formula - Formula to test - reorder_categorical - Create a pandas categorical for a given column with a certain order of categories - expected_factor_metadata - dict with expected values for each factor - """ - if reorder_categorical is not None: - for col, order in reorder_categorical.items(): - test_adata_minimal.obs[col] = pd.Categorical(test_adata_minimal.obs[col], categories=order) - factor_storage, _, materializer = get_factor_storage_and_materializer() - materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula) - for factor, expected_metadata in expected_factor_metadata.items(): - actual_metadata = factor_storage[factor] - for k in expected_metadata: - assert resolve_ambiguous(actual_metadata, k) == expected_metadata[k] - - -def test_resolve_ambiguous(): - obj1 = FactorMetadata("F1", True, True, ["A", "B"], Factor.Kind.CATEGORICAL) - obj2 = FactorMetadata("F2", True, False, ["A", "B"], Factor.Kind.CATEGORICAL) - obj3 = FactorMetadata("F3", True, False, None, Factor.Kind.NUMERICAL) - - with pytest.raises(ValueError): - resolve_ambiguous([], "foo") - - with pytest.raises(AttributeError): - resolve_ambiguous([obj1, obj2], "doesntexist") - - with pytest.raises(AmbiguousAttributeError): - assert resolve_ambiguous([obj1, obj2], "name") - - assert resolve_ambiguous([obj1, obj2, obj3], "reduced_rank") is True - assert resolve_ambiguous([obj1, obj2], "categories") == ["A", "B"] - - with pytest.raises(AmbiguousAttributeError): - assert resolve_ambiguous([obj1, obj2, obj3], "categories") - - with pytest.raises(AmbiguousAttributeError): - assert resolve_ambiguous([obj1, obj3], "kind") diff --git a/tests/tools/_differential_gene_expression/test_pydeseq2.py b/tests/tools/_differential_gene_expression/test_pydeseq2.py index aab7e3d1..438dcf75 100644 --- a/tests/tools/_differential_gene_expression/test_pydeseq2.py +++ b/tests/tools/_differential_gene_expression/test_pydeseq2.py @@ -1,3 +1,4 @@ +import numpy.testing as npt from pertpy.tools._differential_gene_expression import PyDESeq2 @@ -10,9 +11,20 @@ def test_pydeseq2_simple(test_adata): """ method = PyDESeq2(adata=test_adata, design="~condition") method.fit() - res_df = method.test_contrasts(["condition", "A", "B"]) + res_df = method.test_contrasts(method.contrast("condition", "A", "B")) assert len(res_df) == test_adata.n_vars + # Compare against snapshot + npt.assert_almost_equal( + res_df.p_value.values, + [0.00017, 0.00033, 0.00051, 0.0286, 0.03207, 0.04723, 0.11039, 0.11452, 0.3703, 0.99625], + decimal=4, + ) + npt.assert_almost_equal( + res_df.log_fc.values, + [0.58207, 0.53855, -0.4121, 0.63281, -0.63283, -0.27066, -0.21271, 0.38601, 0.13434, 0.00146], + decimal=4, + ) def test_pydeseq2_complex(test_adata): @@ -22,12 +34,32 @@ def test_pydeseq2_complex(test_adata): test_adata.obs["condition1"] = test_adata.obs["condition"].copy() method = PyDESeq2(adata=test_adata, design="~condition1+group") method.fit() - res_df = method.test_contrasts(["condition1", "A", "B"]) + res_df = method.test_contrasts(method.contrast("condition1", "A", "B")) assert len(res_df) == test_adata.n_vars # Check that the index of the result matches the var_names of the AnnData object assert set(test_adata.var_names) == set(res_df["variable"]) + # Compare against snapshot + npt.assert_almost_equal( + res_df.p_value.values, + [7e-05, 0.00012, 0.00035, 0.01062, 0.01906, 0.03892, 0.10755, 0.11175, 0.36631, 0.94952], + decimal=4, + ) + npt.assert_almost_equal( + res_df.log_fc.values, + [-0.42347, 0.58802, 0.53528, 0.73147, -0.67374, -0.27158, -0.21402, 0.38953, 0.13511, -0.01949], + decimal=4, + ) -# TODO: there should be a test checking if, for a concrete example, the output p-values and effect sizes are what -# we expect (-> frozen snapshot, that way we also get a heads-up if something changes upstream) +def test_pydeseq2_formula(test_adata): + """Check that the pyDESeq2 method gives consistent results when specifying contrasts, regardless of the order of covariates""" + model1 = PyDESeq2(adata=test_adata, design="~condition+group") + model1.fit() + res_1 = model1.test_contrasts(model1.contrast("condition", "A", "B")) + + model2 = PyDESeq2(adata=test_adata, design="~group+condition") + model2.fit() + res_2 = model2.test_contrasts(model2.contrast("condition", "A", "B")) + + npt.assert_almost_equal(res_2.log_fc.values, res_1.log_fc.values)