From e43d8fff20d013ca86e29822c7bffa957badf1da Mon Sep 17 00:00:00 2001
From: Gregor Sturm <mail@gregor-sturm.de>
Date: Sat, 4 Jan 2025 16:12:18 +0100
Subject: [PATCH] Switch to formulaic-contrasts (#682)

* Switch to formulaic-contrasts

* Cleanup

* removing design matrix workaround (#691)

Co-authored-by: Emma Dann <emmadann@comino.stanford.edu>

* Fix PyDESeq2

* Update tests

* fix typo in gitignore

* Remove contrast dataclass, which isnt used anywhere

* Fix edgeR rpy2 tests (#692)

* fix broken rpy2 edger tests

* updated edger tests

* Fix tests (scipy)

Signed-off-by: zethson <lukas.heumos@posteo.net>

* submodule

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Remove unused code

Signed-off-by: zethson <lukas.heumos@posteo.net>

* type hints

Signed-off-by: zethson <lukas.heumos@posteo.net>

---------

Signed-off-by: zethson <lukas.heumos@posteo.net>
Co-authored-by: Emma Dann <32264060+emdann@users.noreply.github.com>
Co-authored-by: Emma Dann <emmadann@comino.stanford.edu>
Co-authored-by: zethson <lukas.heumos@posteo.net>
---
 .gitignore                                    |   6 +
 docs/tutorials/index.md                       |   3 +
 docs/tutorials/notebooks                      |   2 +-
 .../_differential_gene_expression/__init__.py |   3 +-
 .../_differential_gene_expression/_base.py    | 128 +++--------
 .../_differential_gene_expression/_edger.py   |   4 +-
 .../_formulaic.py                             | 189 ----------------
 .../_pydeseq2.py                              |  44 ++--
 .../_statsmodels.py                           |  11 -
 pertpy/tools/_distances/_distances.py         |  10 +-
 pertpy/tools/_milo.py                         |   6 +-
 pertpy/tools/_mixscape.py                     |  14 +-
 pyproject.toml                                |   3 +-
 .../test_edger.py                             |  35 ++-
 .../test_formulaic.py                         | 208 ------------------
 .../test_pydeseq2.py                          |  40 +++-
 16 files changed, 146 insertions(+), 560 deletions(-)
 delete mode 100644 pertpy/tools/_differential_gene_expression/_formulaic.py
 delete mode 100644 tests/tools/_differential_gene_expression/test_formulaic.py

diff --git a/.gitignore b/.gitignore
index 025b3030..d02681ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,6 +137,9 @@ dmypy.json
 # Jetbrains IDE
 .idea/
 
+# VSCode
+.vscode
+
 # Coala
 *.orig
 
@@ -160,3 +163,6 @@ node_modules
 test.ipynb
 test-perturbation
 test-bug
+
+# uv
+uv.lock
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 7960b8d2..13e4f0b0 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -14,6 +14,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc
 ## Quick start: Tool specific tutorials
 
 ### Data transformation
+
 ```{eval-rst}
 .. nbgallery::
 
@@ -25,6 +26,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc
 ```
 
 ### Knowledge inference
+
 ```{eval-rst}
 .. nbgallery::
 
@@ -43,6 +45,7 @@ For questions about the usage of pertpy use the [scverse discourse](https://disc
 ```
 
 ## Use cases
+
 Our use cases showcase a variety of pertpy tools applied to one dataset.
 They are designed to give you a sense of how to use pertpy in a real-world scenario.
 The use cases featured here are those we present in the pertpy [preprint](https://www.biorxiv.org/content/10.1101/2024.08.04.606516v1).
diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
index 894e14e9..bd5237a5 160000
--- a/docs/tutorials/notebooks
+++ b/docs/tutorials/notebooks
@@ -1 +1 @@
-Subproject commit 894e14e92b7901e218c435a908946dc016fc4412
+Subproject commit bd5237a5d46bdff2983cc3bb27b3a3413656f74b
diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py
index 35ccef51..6cc925dd 100644
--- a/pertpy/tools/_differential_gene_expression/__init__.py
+++ b/pertpy/tools/_differential_gene_expression/__init__.py
@@ -1,4 +1,4 @@
-from ._base import ContrastType, LinearModelBase, MethodBase
+from ._base import LinearModelBase, MethodBase
 from ._dge_comparison import DGEEVAL
 from ._edger import EdgeR
 from ._pydeseq2 import PyDESeq2
@@ -14,7 +14,6 @@
     "SimpleComparisonBase",
     "WilcoxonTest",
     "TTest",
-    "ContrastType",
 ]
 
 AVAILABLE_METHODS = [Statsmodels, EdgeR, PyDESeq2, WilcoxonTest, TTest]
diff --git a/pertpy/tools/_differential_gene_expression/_base.py b/pertpy/tools/_differential_gene_expression/_base.py
index 4c86f02a..88573801 100644
--- a/pertpy/tools/_differential_gene_expression/_base.py
+++ b/pertpy/tools/_differential_gene_expression/_base.py
@@ -1,9 +1,7 @@
 import math
-import os
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
-from dataclasses import dataclass
-from itertools import chain, zip_longest
+from collections.abc import Iterable, Mapping, Sequence
+from itertools import zip_longest
 from types import MappingProxyType
 
 import adjustText
@@ -12,9 +10,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import scipy
 import seaborn as sns
-import statsmodels
+from formulaic_contrasts import FormulaicContrasts
 from lamin_utils import logger
 from matplotlib.pyplot import Figure
 from matplotlib.ticker import MaxNLocator
@@ -22,24 +19,6 @@
 from pertpy._doc import _doc_params, doc_common_plot_args
 from pertpy.tools import PseudobulkSpace
 from pertpy.tools._differential_gene_expression._checks import check_is_numeric_matrix
-from pertpy.tools._differential_gene_expression._formulaic import (
-    AmbiguousAttributeError,
-    Factor,
-    get_factor_storage_and_materializer,
-    resolve_ambiguous,
-)
-
-
-@dataclass
-class Contrast:
-    """Simple contrast for comparison between groups"""
-
-    column: str
-    baseline: str
-    group_to_compare: str
-
-
-ContrastType = Contrast | tuple[str, str, str]
 
 
 class MethodBase(ABC):
@@ -910,8 +889,7 @@ def _get_significance(p_val):
 
 class LinearModelBase(MethodBase):
     def __init__(self, adata, design, *, mask=None, layer=None, **kwargs):
-        """
-        Initialize the method.
+        """Initialize the method.
 
         Args:
             adata: AnnData object, usually pseudobulked.
@@ -923,26 +901,24 @@ def __init__(self, adata, design, *, mask=None, layer=None, **kwargs):
         super().__init__(adata, mask=mask, layer=layer)
         self._check_counts()
 
-        self.factor_storage = None
-        self.variable_to_factors = None
-
+        self.formulaic_contrasts = None
         if isinstance(design, str):
-            self.factor_storage, self.variable_to_factors, materializer_class = get_factor_storage_and_materializer()
-            self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design)
+            self.formulaic_contrasts = FormulaicContrasts(adata.obs, design)
+            self.design = self.formulaic_contrasts.design_matrix
         else:
             self.design = design
 
     @classmethod
     def compare_groups(
         cls,
-        adata,
-        column,
-        baseline,
-        groups_to_compare,
+        adata: ad.AnnData,
+        column: str,
+        baseline: str,
+        groups_to_compare: str | Iterable[str],
         *,
-        paired_by=None,
-        mask=None,
-        layer=None,
+        paired_by: str | None = None,
+        mask: pd.Series | None = None,
+        layer: str | None = None,
         fit_kwargs=MappingProxyType({}),
         test_kwargs=MappingProxyType({}),
     ):
@@ -968,17 +944,16 @@ def compare_groups(
     @property
     def variables(self):
         """Get the names of the variables used in the model definition."""
-        try:
-            return self.design.model_spec.variables_by_source["data"]
-        except AttributeError:
+        if self.formulaic_contrasts is None:
             raise ValueError(
                 "Retrieving variables is only possible if the model was initialized using a formula."
             ) from None
+        else:
+            return self.formulaic_contrasts.variables
 
     @abstractmethod
     def _check_counts(self):
-        """
-        Check that counts are valid for the specific method.
+        """Check that counts are valid for the specific method.
 
         Raises:
             ValueError: if the data matrix does not comply with the expectations.
@@ -987,8 +962,7 @@ def _check_counts(self):
 
     @abstractmethod
     def fit(self, **kwargs):
-        """
-        Fit the model.
+        """Fit the model.
 
         Args:
             **kwargs: Additional arguments for fitting the specific method.
@@ -998,9 +972,8 @@ def fit(self, **kwargs):
     @abstractmethod
     def _test_single_contrast(self, contrast, **kwargs): ...
 
-    def test_contrasts(self, contrasts, **kwargs):
-        """
-        Perform a comparison as specified in a contrast vector.
+    def test_contrasts(self, contrasts: np.ndarray | Mapping[str | None, np.ndarray], **kwargs):
+        """Perform a comparison as specified in a contrast vector.
 
         Args:
             contrasts: Either a numeric contrast vector, or a dictionary of numeric contrast vectors.
@@ -1016,11 +989,11 @@ def test_contrasts(self, contrasts, **kwargs):
             results.append(self._test_single_contrast(contrast, **kwargs).assign(contrast=name))
 
         results_df = pd.concat(results)
+
         return results_df
 
     def test_reduced(self, modelB):
-        """
-        Test against a reduced model.
+        """Test against a reduced model.
 
         Args:
             modelB: the reduced model against which to test.
@@ -1034,8 +1007,7 @@ def test_reduced(self, modelB):
         raise NotImplementedError
 
     def cond(self, **kwargs):
-        """
-        Get a contrast vector representing a specific condition.
+        """Get a contrast vector representing a specific condition.
 
         Args:
             **kwargs: column/value pairs.
@@ -1043,52 +1015,14 @@ def cond(self, **kwargs):
         Returns:
             A contrast vector that aligns to the columns of the design matrix.
         """
-        if self.factor_storage is None:
+        if self.formulaic_contrasts is None:
             raise RuntimeError(
                 "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector."
             )
-        cond_dict = kwargs
-        if not set(cond_dict.keys()).issubset(self.variables):
-            raise ValueError(
-                "You specified a variable that is not part of the model. Available variables: "
-                + ",".join(self.variables)
-            )
-        for var in self.variables:
-            if var in cond_dict:
-                self._check_category(var, cond_dict[var])
-            else:
-                cond_dict[var] = self._get_default_value(var)
-        df = pd.DataFrame([kwargs])
-        return self.design.model_spec.get_model_matrix(df).iloc[0]
-
-    def _get_factor_metadata_for_variable(self, var):
-        factors = self.variable_to_factors[var]
-        return list(chain.from_iterable(self.factor_storage[f] for f in factors))
-
-    def _get_default_value(self, var):
-        factor_metadata = self._get_factor_metadata_for_variable(var)
-        if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL:
-            try:
-                tmp_base = resolve_ambiguous(factor_metadata, "base")
-            except AmbiguousAttributeError as e:
-                raise ValueError(
-                    f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`."
-                ) from e
-            return tmp_base if tmp_base is not None else "\0"
-        else:
-            return 0
+        return self.formulaic_contrasts.cond(**kwargs)
 
-    def _check_category(self, var, value):
-        factor_metadata = self._get_factor_metadata_for_variable(var)
-        tmp_categories = resolve_ambiguous(factor_metadata, "categories")
-        if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL and value not in tmp_categories:
-            raise ValueError(
-                f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}"
-            )
-
-    def contrast(self, column, baseline, group_to_compare):
-        """
-        Build a simple contrast for pairwise comparisons.
+    def contrast(self, *args, **kwargs):
+        """Build a simple contrast for pairwise comparisons.
 
         Args:
             column: column in adata.obs to test on.
@@ -1098,4 +1032,8 @@ def contrast(self, column, baseline, group_to_compare):
         Returns:
             Numeric contrast vector.
         """
-        return self.cond(**{column: group_to_compare}) - self.cond(**{column: baseline})
+        if self.formulaic_contrasts is None:
+            raise RuntimeError(
+                "Building contrasts with `cond` only works if you specified the model using a formulaic formula. Please manually provide a contrast vector."
+            )
+        return self.formulaic_contrasts.contrast(*args, **kwargs)
diff --git a/pertpy/tools/_differential_gene_expression/_edger.py b/pertpy/tools/_differential_gene_expression/_edger.py
index bf72ee9f..15afab6a 100644
--- a/pertpy/tools/_differential_gene_expression/_edger.py
+++ b/pertpy/tools/_differential_gene_expression/_edger.py
@@ -60,8 +60,8 @@ def fit(self, **kwargs):  # adata, design, mask, layer
         logger.info("Calculating NormFactors")
         dge = edger.calcNormFactors(dge)
 
-        with localconverter(get_conversion() + pandas2ri.converter):
-            design_r = ro.conversion.py2rpy(pd.DataFrame(self.design))
+        with localconverter(get_conversion() + numpy2ri.converter):
+            design_r = ro.conversion.py2rpy(self.design.values)
 
         logger.info("Estimating Dispersions")
         dge = edger.estimateDisp(dge, design=design_r)
diff --git a/pertpy/tools/_differential_gene_expression/_formulaic.py b/pertpy/tools/_differential_gene_expression/_formulaic.py
deleted file mode 100644
index b6647491..00000000
--- a/pertpy/tools/_differential_gene_expression/_formulaic.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""Helpers to interact with Formulaic Formulas
-
-Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
- * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
- * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
- * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
-"""
-
-from collections import defaultdict
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass
-from typing import Any
-
-from formulaic import FactorValues, ModelSpec
-from formulaic.materializers import PandasMaterializer
-from formulaic.materializers.types import EvaluatedFactor
-from formulaic.parser.types import Factor
-from interface_meta import override
-
-
-@dataclass
-class FactorMetadata:
-    """Store (relevant) metadata for a factor of a formula."""
-
-    name: str
-    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""
-
-    reduced_rank: bool
-    """Whether a column will be dropped because it is redundant"""
-
-    custom_encoder: bool
-    """Whether or not a custom encoder (e.g. `C(...)`) was used."""
-
-    categories: Sequence[str]
-    """The unique categories in this factor (after applying `drop_rows`)"""
-
-    kind: Factor.Kind
-    """Type of the factor"""
-
-    drop_field: str = None
-    """The category that is dropped.
-
-    Note that
-      * this may also be populated if `reduced_rank = False`
-      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`.
-    """
-
-    column_names: Sequence[str] = None
-    """The column names for this factor included in the design matrix.
-
-    This may be the same as `categories` if the default encoder is used, or
-    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
-    """
-
-    colname_format: str = None
-    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""
-
-    @property
-    def base(self) -> str | None:
-        """
-        The base category for this categorical factor.
-
-        This is derived from `drop_field` (for default encoding) or by comparing the column names in
-        the design matrix with all categories (for custom encoding, e.g. `C(...)`).
-        """
-        if not self.reduced_rank:
-            return None
-        else:
-            if self.custom_encoder:
-                tmp_base = set(self.categories) - set(self.column_names)
-                assert len(tmp_base) == 1
-                return tmp_base.pop()
-            else:
-                assert self.drop_field is not None
-                return self.drop_field
-
-
-def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
-    """Keeps track of categorical factors used in a model specification by generating a custom materializer.
-
-    This materializer reports back metadata upon materialization of the model matrix.
-
-    Returns:
-        - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
-        - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
-            but maps to factors rather than terms, named `variable_to_factors`.
-        - A materializer class tied to the specific instance of `factor_storage`.
-    """
-    # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction
-    # term, it generates the factor with both full rank and reduced rank.
-    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
-    variable_to_factors: dict[str, set[str]] = defaultdict(set)
-
-    class CustomPandasMaterializer(PandasMaterializer):
-        """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""
-
-        REGISTER_NAME = "custom_pandas"
-        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
-        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")
-
-        def __init__(
-            self,
-            data: Any,
-            context: Mapping[str, Any] | None = None,
-            record_factor_metadata: bool = False,
-            **params: Any,
-        ):
-            """Initialize the Materializer.
-
-            Args:
-                data: Passed to PandasMaterializer.
-                context: Passed to PandasMaterializer
-                record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
-                    is supposed to record factor metadata. Only the instance that is used for building the design
-                    matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
-                    should not record metadata to not overwrite the specifications from the design matrix.
-                **params: Passed to PandasMaterializer
-            """
-            self.factor_metadata_storage = factor_storage if record_factor_metadata else None
-            self.variable_to_factors = variable_to_factors if record_factor_metadata else None
-            # temporary pointer to metadata of factor that is currently evaluated
-            self._current_factor: FactorMetadata = None
-            super().__init__(data, context, **params)
-
-        @override
-        def _encode_evaled_factor(
-            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
-        ) -> dict[str, Any]:
-            """Function is called just before the factor is evaluated.
-
-            We can record some metadata, before we call the original function.
-            """
-            assert (
-                self._current_factor is None
-            ), "_current_factor should always be None when we start recording metadata"
-            if self.factor_metadata_storage is not None:
-                # Don't store if the factor is cached (then we should already have recorded it)
-                if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
-                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
-                else:
-                    for var in factor.variables:
-                        self.variable_to_factors[var].add(factor.expr)
-                    self._current_factor = FactorMetadata(
-                        name=factor.expr,
-                        reduced_rank=reduced_rank,
-                        categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
-                        custom_encoder=factor.metadata.encoder is not None,
-                        kind=factor.metadata.kind,
-                    )
-            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)
-
-        @override
-        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
-            """
-            Function is called at the end, before the design matrix gets materialized.
-
-            Here we have access to additional metadata, such as `drop_field`.
-            """
-            if self._current_factor is not None:
-                assert self._current_factor.name == name
-                self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
-                self._current_factor.column_names = values.__formulaic_metadata__.column_names
-                self._current_factor.colname_format = values.__formulaic_metadata__.format
-                self.factor_metadata_storage[name].append(self._current_factor)
-                self._current_factor = None
-
-            return super()._flatten_encoded_evaled_factor(name, values)
-
-    return factor_storage, variable_to_factors, CustomPandasMaterializer
-
-
-class AmbiguousAttributeError(ValueError):
-    pass
-
-
-def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
-    """Given a list of objects, return an attribute if it is the same between all object. Otherwise, raise an error."""
-    if not objs:
-        raise ValueError("Collection is empty")
-
-    first_obj_attr = getattr(objs[0], attr)
-
-    # Check if the attribute is the same for all objects
-    for obj in objs[1:]:
-        if getattr(obj, attr) != first_obj_attr:
-            raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")
-
-    # If attribute is the same for all objects, return it
-    return first_obj_attr
diff --git a/pertpy/tools/_differential_gene_expression/_pydeseq2.py b/pertpy/tools/_differential_gene_expression/_pydeseq2.py
index c82b6a49..d4360e28 100644
--- a/pertpy/tools/_differential_gene_expression/_pydeseq2.py
+++ b/pertpy/tools/_differential_gene_expression/_pydeseq2.py
@@ -2,6 +2,7 @@
 import re
 import warnings
 
+import numpy as np
 import pandas as pd
 from anndata import AnnData
 from numpy import ndarray
@@ -40,33 +41,25 @@ def fit(self, **kwargs) -> pd.DataFrame:
         Args:
             **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed.
         """
-        inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", os.cpu_count() - 1))
-        covars = self.design.columns.tolist()
-        if "Intercept" not in covars:
-            warnings.warn(
-                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
-            )
-        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
+        try:
+            usable_cpus = len(os.sched_getaffinity(0))
+        except AttributeError:
+            usable_cpus = os.cpu_count()
+
+        inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", usable_cpus))
+
         dds = DeseqDataSet(
-            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
+            adata=self.adata,
+            design=self.design,  # initialize using design matrix, not formula
+            refit_cooks=True,
+            inference=inference,
+            **kwargs,
         )
-        # workaround code to insert design array
-        des_mtx_cols = dds.obsm["design_matrix"].columns
-        dds.obsm["design_matrix"] = self.design
-        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
-            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()
 
         dds.deseq2()
         self.dds = dds
 
-    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
-    # see https://github.com/owkin/PyDESeq2/issues/213
-
-    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
-    # ingoring the inconsistency of function signatures. Once arbitrary design
-    # matrices and contrasts are supported by PyDEseq2, we can fully support the
-    # Linear model interface.
-    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
+    def _test_single_contrast(self, contrast, alpha=0.05, **kwargs) -> pd.DataFrame:
         """Conduct a specific test and returns a Pandas DataFrame.
 
         Args:
@@ -74,6 +67,7 @@ def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd
             alpha: p value threshold used for controlling fdr with independent hypothesis weighting
             **kwargs: extra arguments to pass to DeseqStats()
         """
+        contrast = np.array(contrast)
         stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
         # Calling `.summary()` is required to fill the `results_df` data frame
         stat_res.summary()
@@ -85,11 +79,3 @@ def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd
         res_df.index.name = "variable"
         res_df = res_df.reset_index()
         return res_df
-
-    def cond(self, **kwargs) -> ndarray:
-        raise NotImplementedError(
-            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
-        )
-
-    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
-        return (column, group_to_compare, baseline)
diff --git a/pertpy/tools/_differential_gene_expression/_statsmodels.py b/pertpy/tools/_differential_gene_expression/_statsmodels.py
index 02bfad98..cd8bc5fa 100644
--- a/pertpy/tools/_differential_gene_expression/_statsmodels.py
+++ b/pertpy/tools/_differential_gene_expression/_statsmodels.py
@@ -59,14 +59,3 @@ def _test_single_contrast(self, contrast, **kwargs) -> pd.DataFrame:
                 }
             )
         return pd.DataFrame(res).sort_values("p_value")
-
-    def contrast(self, column: str, baseline: str, group_to_compare: str) -> np.ndarray:
-        """Build a simple contrast for pairwise comparisons.
-
-        This is equivalent to
-
-        ```
-        model.cond(<column> = baseline) - model.cond(<column> = group_to_compare)
-        ```
-        """
-        return self.cond(**{column: baseline}) - self.cond(**{column: group_to_compare})
diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index f2442bd1..ec9001d4 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -344,9 +344,9 @@ def pairwise(
             else:
                 embedding = adata.obsm[self.obsm_key].copy()
             for index_x, group_x in enumerate(fct(groups)):
-                cells_x = embedding[grouping == group_x].copy()
+                cells_x = embedding[np.asarray(grouping == group_x)].copy()
                 for group_y in groups[index_x:]:  # type: ignore
-                    cells_y = embedding[grouping == group_y].copy()
+                    cells_y = embedding[np.asarray(grouping == group_y)].copy()
                     if not bootstrap:
                         # By distance axiom, the distance between a group and itself is 0
                         dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
@@ -478,9 +478,9 @@ def onesided_distances(
             else:
                 embedding = adata.obsm[self.obsm_key].copy()
             for group_x in fct(groups):
-                cells_x = embedding[grouping == group_x].copy()
+                cells_x = embedding[np.asarray(grouping == group_x)].copy()
                 group_y = selected_group
-                cells_y = embedding[grouping == group_y].copy()
+                cells_y = embedding[np.asarray(grouping == group_y)].copy()
                 if not bootstrap:
                     # By distance axiom, the distance between a group and itself is 0
                     dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
@@ -988,7 +988,7 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
             try:
                 nb_params = NegativeBinomialP(x, np.ones_like(x)).fit(disp=False).params
                 return _compute_nll(y, nb_params, epsilon)
-            except np.linalg.linalg.LinAlgError:
+            except np.linalg.LinAlgError:
                 if x.mean() < 10 and y.mean() < 10:
                     return 0.0
                 else:
diff --git a/pertpy/tools/_milo.py b/pertpy/tools/_milo.py
index 53bda6b9..afef5292 100644
--- a/pertpy/tools/_milo.py
+++ b/pertpy/tools/_milo.py
@@ -183,7 +183,7 @@ def make_nhoods(
             knn_dists = adata.obsp[neighbors_key + "_distances"]
 
         nhood_ixs = adata.obs["nhood_ixs_refined"] == 1
-        dist_mat = knn_dists[nhood_ixs, :]
+        dist_mat = knn_dists[np.asarray(nhood_ixs), :]
         k_distances = dist_mat.max(1).toarray().ravel()
         adata.obs["nhood_kth_distance"] = 0
         adata.obs["nhood_kth_distance"] = adata.obs["nhood_kth_distance"].astype(float)
@@ -704,8 +704,8 @@ def _graph_spatial_fdr(
         pvalues = sample_adata.var["PValue"]
         keep_nhoods = ~pvalues.isna()  # Filtering in case of test on subset of nhoods
         o = pvalues[keep_nhoods].argsort()
-        pvalues = pvalues[keep_nhoods][o]
-        w = w[keep_nhoods][o]
+        pvalues = pvalues.loc[keep_nhoods].iloc[o]
+        w = w.loc[keep_nhoods].iloc[o]
 
         adjp = np.zeros(shape=len(o))
         adjp[o] = (sum(w) * pvalues / np.cumsum(w))[::-1].cummin()[::-1]
diff --git a/pertpy/tools/_mixscape.py b/pertpy/tools/_mixscape.py
index fddc54c2..c015eaaa 100644
--- a/pertpy/tools/_mixscape.py
+++ b/pertpy/tools/_mixscape.py
@@ -104,7 +104,7 @@ def perturbation_signature(
             control_mask_split = control_mask & split_mask
 
             R_split = representation[split_mask]
-            R_control = representation[control_mask_split]
+            R_control = representation[np.asarray(control_mask_split)]
 
             from pynndescent import NNDescent
 
@@ -112,7 +112,7 @@ def perturbation_signature(
             nn_index = NNDescent(R_control, **kwargs)
             indices, _ = nn_index.query(R_split, k=n_neighbors, epsilon=eps)
 
-            X_control = np.expm1(adata.X[control_mask_split])
+            X_control = np.expm1(adata.X[np.asarray(control_mask_split)])
 
             n_split = split_mask.sum()
             n_control = X_control.shape[0]
@@ -256,7 +256,7 @@ def mixscape(
                 else:
                     de_genes = perturbation_markers[(category, gene)]
                     de_genes_indices = self._get_column_indices(adata, list(de_genes))
-                    dat = X[all_cells][:, de_genes_indices]
+                    dat = X[np.asarray(all_cells)][:, de_genes_indices]
                     converged = False
                     n_iter = 0
                     old_classes = adata.obs[labels][all_cells]
@@ -266,8 +266,8 @@ def mixscape(
                         # get average value for each gene over all selected cells
                         # all cells in current split&Gene minus all NT cells in current split
                         # Each row is for each cell, each column is for each gene, get mean for each column
-                        vec = np.mean(X[guide_cells][:, de_genes_indices], axis=0) - np.mean(
-                            X[nt_cells][:, de_genes_indices], axis=0
+                        vec = np.mean(X[np.asarray(guide_cells)][:, de_genes_indices], axis=0) - np.mean(
+                            X[np.asarray(nt_cells)][:, de_genes_indices], axis=0
                         )
                         # project cells onto the perturbation vector
                         if isinstance(dat, spmatrix):
@@ -596,7 +596,7 @@ def plot_barplot(  # pragma: no cover
             ax.set_title(gene, bbox={"facecolor": "white", "edgecolor": "black", "pad": 1}, fontsize=axis_title_size)
             ax.set(xlabel="sgRNA", ylabel="% of cells")
             sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
-            ax.set_xticks(ax.get_xticks(),ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size)
+            ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=0, ha="right", fontsize=axis_text_x_size)
             ax.set_yticks(ax.get_yticks(), ax.get_yticklabels(), rotation=0, fontsize=axis_text_y_size)
             ax.legend(
                 title="Mixscape Class",
@@ -614,7 +614,7 @@ def plot_barplot(  # pragma: no cover
         if show:
             plt.show()
         if return_fig:
-            return  fig
+            return fig
         return None
 
     @_doc_params(common_plot_args=doc_common_plot_args)
diff --git a/pyproject.toml b/pyproject.toml
index 31673b01..3c1a6b5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,8 +75,9 @@ coda = [
     "pyqt5"  # remove this after ete4 got released
 ]
 de = [
+    "formulaic-contrasts>=0.2.0",
     "formulaic",
-    "pydeseq2",
+    "pydeseq2>=v0.5.0pre1",
 ]
 dev = [
     "pre-commit",
diff --git a/tests/tools/_differential_gene_expression/test_edger.py b/tests/tools/_differential_gene_expression/test_edger.py
index 27b4584a..96cbe132 100644
--- a/tests/tools/_differential_gene_expression/test_edger.py
+++ b/tests/tools/_differential_gene_expression/test_edger.py
@@ -1,4 +1,5 @@
-from pertpy.tools._differential_gene_expression import EdgeR
+import numpy.testing as npt
+from pertpy.tools._differential_gene_expression import EdgeR, PyDESeq2
 
 
 def test_edger_simple(test_adata):
@@ -13,6 +14,28 @@ def test_edger_simple(test_adata):
     res_df = method.test_contrasts(method.contrast("condition", "A", "B"))
 
     assert len(res_df) == test_adata.n_vars
+    # Compare against snapshot
+    npt.assert_almost_equal(
+        res_df.p_value.values,
+        [
+            8.0000e-05,
+            1.8000e-04,
+            5.3000e-04,
+            1.1800e-03,
+            3.3800e-02,
+            3.3820e-02,
+            7.7980e-02,
+            1.3715e-01,
+            2.5052e-01,
+            9.2485e-01,
+        ],
+        decimal=4,
+    )
+    npt.assert_almost_equal(
+        res_df.log_fc.values,
+        [0.61208, -0.39374, 0.57944, 0.7343, -0.58675, 0.42575, -0.23951, -0.20761, 0.17489, 0.0247],
+        decimal=4,
+    )
 
 
 def test_edger_complex(test_adata):
@@ -28,6 +51,12 @@ def test_edger_complex(test_adata):
     # Check that the index of the result matches the var_names of the AnnData object
     assert set(test_adata.var_names) == set(res_df["variable"])
 
+    # Compare ranking of genes from a different method (without design matrix handling)
+    down_gene = res_df.set_index("variable").loc["gene3", "log_fc"]
+    up_gene = res_df.set_index("variable").loc["gene1", "log_fc"]
+    assert down_gene < up_gene
 
-# TODO: there should be a test checking if, for a concrete example, the output p-values and effect sizes are what
-# we expect (-> frozen snapshot, that way we also get a heads-up if something changes upstream)
+    method = PyDESeq2(adata=test_adata, design="~condition1+group")
+    method.fit()
+    deseq_res_df = method.test_contrasts(method.contrast("condition1", "A", "B"))
+    assert all(res_df.sort_values("log_fc")["variable"].values == deseq_res_df.sort_values("log_fc")["variable"].values)
diff --git a/tests/tools/_differential_gene_expression/test_formulaic.py b/tests/tools/_differential_gene_expression/test_formulaic.py
deleted file mode 100644
index f5305c5a..00000000
--- a/tests/tools/_differential_gene_expression/test_formulaic.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import pandas as pd
-import pytest
-from formulaic.parser.types import Factor
-from pertpy.tools._differential_gene_expression._formulaic import (
-    AmbiguousAttributeError,
-    FactorMetadata,
-    get_factor_storage_and_materializer,
-    resolve_ambiguous,
-)
-
-
-@pytest.mark.parametrize(
-    "formula,reorder_categorical,expected_factor_metadata",
-    [
-        [
-            "~ donor",
-            None,
-            {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}},
-        ],
-        [
-            "~ donor",
-            {"donor": ["D2", "D1", "D0", "D3"]},
-            {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D2"}},
-        ],
-        [
-            "~ C(donor)",
-            None,
-            {"C(donor)": {"reduced_rank": True, "custom_encoder": True, "base": "D0"}},
-        ],
-        [
-            "~ C(donor, contr.treatment(base='D2'))",
-            None,
-            {"C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}},
-        ],
-        [
-            "~ C(donor, contr.sum)",
-            None,
-            {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D3"}},
-        ],
-        [
-            "~ C(donor, contr.sum)",
-            {"donor": ["D1", "D0", "D3", "D2"]},
-            {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}},
-        ],
-        [
-            "~ condition",
-            None,
-            {"condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}},
-        ],
-        [
-            "~ C(condition)",
-            None,
-            {"C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"}},
-        ],
-        [
-            "~ C(condition, contr.treatment(base='B'))",
-            None,
-            {"C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"}},
-        ],
-        [
-            "~ C(condition, contr.sum)",
-            None,
-            {"C(condition, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "B"}},
-        ],
-        [
-            "~ 0 + condition",
-            None,
-            {"condition": {"reduced_rank": False, "custom_encoder": False, "base": None}},
-        ],
-        [
-            "~ condition + donor",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"},
-            },
-        ],
-        [
-            "~ 0 + condition + donor",
-            None,
-            {
-                "condition": {"reduced_rank": False, "custom_encoder": False, "base": None},
-                "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"},
-            },
-        ],
-        [
-            "~ condition * donor",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"},
-            },
-        ],
-        [
-            "~ condition * C(donor, contr.treatment(base='D2'))",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"},
-            },
-        ],
-        [
-            "~ condition + C(condition) + C(condition, contr.treatment(base='B'))",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"},
-                "C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"},
-            },
-        ],
-        [
-            "~ condition + continuous + np.log(continuous)",
-            None,
-            {
-                "condition": {
-                    "reduced_rank": True,
-                    "custom_encoder": False,
-                    "base": "A",
-                    "kind": Factor.Kind.CATEGORICAL,
-                },
-                "continuous": {
-                    "reduced_rank": False,
-                    "custom_encoder": False,
-                    "base": None,
-                    "kind": Factor.Kind.NUMERICAL,
-                },
-                "np.log(continuous)": {
-                    "reduced_rank": False,
-                    "custom_encoder": False,
-                    "base": None,
-                    "kind": Factor.Kind.NUMERICAL,
-                },
-            },
-        ],
-        [
-            "~ condition * donor + continuous",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"},
-                "continuous": {
-                    "reduced_rank": False,
-                    "custom_encoder": False,
-                    "base": None,
-                    "kind": Factor.Kind.NUMERICAL,
-                },
-            },
-        ],
-        [
-            "~ condition:donor",
-            None,
-            {
-                "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"},
-                "donor": {
-                    "custom_encoder": False,
-                    "drop_field": "D0",
-                },  # `reduced_rank` and `base` will be ambigous here because Formulaic generates both version of the factor internally
-            },
-        ],
-    ],
-)
-def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, expected_factor_metadata):
-    """Test that the custom materializer correctly stores the baseline category.
-
-    Parameters
-    ----------
-    test_adata_minimal
-        adata fixture
-    formula
-        Formula to test
-    reorder_categorical
-        Create a pandas categorical for a given column with a certain order of categories
-    expected_factor_metadata
-        dict with expected values for each factor
-    """
-    if reorder_categorical is not None:
-        for col, order in reorder_categorical.items():
-            test_adata_minimal.obs[col] = pd.Categorical(test_adata_minimal.obs[col], categories=order)
-    factor_storage, _, materializer = get_factor_storage_and_materializer()
-    materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula)
-    for factor, expected_metadata in expected_factor_metadata.items():
-        actual_metadata = factor_storage[factor]
-        for k in expected_metadata:
-            assert resolve_ambiguous(actual_metadata, k) == expected_metadata[k]
-
-
-def test_resolve_ambiguous():
-    obj1 = FactorMetadata("F1", True, True, ["A", "B"], Factor.Kind.CATEGORICAL)
-    obj2 = FactorMetadata("F2", True, False, ["A", "B"], Factor.Kind.CATEGORICAL)
-    obj3 = FactorMetadata("F3", True, False, None, Factor.Kind.NUMERICAL)
-
-    with pytest.raises(ValueError):
-        resolve_ambiguous([], "foo")
-
-    with pytest.raises(AttributeError):
-        resolve_ambiguous([obj1, obj2], "doesntexist")
-
-    with pytest.raises(AmbiguousAttributeError):
-        assert resolve_ambiguous([obj1, obj2], "name")
-
-    assert resolve_ambiguous([obj1, obj2, obj3], "reduced_rank") is True
-    assert resolve_ambiguous([obj1, obj2], "categories") == ["A", "B"]
-
-    with pytest.raises(AmbiguousAttributeError):
-        assert resolve_ambiguous([obj1, obj2, obj3], "categories")
-
-    with pytest.raises(AmbiguousAttributeError):
-        assert resolve_ambiguous([obj1, obj3], "kind")
diff --git a/tests/tools/_differential_gene_expression/test_pydeseq2.py b/tests/tools/_differential_gene_expression/test_pydeseq2.py
index aab7e3d1..438dcf75 100644
--- a/tests/tools/_differential_gene_expression/test_pydeseq2.py
+++ b/tests/tools/_differential_gene_expression/test_pydeseq2.py
@@ -1,3 +1,4 @@
+import numpy.testing as npt
 from pertpy.tools._differential_gene_expression import PyDESeq2
 
 
@@ -10,9 +11,20 @@ def test_pydeseq2_simple(test_adata):
     """
     method = PyDESeq2(adata=test_adata, design="~condition")
     method.fit()
-    res_df = method.test_contrasts(["condition", "A", "B"])
+    res_df = method.test_contrasts(method.contrast("condition", "A", "B"))
 
     assert len(res_df) == test_adata.n_vars
+    # Compare against snapshot
+    npt.assert_almost_equal(
+        res_df.p_value.values,
+        [0.00017, 0.00033, 0.00051, 0.0286, 0.03207, 0.04723, 0.11039, 0.11452, 0.3703, 0.99625],
+        decimal=4,
+    )
+    npt.assert_almost_equal(
+        res_df.log_fc.values,
+        [0.58207, 0.53855, -0.4121, 0.63281, -0.63283, -0.27066, -0.21271, 0.38601, 0.13434, 0.00146],
+        decimal=4,
+    )
 
 
 def test_pydeseq2_complex(test_adata):
@@ -22,12 +34,32 @@ def test_pydeseq2_complex(test_adata):
     test_adata.obs["condition1"] = test_adata.obs["condition"].copy()
     method = PyDESeq2(adata=test_adata, design="~condition1+group")
     method.fit()
-    res_df = method.test_contrasts(["condition1", "A", "B"])
+    res_df = method.test_contrasts(method.contrast("condition1", "A", "B"))
 
     assert len(res_df) == test_adata.n_vars
     # Check that the index of the result matches the var_names of the AnnData object
     assert set(test_adata.var_names) == set(res_df["variable"])
+    # Compare against snapshot
+    npt.assert_almost_equal(
+        res_df.p_value.values,
+        [7e-05, 0.00012, 0.00035, 0.01062, 0.01906, 0.03892, 0.10755, 0.11175, 0.36631, 0.94952],
+        decimal=4,
+    )
+    npt.assert_almost_equal(
+        res_df.log_fc.values,
+        [-0.42347, 0.58802, 0.53528, 0.73147, -0.67374, -0.27158, -0.21402, 0.38953, 0.13511, -0.01949],
+        decimal=4,
+    )
 
 
-# TODO: there should be a test checking if, for a concrete example, the output p-values and effect sizes are what
-# we expect (-> frozen snapshot, that way we also get a heads-up if something changes upstream)
+def test_pydeseq2_formula(test_adata):
+    """Check that the pyDESeq2 method gives consistent results when specifying contrasts, regardless of the order of covariates"""
+    model1 = PyDESeq2(adata=test_adata, design="~condition+group")
+    model1.fit()
+    res_1 = model1.test_contrasts(model1.contrast("condition", "A", "B"))
+
+    model2 = PyDESeq2(adata=test_adata, design="~group+condition")
+    model2.fit()
+    res_2 = model2.test_contrasts(model2.contrast("condition", "A", "B"))
+
+    npt.assert_almost_equal(res_2.log_fc.values, res_1.log_fc.values)