theislab · Zethson · Jan 23, 2024 · Jan 22, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/docs/usage/usage.md b/docs/usage/usage.md
@@ -241,6 +241,7 @@ In contrast to a preprocessing function, a tool usually adds an easily interpret
     tools.kmf
     tools.test_kmf_logrank
     tools.test_nested_f_statistic
+    tools.cox_ph
 ```
 
 ### Causal Inference

diff --git a/ehrapy/tools/__init__.py b/ehrapy/tools/__init__.py
@@ -1,4 +1,4 @@
-from ehrapy.tools._sa import anova_glm, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
+from ehrapy.tools._sa import anova_glm, cox_ph, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
 from ehrapy.tools._scanpy_tl_api import *  # noqa: F403
 from ehrapy.tools.causal._dowhy import causal_inference
 from ehrapy.tools.feature_ranking._rank_features_groups import filter_rank_features_groups, rank_features_groups

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
@@ -6,10 +6,12 @@
 import pandas as pd
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
-from lifelines import KaplanMeierFitter
+from lifelines import CoxPHFitter, KaplanMeierFitter
 from lifelines.statistics import StatisticalResult, logrank_test
 from scipy import stats
 
+from ehrapy.anndata import anndata_to_df
+
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
@@ -119,8 +121,9 @@ def kmf(
     censoring: Literal["right", "left"] = None,
 ) -> KaplanMeierFitter:
     """Fit the Kaplan-Meier estimate for the survival function.
-
-    See https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter
+    The Kaplan–Meier estimator, also known as the product limit estimator, is a non-parametric statistic used to estimate the survival function from lifetime data. In medical research, it is often used to measure the fraction of patients living for a certain amount of time after treatment.
+    See https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
+        https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter
     Class for fitting the Kaplan-Meier estimate for the survival function.
 
     Args:
@@ -262,3 +265,33 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
     }
     dataframe = pd.DataFrame(data=table)
     return dataframe
+
+
+def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter:
+    """Fit the Cox proportional hazards model for the survival function.
+
+    The prominent assumption of the Cox proportional hazards model is that, not surprisingly, the hazard functions are proportional. David Cox noticed that by enforcing that "simple" constraint on the form of the hazard model, a lot of difficult math and unstable optimization can be avoided.
+    See https://www.graphpad.com/guides/survival-analysis
+        https://lifelines.readthedocs.io/en/latest/fitters/regression/CoxPHFitter.html
+
+    Args:
+        adata: AnnData object with necessary columns `duration_col` and `event_col`.
+        duration_col: The name of the column in the AnnData object that contains the subjects' lifetimes.
+        event_col: The name of the column in the AnnData object that contains the subjects' death observation. If left as None, assume all individuals are uncensored.
+        entry_col: A column denoting when a subject entered the study, i.e. left-truncation. If None, no entry column is selected or passed on.
+    Returns:
+        Fitted CoxPHFitter. Only the columns named by the arguments above are handed to the fitter.
+
+    Examples:
+        >>> import numpy as np
+        >>> import ehrapy as ep
+        >>> adata = ep.dt.mimic_2(encoded=False)
+        >>> # In MIMIC-II, `censor_flg` is 0 = death and 1 = censored, but `event_col` must be 1 if the death was observed, so flip it.
+        >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
+        >>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
+    """
+    # Optional columns default to None; None is not a valid column label, so subset only by the names actually given.
+    columns = [col for col in (duration_col, event_col, entry_col) if col is not None]
+    cph = CoxPHFitter()
+    cph.fit(anndata_to_df(adata)[columns], duration_col, event_col, entry_col=entry_col)
+    return cph
diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 import statsmodels
-from lifelines import KaplanMeierFitter
+from lifelines import CoxPHFitter, KaplanMeierFitter
 
 import ehrapy as ep
 
@@ -75,3 +75,12 @@ def test_anova_glm(self):
         assert dataframe.shape == (2, 6)
         assert dataframe.iloc[1, 4] == 2
         assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185
+
+    def test_cox_ph(self):
+        # Fit on MIMIC-II after flipping `censor_flg` (0 becomes 1, every other value becomes 0).
+        adata = ep.dt.mimic_2(encoded=False)
+        adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
+        fitted = ep.tl.cox_ph(adata, duration_col="mort_day_censored", event_col="censor_flg")
+        assert isinstance(fitted, CoxPHFitter)
+        assert len(fitted.durations) == 1776
+        assert sum(fitted.event_observed) == 497