Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coxphfitter #643

Merged
merged 14 commits into from
Jan 23, 2024
Merged
1 change: 1 addition & 0 deletions docs/usage/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ In contrast to a preprocessing function, a tool usually adds an easily interpret
tools.kmf
tools.test_kmf_logrank
tools.test_nested_f_statistic
tools.cox_ph
```

### Causal Inference
Expand Down
2 changes: 1 addition & 1 deletion ehrapy/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ehrapy.tools._sa import anova_glm, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
from ehrapy.tools._sa import anova_glm, cox_ph, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
from ehrapy.tools._scanpy_tl_api import * # noqa: F403
from ehrapy.tools.causal._dowhy import causal_inference
from ehrapy.tools.feature_ranking._rank_features_groups import filter_rank_features_groups, rank_features_groups
Expand Down
39 changes: 36 additions & 3 deletions ehrapy/tools/_sa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.statistics import StatisticalResult, logrank_test
from scipy import stats

from ehrapy.anndata import anndata_to_df

if TYPE_CHECKING:
from collections.abc import Iterable

Expand Down Expand Up @@ -119,8 +121,9 @@ def kmf(
censoring: Literal["right", "left"] = None,
) -> KaplanMeierFitter:
"""Fit the Kaplan-Meier estimate for the survival function.

See https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter
The Kaplan–Meier estimator, also known as the product limit estimator, is a non-parametric statistic used to estimate the survival function from lifetime data. In medical research, it is often used to measure the fraction of patients living for a certain amount of time after treatment.
See https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter
Class for fitting the Kaplan-Meier estimate for the survival function.

Args:
Expand Down Expand Up @@ -262,3 +265,33 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
}
dataframe = pd.DataFrame(data=table)
return dataframe


def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> KaplanMeierFitter:
"""Fit the Cox’s proportional hazard for the survival function.
fatisati marked this conversation as resolved.
Show resolved Hide resolved
The prominent assumption with Cox proportional hazards model is that, not surprisingly, the hazard functions are proportional. David Cox noticed that by enforcing that “simple” constraint on the form of the hazard model, a lot of difficult math and unstable optimization can be avoided.
See https://www.graphpad.com/guides/survival-analysis
https://lifelines.readthedocs.io/en/latest/fitters/regression/CoxPHFitter.html

Args:
adata: adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: the name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: the name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored.
entry_col: a column denoting when a subject entered the study, i.e. left-truncation.
Returns:
Fitted CoxPHFitter

fatisati marked this conversation as resolved.
Show resolved Hide resolved
Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Because in MIMIC-II database, `censor_fl` is censored or death (binary: 0 = death, 1 = censored).
>>> # While in KaplanMeierFitter, `event_observed` is True if the the death was observed, False if the event was lost (right-censored).
>>> # So we need to flip `censor_fl` when pass `censor_fl` to KaplanMeierFitter
>>> adata[:, ['censor_flg']].X = np.where(adata[:, ['censor_flg']].X == 0, 1, 0)
>>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
"""
df = ehrapy_ad.anndata_to_df(adata)
fatisati marked this conversation as resolved.
Show resolved Hide resolved
df = df[[duration_col, event_col, entry_col]]
cph = CoxPHFitter()
cph.fit(df, duration_col, event_col, entry_col=entry_col)
return cph
11 changes: 10 additions & 1 deletion tests/tools/test_sa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest
import statsmodels
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter, KaplanMeierFitter

import ehrapy as ep

Expand Down Expand Up @@ -75,3 +75,12 @@ def test_anova_glm(self):
assert dataframe.shape == (2, 6)
assert dataframe.iloc[1, 4] == 2
assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185

def test_cox_ph(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")

assert isinstance(cph, CoxPHFitter)
assert len(cph.durations) == 1776
assert sum(cph.event_observed) == 497
Loading