Skip to content

Commit

Permalink
refactor sa test, add nelson alen
Browse files Browse the repository at this point in the history
  • Loading branch information
fatisati committed Feb 21, 2024
1 parent c3a11ac commit 20ca992
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 39 deletions.
30 changes: 7 additions & 23 deletions ehrapy/tools/_sa.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ def glm(
adata: AnnData,
var_names: Iterable[str] | None = None,
formula: str | None = None,
family: Literal[
"Gaussian", "Binomial", "Gamma", "Gaussian", "InverseGaussian"
] = "Gaussian",
family: Literal["Gaussian", "Binomial", "Gamma", "Gaussian", "InverseGaussian"] = "Gaussian",
missing: Literal["none", "drop", "raise"] = "none",
as_continuous: Iterable[str] | None | None = None,
) -> sm.GLM:
Expand Down Expand Up @@ -185,9 +183,7 @@ def test_kmf_logrank(
kmf_A: KaplanMeierFitter,
kmf_B: KaplanMeierFitter,
t_0: float | None = -1,
weightings: (
Literal["wilcoxon", "tarone-ware", "peto", "fleming-harrington"] | None
) = None,
weightings: Literal["wilcoxon", "tarone-ware", "peto", "fleming-harrington"] | None = None,
) -> StatisticalResult:
"""Calculates the p-value for the logrank test comparing the survival functions of two groups.
Expand Down Expand Up @@ -224,9 +220,7 @@ def test_kmf_logrank(
return results_pairwise


def test_nested_f_statistic(
small_model: GLMResultsWrapper, big_model: GLMResultsWrapper
) -> float:
def test_nested_f_statistic(small_model: GLMResultsWrapper, big_model: GLMResultsWrapper) -> float:
"""Given two fitted GLMs, the larger of which contains the parameter space of the smaller, return the P value corresponding to the larger model adding explanatory power.
See https://stackoverflow.com/questions/27328623/anova-test-for-glm-in-python/60769343#60769343
Expand All @@ -239,22 +233,15 @@ def test_nested_f_statistic(
float: p_value
"""
addtl_params = big_model.df_model - small_model.df_model
f_stat = (small_model.deviance - big_model.deviance) / (
addtl_params * big_model.scale
)
f_stat = (small_model.deviance - big_model.deviance) / (addtl_params * big_model.scale)
df_numerator = addtl_params
df_denom = big_model.fittedvalues.shape[0] - big_model.df_model
p_value = stats.f.sf(f_stat, df_numerator, df_denom)

return p_value


def anova_glm(
result_1: GLMResultsWrapper,
result_2: GLMResultsWrapper,
formula_1: str,
formula_2: str,
) -> pd.DataFrame:
def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_1: str, formula_2: str) -> pd.DataFrame:
"""Anova table for two fitted generalized linear models.
Args:
Expand All @@ -280,9 +267,7 @@ def anova_glm(
return dataframe


def cox_ph(
adata: AnnData, duration_col: str, event_col: str, entry_col: str = None
) -> CoxPHFitter:
def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter:
"""Fit the Cox’s proportional hazard for the survival function.
The Cox proportional hazards model (CoxPH) examines the relationship between the survival time of subjects and one or more predictor variables.
Expand Down Expand Up @@ -316,13 +301,12 @@ def cox_ph(

return cph


def nelson_alen(adata: AnnData, duration_col: str, event_col: str):
df = anndata_to_df(adata)
T = df[duration_col]
E = df[event_col]

naf = NelsonAalenFitter()

naf.fit(T, event_observed=E)
naf.fit(T,event_observed=E)
return naf
37 changes: 21 additions & 16 deletions tests/tools/test_sa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest
import statsmodels
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter

import ehrapy as ep

Expand Down Expand Up @@ -30,15 +30,6 @@ def test_glm(self):
assert 5.778006344870297 == pytest.approx(Intercept)
assert -0.06523274132877163 == pytest.approx(age)

def test_kmf(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)

assert isinstance(kmf, KaplanMeierFitter)
assert len(kmf.durations) == 1776
assert sum(kmf.event_observed) == 497

@pytest.mark.parametrize("weightings", ["wilcoxon", "tarone-ware", "peto", "fleming-harrington"])
def test_calculate_logrank_pvalue(self, weightings):
durations_A = [1, 2, 3]
Expand Down Expand Up @@ -75,12 +66,26 @@ def test_anova_glm(self):
assert dataframe.shape == (2, 6)
assert dataframe.iloc[1, 4] == 2
assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185

def test_cox_ph(self):
def prepare_mimic2_for_sa_test(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
duration_col, event_col = "mort_day_censored", "censor_flg"
return adata, duration_col, event_col

def sa_func_test(self, sa_function, sa_class):
adata, duration_col, event_col = self.prepare_mimic2_for_sa_test()

assert isinstance(cph, CoxPHFitter)
assert len(cph.durations) == 1776
assert sum(cph.event_observed) == 497
sa = sa_function(adata, duration_col, event_col)
assert isinstance(sa, sa_class)
assert len(sa.durations) == 1776
assert sum(sa.event_observed) == 497

def test_kmf(self):
self.sa_func_test(self.ep.tl.kmf, KaplanMeierFitter)

def test_cox_ph(self):
self.sa_func_test(self.ep.tl.cox_ph, CoxPHFitter)

def test_nelson_alen(self):
self.sa_func_test(self.ep.tl.nelson_alen, NelsonAalenFitter)

0 comments on commit 20ca992

Please sign in to comment.