refactor sa test, add nelson alen

theislab · Feb 21, 2024 · 20ca992 · 20ca992
1 parent c3a11ac
commit 20ca992
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 39 deletions.
diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
@@ -60,9 +60,7 @@ def glm(
     adata: AnnData,
     var_names: Iterable[str] | None = None,
     formula: str | None = None,
-    family: Literal[
-        "Gaussian", "Binomial", "Gamma", "Gaussian", "InverseGaussian"
-    ] = "Gaussian",
+    family: Literal["Gaussian", "Binomial", "Gamma", "Gaussian", "InverseGaussian"] = "Gaussian",
     missing: Literal["none", "drop", "raise"] = "none",
     as_continuous: Iterable[str] | None | None = None,
 ) -> sm.GLM:
@@ -185,9 +183,7 @@ def test_kmf_logrank(
     kmf_A: KaplanMeierFitter,
     kmf_B: KaplanMeierFitter,
     t_0: float | None = -1,
-    weightings: (
-        Literal["wilcoxon", "tarone-ware", "peto", "fleming-harrington"] | None
-    ) = None,
+    weightings: Literal["wilcoxon", "tarone-ware", "peto", "fleming-harrington"] | None = None,
 ) -> StatisticalResult:
     """Calculates the p-value for the logrank test comparing the survival functions of two groups.
 
@@ -224,9 +220,7 @@ def test_kmf_logrank(
     return results_pairwise
 
 
-def test_nested_f_statistic(
-    small_model: GLMResultsWrapper, big_model: GLMResultsWrapper
-) -> float:
+def test_nested_f_statistic(small_model: GLMResultsWrapper, big_model: GLMResultsWrapper) -> float:
     """Given two fitted GLMs, the larger of which contains the parameter space of the smaller, return the P value corresponding to the larger model adding explanatory power.
 
     See https://stackoverflow.com/questions/27328623/anova-test-for-glm-in-python/60769343#60769343
@@ -239,22 +233,15 @@ def test_nested_f_statistic(
         float: p_value
     """
     addtl_params = big_model.df_model - small_model.df_model
-    f_stat = (small_model.deviance - big_model.deviance) / (
-        addtl_params * big_model.scale
-    )
+    f_stat = (small_model.deviance - big_model.deviance) / (addtl_params * big_model.scale)
     df_numerator = addtl_params
     df_denom = big_model.fittedvalues.shape[0] - big_model.df_model
     p_value = stats.f.sf(f_stat, df_numerator, df_denom)
 
     return p_value
 
 
-def anova_glm(
-    result_1: GLMResultsWrapper,
-    result_2: GLMResultsWrapper,
-    formula_1: str,
-    formula_2: str,
-) -> pd.DataFrame:
+def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_1: str, formula_2: str) -> pd.DataFrame:
     """Anova table for two fitted generalized linear models.
 
     Args:
@@ -280,9 +267,7 @@ def anova_glm(
     return dataframe
 
 
-def cox_ph(
-    adata: AnnData, duration_col: str, event_col: str, entry_col: str = None
-) -> CoxPHFitter:
+def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter:
     """Fit the Cox’s proportional hazard for the survival function.
 
     The Cox proportional hazards model (CoxPH) examines the relationship between the survival time of subjects and one or more predictor variables.
@@ -316,13 +301,12 @@ def cox_ph(
 
     return cph
 
-
 def nelson_alen(adata: AnnData, duration_col: str, event_col: str):
     df = anndata_to_df(adata)
     T = df[duration_col]
     E = df[event_col]
 
     naf = NelsonAalenFitter()
 
-    naf.fit(T, event_observed=E)
+    naf.fit(T,event_observed=E)
     return naf
diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 import statsmodels
-from lifelines import CoxPHFitter, KaplanMeierFitter
+from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter
 
 import ehrapy as ep
 
@@ -30,15 +30,6 @@ def test_glm(self):
         assert 5.778006344870297 == pytest.approx(Intercept)
         assert -0.06523274132877163 == pytest.approx(age)
 
-    def test_kmf(self):
-        adata = ep.dt.mimic_2(encoded=False)
-        adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
-        kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)
-
-        assert isinstance(kmf, KaplanMeierFitter)
-        assert len(kmf.durations) == 1776
-        assert sum(kmf.event_observed) == 497
-
     @pytest.mark.parametrize("weightings", ["wilcoxon", "tarone-ware", "peto", "fleming-harrington"])
     def test_calculate_logrank_pvalue(self, weightings):
         durations_A = [1, 2, 3]
@@ -75,12 +66,26 @@ def test_anova_glm(self):
         assert dataframe.shape == (2, 6)
         assert dataframe.iloc[1, 4] == 2
         assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185
-
-    def test_cox_ph(self):
+        
+    def prepare_mimic2_for_sa_test(self):
         adata = ep.dt.mimic_2(encoded=False)
         adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
-        cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
+        duration_col, event_col = "mort_day_censored", "censor_flg"
+        return adata, duration_col, event_col
+
+    def sa_func_test(self, sa_function, sa_class):
+        adata, duration_col, event_col = self.prepare_mimic2_for_sa_test()
 
-        assert isinstance(cph, CoxPHFitter)
-        assert len(cph.durations) == 1776
-        assert sum(cph.event_observed) == 497
+        sa = sa_function(adata, duration_col, event_col)
+        assert isinstance(sa, sa_class)
+        assert len(sa.durations) == 1776
+        assert sum(sa.event_observed) == 497
+
+    def test_kmf(self):
+        self.sa_func_test(self.ep.tl.kmf, KaplanMeierFitter)
+
+    def test_cox_ph(self):
+        self.sa_func_test(self.ep.tl.cox_ph, CoxPHFitter)
+
+    def test_nelson_alen(self):
+        self.sa_func_test(self.ep.tl.nelson_alen, NelsonAalenFitter)