From b1d36b892645732d36a9336f67d0d06f482e9a10 Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:08:58 +0100 Subject: [PATCH 01/25] cox_ph add all arguments --- ehrapy/tools/_sa.py | 161 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 12 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index fed63b9e..241e5dee 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -3,7 +3,6 @@ import warnings from typing import TYPE_CHECKING, Literal -import numpy as np # This package is implicitly used import pandas as pd import statsmodels.api as sm import statsmodels.formula.api as smf @@ -23,6 +22,7 @@ if TYPE_CHECKING: from collections.abc import Iterable + import numpy as np from anndata import AnnData from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper @@ -347,9 +347,7 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_ return dataframe -def _regression_model( - model_class, adata: AnnData, duration_col: str, event_col: str, entry_col: str = None, accept_zero_duration=True -): +def _regression_model_data_frame_preparation(adata: AnnData, duration_col: str, accept_zero_duration=True): """Convenience function for regression models.""" df = anndata_to_df(adata) df = df.dropna() @@ -357,13 +355,35 @@ def _regression_model( if not accept_zero_duration: df.loc[df[duration_col] == 0, duration_col] += 1e-5 - model = model_class() - model.fit(df, duration_col, event_col, entry_col=entry_col) - - return model + return df -def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter: +def cox_ph( + adata: AnnData, + duration_col: str, + *, + inplace: bool = True, + key_added_prefix: str | None = None, + alpha: float = 0.05, + label: str | None = None, + baseline_estimation_method: Literal["breslow", "spline", "piecewise"] = "breslow", + penalizer: float | np.ndarray = 0.0, + l1_ratio: float = 0.0, + strata: list[str] | str | None = None, + n_baseline_knots: int = 4, + knots: list[float] | None = None, + breakpoints: list[float] | None = None, + event_col: str = None, + weights_col: str | None = None, + cluster_col: str | None = None, + entry_col: str = None, + robust: bool = False, + formula: str = None, + batch_mode: bool = None, + show_progress: bool = False, + initial_point: np.ndarray | None = None, + fit_options: dict | None = None, +) -> CoxPHFitter: """Fit the Cox’s proportional hazard for the survival function. The Cox proportional hazards model (CoxPH) examines the relationship between the survival time of subjects and one or more predictor variables. @@ -376,7 +396,26 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. event_col: The name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. + inplace: Whether to modify the AnnData object in place. + alpha: The alpha value in the confidence intervals. + label: A string to name the column of the estimate. + baseline_estimation_method: The method used to estimate the baseline hazard. Options are 'breslow', 'spline', and 'piecewise'. + penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. 
+ l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above. + strata: specify a list of columns to use in stratification. This is useful if a categorical covariate does not obey the proportional hazard assumption. This is used similar to the strata expression in R. See http://courses.washington.edu/b515/l17.pdf. + n_baseline_knots: Used when baseline_estimation_method="spline". Set the number of knots (interior & exterior) in the baseline hazard, which will be placed evenly along the time axis. Should be at least 2. Royston et. al, the authors of this model, suggest 4 to start, but any values between 2 and 8 are reasonable. If you need to customize the timestamps used to calculate the curve, use the knots parameter instead. + knots: When baseline_estimation_method="spline", this allows customizing the points in the time axis for the baseline hazard curve. To use evenly-spaced points in time, the n_baseline_knots parameter can be employed instead. + breakpoints: Used when baseline_estimation_method="piecewise". Set the positions of the baseline hazard breakpoints. + event_col: he name of the column in DataFrame that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. + weights_col: The name of the column in DataFrame that contains the weights for each subject. + cluster_col: The name of the column in DataFrame that contains the cluster variable. Using this forces the sandwich estimator (robust variance estimator) to be used. entry_col: Column denoting when a subject entered the study, i.e. left-truncation. + robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ. + formula: an Wilkinson formula, like in R and statsmodels, for the right-hand-side. If left as None, all columns not assigned as durations, weights, etc. are used. Uses the library Formulaic for parsing. + batch_mode: enabling batch_mode can be faster for datasets with a large number of ties. If left as None, lifelines will choose the best option. + show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing. + initial_point: set the starting point for the iterative solver. + fit_options: Additional keyword arguments to pass into the estimator. Returns: Fitted CoxPHFitter. 
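A minimal usage sketch of the widened cox_ph interface added above, not taken from the patch itself: it reuses the mimic_2 columns from the existing docstring example, assumes an 'age' column as the single covariate, and the penalizer value is only a placeholder forwarded to lifelines.CoxPHFitter.

import numpy as np
import ehrapy as ep

adata = ep.dt.mimic_2(encoded=False)
# Flip 'censor_flg' because 0 = death and 1 = censored (as in the docstring example)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
adata = adata[:, ["mort_day_censored", "censor_flg", "age"]].copy()  # 'age' is an assumed covariate

cph = ep.tl.cox_ph(
    adata,
    "mort_day_censored",
    event_col="censor_flg",  # keyword-only in the new signature
    penalizer=0.1,           # placeholder value, passed through to lifelines
    robust=True,             # Huber sandwich (Wei-Lin) standard errors
)
print(cph.summary)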
@@ -388,10 +427,80 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) >>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg") """ - return _regression_model(CoxPHFitter, adata, duration_col, event_col, entry_col) + df = _regression_model_data_frame_preparation(adata, duration_col) + cox_ph = CoxPHFitter( + alpha=alpha, + label=label, + strata=strata, + baseline_estimation_method=baseline_estimation_method, + penalizer=penalizer, + l1_ratio=l1_ratio, + n_baseline_knots=n_baseline_knots, + knots=knots, + breakpoints=breakpoints, + ) + cox_ph.fit( + df, + duration_col=duration_col, + event_col=event_col, + entry_col=entry_col, + robust=robust, + initial_point=initial_point, + weights_col=weights_col, + cluster_col=cluster_col, + batch_mode=batch_mode, + formula=formula, + fit_options=fit_options, + show_progress=show_progress, + ) + + # Add the results to the AnnData object + if inplace: + if key_added_prefix is None: + key_added_prefix = "" + else: + key_added_prefix = key_added_prefix + "_" + + cox_ph_summary = cox_ph.summary + print(cox_ph_summary) + + full_results = pd.DataFrame(index=adata.var.index) + + # Populate with CoxPH summary data + for key in cox_ph_summary.columns: + full_results[key_added_prefix + key] = cox_ph_summary[key] + + # Add a boolean column indicating rows populated by this function + full_results[key_added_prefix + "cox_ph_populated"] = full_results.notna().any(axis=1) + + # Assign results back to adata.var + for col in full_results.columns: + adata.var[col] = full_results[col] + + return cox_ph -def weibull_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> WeibullAFTFitter: +def weibull_aft( + adata: AnnData, + duration_col: str, + *, + inplace: bool = True, + key_added_prefix: str | None = None, + alpha: float = 0.05, + fit_intercept: bool = True, + penalizer: float | np.ndarray = 0.0, + l1_ratio: float = 0.0, + model_ancillary: bool = True, + event_col: str | None = None, + ancillary: bool | pd.DataFrame | None = None, + show_progress: bool = False, + weights_col: str | None = None, + robust: bool = False, + initial_point=None, + entry_col: str | None = None, + formula: str | None = None, + fit_options: dict | None = None, +) -> WeibullAFTFitter: """Fit the Weibull accelerated failure time regression for the survival function. The Weibull Accelerated Failure Time (AFT) survival regression model is a statistical method used to analyze time-to-event data, @@ -417,10 +526,22 @@ def weibull_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: st >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) >>> aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg") """ + return _regression_model(WeibullAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False) -def log_logistic_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> LogLogisticAFTFitter: +def log_logistic_aft( + adata: AnnData, + duration_col: str, + *, + alpha: float = 0.05, + fit_intercept: bool = True, + penalizer: float | np.ndarray = 0.0, + l1_ratio: float = 0.0, + model_ancillary: bool = False, + event_col: str = None, + entry_col: str = None, +) -> LogLogisticAFTFitter: """Fit the log logistic accelerated failure time regression for the survival function. 
The Log-Logistic Accelerated Failure Time (AFT) survival regression model is a powerful statistical tool employed in the analysis of time-to-event data. This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times. @@ -450,6 +571,22 @@ def log_logistic_aft(adata: AnnData, duration_col: str, event_col: str, entry_co ) +def _regression_model( + model_class, adata: AnnData, duration_col: str, event_col: str, entry_col: str = None, accept_zero_duration=True +): + """Convenience function for regression models.""" + df = anndata_to_df(adata) + df = df.dropna() + + if not accept_zero_duration: + df.loc[df[duration_col] == 0, duration_col] += 1e-5 + + model = model_class() + model.fit(df, duration_col, event_col, entry_col=entry_col) + + return model + + def _univariate_model( adata: AnnData, duration_col: str, From 35dbacf085a7ae3d7965b05254d7aa16cbe91ef3 Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:29:57 +0100 Subject: [PATCH 02/25] updated test to use keywords --- tests/tools/test_sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py index 48d85b36..e2bfec29 100644 --- a/tests/tools/test_sa.py +++ b/tests/tools/test_sa.py @@ -92,7 +92,7 @@ def _sa_function_assert(self, model, model_class): def _sa_func_test(self, sa_function, sa_class, mimic_2_sa): adata, duration_col, event_col = mimic_2_sa - sa = sa_function(adata, duration_col, event_col) + sa = sa_function(adata, duration_col=duration_col, event_col=event_col) self._sa_function_assert(sa, sa_class) def test_kmf(self, mimic_2_sa): From 22d190a19c07aa0b8fab49cc5bfa36e33f4bacce Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:47:09 +0100 Subject: [PATCH 03/25] weibull_aft arguments update --- ehrapy/tools/_sa.py | 102 +++++++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 25 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index 241e5dee..13b74a8f 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -358,6 +358,26 @@ def _regression_model_data_frame_preparation(adata: AnnData, duration_col: str, return df +def _regression_model_populate_adata(adata: AnnData, model_summary: pd.DataFrame, key_added_prefix: str = None): + if key_added_prefix is None: + key_added_prefix = "" + else: + key_added_prefix = key_added_prefix + "_" + + full_results = pd.DataFrame(index=adata.var.index) + + # Populate with CoxPH summary data + for key in model_summary.columns: + full_results[key_added_prefix + key] = model_summary[key] + + # Add a boolean column indicating rows populated by this function + full_results[key_added_prefix + "cox_ph_populated"] = full_results.notna().any(axis=1) + + # Assign results back to adata.var + for col in full_results.columns: + adata.var[col] = full_results[col] + + def cox_ph( adata: AnnData, duration_col: str, @@ -397,6 +417,7 @@ def cox_ph( event_col: The name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. inplace: Whether to modify the AnnData object in place. + key_added_prefix: Prefix to add to the column names in the AnnData object. 
An underscore will be added between the prefix and the column alpha: The alpha value in the confidence intervals. label: A string to name the column of the estimate. baseline_estimation_method: The method used to estimate the baseline hazard. Options are 'breslow', 'spline', and 'piecewise'. @@ -456,26 +477,7 @@ def cox_ph( # Add the results to the AnnData object if inplace: - if key_added_prefix is None: - key_added_prefix = "" - else: - key_added_prefix = key_added_prefix + "_" - - cox_ph_summary = cox_ph.summary - print(cox_ph_summary) - - full_results = pd.DataFrame(index=adata.var.index) - - # Populate with CoxPH summary data - for key in cox_ph_summary.columns: - full_results[key_added_prefix + key] = cox_ph_summary[key] - - # Add a boolean column indicating rows populated by this function - full_results[key_added_prefix + "cox_ph_populated"] = full_results.notna().any(axis=1) - - # Assign results back to adata.var - for col in full_results.columns: - adata.var[col] = full_results[col] + _regression_model_populate_adata(adata, cox_ph.summary, key_added_prefix) return cox_ph @@ -492,7 +494,7 @@ def weibull_aft( l1_ratio: float = 0.0, model_ancillary: bool = True, event_col: str | None = None, - ancillary: bool | pd.DataFrame | None = None, + ancillary: bool | pd.DataFrame | str | None = None, show_progress: bool = False, weights_col: str | None = None, robust: bool = False, @@ -512,9 +514,29 @@ def weibull_aft( Args: adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: Name of the column in anndata that contains the subjects’ death observation. + inplace: Whether to modify the AnnData object in place. + key_added_prefix: Prefix to add to the column names in the AnnData object. An underscore will be added between the prefix and the column name. + alpha: The alpha value in the confidence intervals. + fit_intercept: Whether to fit an intercept term in the model. + penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. + l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above. + model_ancillary: set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization. + event_col: Name of the column in anndata that contains the subjects’ death observation. 1 if observed, 0 else (censored). If left as None, assume all individuals are uncensored. + ancillary: Choose to model the ancillary parameters. + If None or False, explicitly do not fit the ancillary parameters using any covariates. + If True, model the ancillary parameters with the same covariates as ``df``. + If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``. + If str, should be a formula + show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing. + weights_col: The name of the column in DataFrame that contains the weights for each subject. + robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ. + initial_point: set the starting point for the iterative solver. entry_col: Column denoting when a subject entered the study, i.e. 
left-truncation. + formula: Use an R-style formula for modeling the dataset. See formula syntax: https://matthewwardrop.github.io/formulaic/basic/grammar/ + If a formula is not provided, all variables in the dataframe are used (minus those used for other purposes like event_col, etc.) + fit_options: Additional keyword arguments to pass into the estimator. + Returns: Fitted WeibullAFTFitter. @@ -522,12 +544,41 @@ def weibull_aft( Examples: >>> import ehrapy as ep >>> adata = ep.dt.mimic_2(encoded=False) - >>> # Flip 'censor_fl' because 0 = death and 1 = censored >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) - >>> aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg") + >>> adata = adata[:, ["mort_day_censored", "censor_flg"]] + >>> aft = ep.tl.weibull_aft(adata, duration_col="mort_day_censored", event_col="censor_flg") + >>> aft.print_summary() """ - return _regression_model(WeibullAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False) + df = _regression_model_data_frame_preparation(adata, duration_col, accept_zero_duration=False) + + weibull_aft = WeibullAFTFitter( + alpha=alpha, + fit_intercept=fit_intercept, + penalizer=penalizer, + l1_ratio=l1_ratio, + model_ancillary=model_ancillary, + ) + + weibull_aft.fit( + df, + duration_col=duration_col, + event_col=event_col, + entry_col=entry_col, + ancillary=ancillary, + show_progress=show_progress, + weights_col=weights_col, + robust=robust, + initial_point=initial_point, + formula=formula, + fit_options=fit_options, + ) + + # Add the results to the AnnData object + if inplace: + _regression_model_populate_adata(adata, weibull_aft.summary, key_added_prefix) + + return weibull_aft def log_logistic_aft( @@ -566,6 +617,7 @@ def log_logistic_aft( >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) >>> llf = ep.tl.log_logistic_aft(adata, "mort_day_censored", "censor_flg") """ + return _regression_model( LogLogisticAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False ) From 742d38ceb16519bbf41b8bbe15c1c46bf7453ebb Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:14:51 +0100 Subject: [PATCH 04/25] log_logistic update --- ehrapy/tools/_sa.py | 74 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index 13b74a8f..395a54bf 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -585,13 +585,22 @@ def log_logistic_aft( adata: AnnData, duration_col: str, *, + inplace: bool = True, + key_added_prefix: str | None = None, alpha: float = 0.05, fit_intercept: bool = True, penalizer: float | np.ndarray = 0.0, l1_ratio: float = 0.0, model_ancillary: bool = False, - event_col: str = None, - entry_col: str = None, + event_col: str | None = None, + ancillary: bool | pd.DataFrame | str | None = None, + show_progress: bool = False, + weights_col: str | None = None, + robust: bool = False, + initial_point=None, + entry_col: str | None = None, + formula: str | None = None, + fit_options: dict | None = None, ) -> LogLogisticAFTFitter: """Fit the log logistic accelerated failure time regression for the survival function. The Log-Logistic Accelerated Failure Time (AFT) survival regression model is a powerful statistical tool employed in the analysis of time-to-event data. 
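A short sketch of how the expanded log_logistic_aft signature above could be exercised, continuing from the prepared mimic_2 `adata` of the earlier examples (duration column, flipped censor flag, plus an assumed 'age' covariate); the ancillary and robust choices are illustrative, not part of the patch.

llf = ep.tl.log_logistic_aft(
    adata,
    duration_col="mort_day_censored",
    event_col="censor_flg",
    ancillary=True,   # also model the ancillary parameter with the same covariates
    robust=True,      # sandwich standard errors, forwarded to lifelines' fit()
)
llf.print_summary()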
@@ -603,9 +612,29 @@ def log_logistic_aft( Args: adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: Name of the column in anndata that contains the subjects’ death observation. + inplace: Whether to modify the AnnData object in place. + key_added_prefix: Prefix to add to the column names in the AnnData object. An underscore will be added between the prefix and the column + alpha: The alpha value in the confidence intervals. + alpha: The alpha value in the confidence intervals. + fit_intercept: Whether to fit an intercept term in the model. + penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. + l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above. + model_ancillary: set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization. + event_col: Name of the column in anndata that contains the subjects’ death observation. 1 if observed, 0 else (censored). If left as None, assume all individuals are uncensored. + ancillary: Choose to model the ancillary parameters. + If None or False, explicitly do not fit the ancillary parameters using any covariates. + If True, model the ancillary parameters with the same covariates as ``df``. + If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``. + If str, should be a formula + show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing. + weights_col: The name of the column in DataFrame that contains the weights for each subject. + robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ. + initial_point: set the starting point for the iterative solver. entry_col: Column denoting when a subject entered the study, i.e. left-truncation. + formula: Use an R-style formula for modeling the dataset. See formula syntax: https://matthewwardrop.github.io/formulaic/basic/grammar/ + If a formula is not provided, all variables in the dataframe are used (minus those used for other purposes like event_col, etc.) + fit_options: Additional keyword arguments to pass into the estimator. Returns: Fitted LogLogisticAFTFitter. 
@@ -617,26 +646,35 @@ def log_logistic_aft( >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) >>> llf = ep.tl.log_logistic_aft(adata, "mort_day_censored", "censor_flg") """ + df = _regression_model_data_frame_preparation(adata, duration_col, accept_zero_duration=False) - return _regression_model( - LogLogisticAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False + log_logistic_aft = LogLogisticAFTFitter( + alpha=alpha, + fit_intercept=fit_intercept, + penalizer=penalizer, + l1_ratio=l1_ratio, + model_ancillary=model_ancillary, ) + log_logistic_aft.fit( + df, + duration_col=duration_col, + event_col=event_col, + entry_col=entry_col, + ancillary=ancillary, + show_progress=show_progress, + weights_col=weights_col, + robust=robust, + initial_point=initial_point, + formula=formula, + fit_options=fit_options, + ) -def _regression_model( - model_class, adata: AnnData, duration_col: str, event_col: str, entry_col: str = None, accept_zero_duration=True -): - """Convenience function for regression models.""" - df = anndata_to_df(adata) - df = df.dropna() - - if not accept_zero_duration: - df.loc[df[duration_col] == 0, duration_col] += 1e-5 - - model = model_class() - model.fit(df, duration_col, event_col, entry_col=entry_col) + # Add the results to the AnnData object + if inplace: + _regression_model_populate_adata(adata, log_logistic_aft.summary, key_added_prefix) - return model + return log_logistic_aft def _univariate_model( From 02e343de3675138ae65b76498eea0948562e37b8 Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:28:55 +0100 Subject: [PATCH 05/25] updated log logistic example --- ehrapy/tools/_sa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index 395a54bf..b8e23f03 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -644,7 +644,8 @@ def log_logistic_aft( >>> adata = ep.dt.mimic_2(encoded=False) >>> # Flip 'censor_fl' because 0 = death and 1 = censored >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0) - >>> llf = ep.tl.log_logistic_aft(adata, "mort_day_censored", "censor_flg") + >>> adata = adata[:, ["mort_day_censored", "censor_flg"]] + >>> llf = ep.tl.log_logistic_aft(adata, duration_col="mort_day_censored", event_col="censor_flg") """ df = _regression_model_data_frame_preparation(adata, duration_col, accept_zero_duration=False) From 8e1baa50937ef8c2db386ca0f2610b07df3690dc Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:56:56 +0100 Subject: [PATCH 06/25] store summary df in adata.uns --- ehrapy/tools/_sa.py | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index b8e23f03..a2a1029f 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -358,32 +358,12 @@ def _regression_model_data_frame_preparation(adata: AnnData, duration_col: str, return df -def _regression_model_populate_adata(adata: AnnData, model_summary: pd.DataFrame, key_added_prefix: str = None): - if key_added_prefix is None: - key_added_prefix = "" - else: - key_added_prefix = key_added_prefix + "_" - - full_results = pd.DataFrame(index=adata.var.index) - - # Populate with CoxPH summary data - for key in model_summary.columns: - full_results[key_added_prefix + key] = model_summary[key] - - # Add a boolean column 
indicating rows populated by this function - full_results[key_added_prefix + "cox_ph_populated"] = full_results.notna().any(axis=1) - - # Assign results back to adata.var - for col in full_results.columns: - adata.var[col] = full_results[col] - - def cox_ph( adata: AnnData, duration_col: str, *, inplace: bool = True, - key_added_prefix: str | None = None, + uns_key: str = "cox_ph", alpha: float = 0.05, label: str | None = None, baseline_estimation_method: Literal["breslow", "spline", "piecewise"] = "breslow", @@ -417,7 +397,7 @@ def cox_ph( event_col: The name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. inplace: Whether to modify the AnnData object in place. - key_added_prefix: Prefix to add to the column names in the AnnData object. An underscore will be added between the prefix and the column + uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. label: A string to name the column of the estimate. baseline_estimation_method: The method used to estimate the baseline hazard. Options are 'breslow', 'spline', and 'piecewise'. @@ -477,7 +457,8 @@ def cox_ph( # Add the results to the AnnData object if inplace: - _regression_model_populate_adata(adata, cox_ph.summary, key_added_prefix) + summary = cox_ph.summary + adata.uns[uns_key] = summary return cox_ph @@ -487,7 +468,7 @@ def weibull_aft( duration_col: str, *, inplace: bool = True, - key_added_prefix: str | None = None, + uns_key: str = "weibull_aft", alpha: float = 0.05, fit_intercept: bool = True, penalizer: float | np.ndarray = 0.0, @@ -515,7 +496,7 @@ def weibull_aft( adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. inplace: Whether to modify the AnnData object in place. - key_added_prefix: Prefix to add to the column names in the AnnData object. An underscore will be added between the prefix and the column name. + uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. fit_intercept: Whether to fit an intercept term in the model. penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. @@ -576,7 +557,8 @@ def weibull_aft( # Add the results to the AnnData object if inplace: - _regression_model_populate_adata(adata, weibull_aft.summary, key_added_prefix) + summary = weibull_aft.summary + adata.uns[uns_key] = summary return weibull_aft @@ -586,7 +568,7 @@ def log_logistic_aft( duration_col: str, *, inplace: bool = True, - key_added_prefix: str | None = None, + uns_key: str = "log_logistic_aft", alpha: float = 0.05, fit_intercept: bool = True, penalizer: float | np.ndarray = 0.0, @@ -613,7 +595,7 @@ def log_logistic_aft( adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. inplace: Whether to modify the AnnData object in place. - key_added_prefix: Prefix to add to the column names in the AnnData object. An underscore will be added between the prefix and the column + uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. alpha: The alpha value in the confidence intervals. 
fit_intercept: Whether to fit an intercept term in the model. @@ -673,7 +655,8 @@ def log_logistic_aft( # Add the results to the AnnData object if inplace: - _regression_model_populate_adata(adata, log_logistic_aft.summary, key_added_prefix) + summary = log_logistic_aft.summary + adata.uns[uns_key] = summary return log_logistic_aft From eb9dabaffdba670cceaa8f481cacc5a9a1f9cf29 Mon Sep 17 00:00:00 2001 From: eroell Date: Wed, 8 Jan 2025 16:29:55 +0100 Subject: [PATCH 07/25] try moving np --- ehrapy/tools/_sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index a2a1029f..b8f0b60c 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -3,6 +3,7 @@ import warnings from typing import TYPE_CHECKING, Literal +import numpy as np # noqa: TC002 import pandas as pd import statsmodels.api as sm import statsmodels.formula.api as smf @@ -22,7 +23,6 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numpy as np from anndata import AnnData from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper From e340a2848273411b4ae4e138194808dbf203de42 Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:48:40 +0100 Subject: [PATCH 08/25] omit inplace keyword --- ehrapy/tools/_sa.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index b8f0b60c..fbd72a8b 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -362,7 +362,6 @@ def cox_ph( adata: AnnData, duration_col: str, *, - inplace: bool = True, uns_key: str = "cox_ph", alpha: float = 0.05, label: str | None = None, @@ -396,7 +395,6 @@ def cox_ph( duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. event_col: The name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. - inplace: Whether to modify the AnnData object in place. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. label: A string to name the column of the estimate. @@ -455,10 +453,9 @@ def cox_ph( show_progress=show_progress, ) - # Add the results to the AnnData object - if inplace: - summary = cox_ph.summary - adata.uns[uns_key] = summary + # Save the summary to the uns slot + summary = cox_ph.summary + adata.uns[uns_key] = summary return cox_ph @@ -467,7 +464,6 @@ def weibull_aft( adata: AnnData, duration_col: str, *, - inplace: bool = True, uns_key: str = "weibull_aft", alpha: float = 0.05, fit_intercept: bool = True, @@ -495,7 +491,6 @@ def weibull_aft( Args: adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - inplace: Whether to modify the AnnData object in place. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. fit_intercept: Whether to fit an intercept term in the model. 
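With the uns_key mechanism introduced a few commits back (and kept here after dropping inplace), the fitted summary is always mirrored into adata.uns. A sketch of reading it back, continuing from the prepared `adata` above; the non-default key name is purely illustrative.

cph = ep.tl.cox_ph(adata, duration_col="mort_day_censored", event_col="censor_flg")
summary = adata.uns["cox_ph"]        # the same DataFrame as cph.summary
assert summary.equals(cph.summary)

# a non-default key keeps several fits side by side
ep.tl.cox_ph(
    adata,
    duration_col="mort_day_censored",
    event_col="censor_flg",
    uns_key="cox_ph_penalized",
    penalizer=0.1,
)
print(adata.uns["cox_ph_penalized"][["coef", "p"]])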
@@ -555,10 +550,9 @@ def weibull_aft( fit_options=fit_options, ) - # Add the results to the AnnData object - if inplace: - summary = weibull_aft.summary - adata.uns[uns_key] = summary + # Save the summary to the uns slot + summary = weibull_aft.summary + adata.uns[uns_key] = summary return weibull_aft @@ -567,7 +561,6 @@ def log_logistic_aft( adata: AnnData, duration_col: str, *, - inplace: bool = True, uns_key: str = "log_logistic_aft", alpha: float = 0.05, fit_intercept: bool = True, @@ -594,7 +587,6 @@ def log_logistic_aft( Args: adata: AnnData object with necessary columns `duration_col` and `event_col`. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - inplace: Whether to modify the AnnData object in place. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. alpha: The alpha value in the confidence intervals. @@ -653,10 +645,9 @@ def log_logistic_aft( fit_options=fit_options, ) - # Add the results to the AnnData object - if inplace: - summary = log_logistic_aft.summary - adata.uns[uns_key] = summary + # Save the summary to the uns slot + summary = log_logistic_aft.summary + adata.uns[uns_key] = summary return log_logistic_aft From c6a81df587386b03f8239f0e09ff3fd9f8d3cd2b Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:02:28 +0100 Subject: [PATCH 09/25] added explanation, as to where the results are stored --- ehrapy/tools/_sa.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index fbd72a8b..c7904c08 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -387,6 +387,7 @@ def cox_ph( The Cox proportional hazards model (CoxPH) examines the relationship between the survival time of subjects and one or more predictor variables. It models the hazard rate as a product of a baseline hazard function and an exponential function of the predictors, assuming proportional hazards over time. + The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. See https://lifelines.readthedocs.io/en/latest/fitters/regression/CoxPHFitter.html @@ -486,6 +487,8 @@ def weibull_aft( where the underlying assumption is that the logarithm of survival time follows a Weibull distribution. It models the survival time as an exponential function of the predictors, assuming a specific shape parameter for the distribution and allowing for accelerated or decelerated failure times based on the covariates. + The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. + See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html Args: @@ -582,6 +585,8 @@ def log_logistic_aft( This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times. By modeling survival time as a function of predictors, the Log-Logistic AFT model enables researchers to explore how specific factors influence the acceleration or deceleration of failure times, providing valuable insights into the underlying mechanisms driving event occurrence. + The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. 
+ See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html Args: From 38f4efb73301723be69e8197e650fef6a7353496 Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:03:20 +0100 Subject: [PATCH 10/25] corrected spelling --- ehrapy/tools/_sa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index c7904c08..fe33d1e7 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -487,7 +487,7 @@ def weibull_aft( where the underlying assumption is that the logarithm of survival time follows a Weibull distribution. It models the survival time as an exponential function of the predictors, assuming a specific shape parameter for the distribution and allowing for accelerated or decelerated failure times based on the covariates. - The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. + The results will be stored in the uns slot of the AnnData object under the key 'weibull_aft' unless specified otherwise in the uns_key parameter. See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html @@ -585,7 +585,7 @@ def log_logistic_aft( This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times. By modeling survival time as a function of predictors, the Log-Logistic AFT model enables researchers to explore how specific factors influence the acceleration or deceleration of failure times, providing valuable insights into the underlying mechanisms driving event occurrence. - The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. + The results will be stored in the uns slot of the AnnData object under the key 'log_logistic_aft' unless specified otherwise in the uns_key parameter. 
See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html From 501b864cd09fc76f060262bd1999a9cba2ce314c Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:17:11 +0100 Subject: [PATCH 11/25] updated tests to check for .uns ( should be removed later, when the univariates are updated ) --- tests/tools/test_sa.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py index e2bfec29..b2d1f6e5 100644 --- a/tests/tools/test_sa.py +++ b/tests/tools/test_sa.py @@ -84,15 +84,23 @@ def test_anova_glm(self): assert dataframe.iloc[1, 4] == 2 assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185 - def _sa_function_assert(self, model, model_class): + def _sa_function_assert(self, model, model_class, adata=None): assert isinstance(model, model_class) assert len(model.durations) == 1776 assert sum(model.event_observed) == 497 - def _sa_func_test(self, sa_function, sa_class, mimic_2_sa): + if adata is not None: + model_summary = adata.uns.get("test") + assert model_summary is not None + assert model_summary.equals(model.summary) + + def _sa_func_test(self, sa_function, sa_class, mimic_2_sa, regression=False): adata, duration_col, event_col = mimic_2_sa + if regression: + sa = sa_function(adata, duration_col=duration_col, event_col=event_col, uns_key="test") + else: + sa = sa_function(adata, duration_col=duration_col, event_col=event_col) - sa = sa_function(adata, duration_col=duration_col, event_col=event_col) self._sa_function_assert(sa, sa_class) def test_kmf(self, mimic_2_sa): From eb0b404b0c114c7f498d486290b952ca13df3d40 Mon Sep 17 00:00:00 2001 From: eroell Date: Wed, 8 Jan 2025 23:01:56 +0100 Subject: [PATCH 12/25] fix argument order, doc fixes --- ehrapy/tools/_sa.py | 55 +++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index fe33d1e7..f975075f 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -217,9 +217,11 @@ def kaplan_meier( https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. - duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in anndata that contains the subjects’ death observation. + adata: AnnData object. + duration_col: The name of the column in the AnnData object that contains the subjects’ lifetimes. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. timeline: Return the best estimate at the values in timelines (positively increasing) entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population entered study when they were "born". 
@@ -361,6 +363,7 @@ def _regression_model_data_frame_preparation(adata: AnnData, duration_col: str, def cox_ph( adata: AnnData, duration_col: str, + event_col: str = None, *, uns_key: str = "cox_ph", alpha: float = 0.05, @@ -372,7 +375,6 @@ def cox_ph( n_baseline_knots: int = 4, knots: list[float] | None = None, breakpoints: list[float] | None = None, - event_col: str = None, weights_col: str | None = None, cluster_col: str | None = None, entry_col: str = None, @@ -392,10 +394,11 @@ def cox_ph( See https://lifelines.readthedocs.io/en/latest/fitters/regression/CoxPHFitter.html Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. + adata: AnnData object. duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in anndata that contains the subjects’ death observation. - If left as None, assume all individuals are uncensored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. label: A string to name the column of the estimate. @@ -406,7 +409,6 @@ def cox_ph( n_baseline_knots: Used when baseline_estimation_method="spline". Set the number of knots (interior & exterior) in the baseline hazard, which will be placed evenly along the time axis. Should be at least 2. Royston et. al, the authors of this model, suggest 4 to start, but any values between 2 and 8 are reasonable. If you need to customize the timestamps used to calculate the curve, use the knots parameter instead. knots: When baseline_estimation_method="spline", this allows customizing the points in the time axis for the baseline hazard curve. To use evenly-spaced points in time, the n_baseline_knots parameter can be employed instead. breakpoints: Used when baseline_estimation_method="piecewise". Set the positions of the baseline hazard breakpoints. - event_col: he name of the column in DataFrame that contains the subjects’ death observation. If left as None, assume all individuals are uncensored. weights_col: The name of the column in DataFrame that contains the weights for each subject. cluster_col: The name of the column in DataFrame that contains the cluster variable. Using this forces the sandwich estimator (robust variance estimator) to be used. entry_col: Column denoting when a subject entered the study, i.e. left-truncation. @@ -454,7 +456,6 @@ def cox_ph( show_progress=show_progress, ) - # Save the summary to the uns slot summary = cox_ph.summary adata.uns[uns_key] = summary @@ -464,6 +465,7 @@ def cox_ph( def weibull_aft( adata: AnnData, duration_col: str, + event_col: str, *, uns_key: str = "weibull_aft", alpha: float = 0.05, @@ -471,7 +473,6 @@ def weibull_aft( penalizer: float | np.ndarray = 0.0, l1_ratio: float = 0.0, model_ancillary: bool = True, - event_col: str | None = None, ancillary: bool | pd.DataFrame | str | None = None, show_progress: bool = False, weights_col: str | None = None, @@ -492,16 +493,17 @@ def weibull_aft( See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. + adata: AnnData object. 
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. fit_intercept: Whether to fit an intercept term in the model. penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above. model_ancillary: set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization. - event_col: Name of the column in anndata that contains the subjects’ death observation. 1 if observed, 0 else (censored). - If left as None, assume all individuals are uncensored. ancillary: Choose to model the ancillary parameters. If None or False, explicitly do not fit the ancillary parameters using any covariates. If True, model the ancillary parameters with the same covariates as ``df``. @@ -553,7 +555,6 @@ def weibull_aft( fit_options=fit_options, ) - # Save the summary to the uns slot summary = weibull_aft.summary adata.uns[uns_key] = summary @@ -563,6 +564,7 @@ def weibull_aft( def log_logistic_aft( adata: AnnData, duration_col: str, + event_col: str | None = None, *, uns_key: str = "log_logistic_aft", alpha: float = 0.05, @@ -570,7 +572,6 @@ def log_logistic_aft( penalizer: float | np.ndarray = 0.0, l1_ratio: float = 0.0, model_ancillary: bool = False, - event_col: str | None = None, ancillary: bool | pd.DataFrame | str | None = None, show_progress: bool = False, weights_col: str | None = None, @@ -590,17 +591,17 @@ def log_logistic_aft( See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. + adata: AnnData object. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. alpha: The alpha value in the confidence intervals. - alpha: The alpha value in the confidence intervals. fit_intercept: Whether to fit an intercept term in the model. penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above. model_ancillary: set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization. - event_col: Name of the column in anndata that contains the subjects’ death observation. 1 if observed, 0 else (censored). - If left as None, assume all individuals are uncensored. 
ancillary: Choose to model the ancillary parameters. If None or False, explicitly do not fit the ancillary parameters using any covariates. If True, model the ancillary parameters with the same covariates as ``df``. @@ -650,7 +651,6 @@ def log_logistic_aft( fit_options=fit_options, ) - # Save the summary to the uns slot summary = log_logistic_aft.summary adata.uns[uns_key] = summary @@ -722,10 +722,11 @@ def nelson_aalen( See https://lifelines.readthedocs.io/en/latest/fitters/univariate/NelsonAalenFitter.html Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. + adata: AnnData object. duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in anndata that contains the subjects’ death observation. - If left as None, assume all individuals are uncensored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. timeline: Return the best estimate at the values in timelines (positively increasing) entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population entered study when they were "born". @@ -790,11 +791,11 @@ def weibull( See https://lifelines.readthedocs.io/en/latest/fitters/univariate/WeibullFitter.html Args: - adata: AnnData object with necessary columns `duration_col` and `event_col`. + adata: AnnData object. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: Name of the column in the AnnData object that contains the subjects’ death observation. - If left as None, assume all individuals are uncensored. - adata: AnnData object with necessary columns `duration_col` and `event_col`. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + Column values are `True` if the event was observed, `False` if the event was lost (right-censored). + If left `None`, all individuals are assumed to be uncensored. timeline: Return the best estimate at the values in timelines (positively increasing) entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations. If None, all members of the population entered study when they were "born". From cffed4ddc8f0338c423d3d8592c620bc2e0064a6 Mon Sep 17 00:00:00 2001 From: eroell Date: Wed, 8 Jan 2025 23:15:30 +0100 Subject: [PATCH 13/25] slightly simpler wording --- ehrapy/tools/_sa.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index f975075f..351231d8 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -219,7 +219,7 @@ def kaplan_meier( Args: adata: AnnData object. duration_col: The name of the column in the AnnData object that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. 
Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. timeline: Return the best estimate at the values in timelines (positively increasing) @@ -396,7 +396,7 @@ def cox_ph( Args: adata: AnnData object. duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. @@ -495,7 +495,7 @@ def weibull_aft( Args: adata: AnnData object. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. @@ -593,7 +593,7 @@ def log_logistic_aft( Args: adata: AnnData object. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. uns_key: The key to use for the uns slot in the AnnData object. @@ -724,7 +724,7 @@ def nelson_aalen( Args: adata: AnnData object. duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. timeline: Return the best estimate at the values in timelines (positively increasing) @@ -793,7 +793,7 @@ def weibull( Args: adata: AnnData object. duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes. - event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or the individual has been censored. + event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored. Column values are `True` if the event was observed, `False` if the event was lost (right-censored). If left `None`, all individuals are assumed to be uncensored. 
timeline: Return the best estimate at the values in timelines (positively increasing) From 3b2198863013482d916fd4d17381eb602fb6a81e Mon Sep 17 00:00:00 2001 From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:12:30 +0100 Subject: [PATCH 14/25] fiexed spelling Co-authored-by: Lukas Heumos --- ehrapy/tools/_sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index 351231d8..e144f137 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -607,7 +607,7 @@ def log_logistic_aft( If True, model the ancillary parameters with the same covariates as ``df``. If DataFrame, provide covariates to model the ancillary parameters. Must be the same row count as ``df``. If str, should be a formula - show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing. + show_progress: Since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing. weights_col: The name of the column in DataFrame that contains the weights for each subject. robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ. initial_point: set the starting point for the iterative solver. From 58ce15701618fa335a0d1375bf3957e77717431c Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:42:47 +0100 Subject: [PATCH 15/25] Update ehrapy/tools/_sa.py Co-authored-by: Lukas Heumos --- ehrapy/tools/_sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index e144f137..f759c692 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -389,7 +389,7 @@ def cox_ph( The Cox proportional hazards model (CoxPH) examines the relationship between the survival time of subjects and one or more predictor variables. It models the hazard rate as a product of a baseline hazard function and an exponential function of the predictors, assuming proportional hazards over time. - The results will be stored in the uns slot of the AnnData object under the key 'cox_ph' unless specified otherwise in the uns_key parameter. + The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'cox_ph' unless specified otherwise in the `uns_key` parameter. See https://lifelines.readthedocs.io/en/latest/fitters/regression/CoxPHFitter.html From ee97f31f53112db5e34c247e3fead95a1a4ac298 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:42:56 +0100 Subject: [PATCH 16/25] Update ehrapy/tools/_sa.py Co-authored-by: Lukas Heumos --- ehrapy/tools/_sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py index f759c692..6954da23 100644 --- a/ehrapy/tools/_sa.py +++ b/ehrapy/tools/_sa.py @@ -488,7 +488,7 @@ def weibull_aft( where the underlying assumption is that the logarithm of survival time follows a Weibull distribution. It models the survival time as an exponential function of the predictors, assuming a specific shape parameter for the distribution and allowing for accelerated or decelerated failure times based on the covariates. - The results will be stored in the uns slot of the AnnData object under the key 'weibull_aft' unless specified otherwise in the uns_key parameter. 
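One practical note on the stored table: lifelines indexes AFT summaries by (parameter, covariate), so the DataFrame saved under 'weibull_aft' can be sliced per parameter. A small sketch, again assuming the mimic_2 example columns prepared earlier:

aft = ep.tl.weibull_aft(adata, duration_col="mort_day_censored", event_col="censor_flg")
summary = adata.uns["weibull_aft"]
print(summary.loc["lambda_"])  # covariate effects on the scale parameter
print(summary.loc["rho_"])     # the shape parameter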
+    The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'weibull_aft' unless specified otherwise in the `uns_key` parameter.

     See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html

From 6dc783140468ee2bb497fec647e1fe3b8ae19851 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:43:06 +0100
Subject: [PATCH 17/25] Update ehrapy/tools/_sa.py

Co-authored-by: Lukas Heumos
---
 ehrapy/tools/_sa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index 6954da23..e50bc611 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -414,7 +414,7 @@ def cox_ph(
         entry_col: Column denoting when a subject entered the study, i.e. left-truncation.
         robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ.
         formula: an Wilkinson formula, like in R and statsmodels, for the right-hand-side. If left as None, all columns not assigned as durations, weights, etc. are used. Uses the library Formulaic for parsing.
-        batch_mode: enabling batch_mode can be faster for datasets with a large number of ties. If left as None, lifelines will choose the best option.
+        batch_mode: Enabling batch_mode can be faster for datasets with a large number of ties. If left as `None`, lifelines will choose the best option.
         show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
         initial_point: set the starting point for the iterative solver.
         fit_options: Additional keyword arguments to pass into the estimator.

From 540b79f9a3fa844debc5e2cacb76bf980fdcd67b Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:43:17 +0100
Subject: [PATCH 18/25] Update ehrapy/tools/_sa.py

Co-authored-by: Lukas Heumos
---
 ehrapy/tools/_sa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index e50bc611..a0ff0c40 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -401,7 +401,7 @@ def cox_ph(
             If left `None`, all individuals are assumed to be uncensored.
         uns_key: The key to use for the uns slot in the AnnData object.
         alpha: The alpha value in the confidence intervals.
-        label: A string to name the column of the estimate.
+        label: The name of the column of the estimate.
         baseline_estimation_method: The method used to estimate the baseline hazard. Options are 'breslow', 'spline', and 'piecewise'.
         penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates.
         l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above.

From 09484d99b51b9c7b9528fb832027c035e1d6e715 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:43:25 +0100
Subject: [PATCH 19/25] Update ehrapy/tools/_sa.py

Co-authored-by: Lukas Heumos
---
 ehrapy/tools/_sa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index a0ff0c40..2ceaf201 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -415,7 +415,7 @@ def cox_ph(
         robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle ties, so if there are high number of ties, results may significantly differ.
         formula: an Wilkinson formula, like in R and statsmodels, for the right-hand-side. If left as None, all columns not assigned as durations, weights, etc. are used. Uses the library Formulaic for parsing.
         batch_mode: Enabling batch_mode can be faster for datasets with a large number of ties. If left as `None`, lifelines will choose the best option.
-        show_progress: since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
+        show_progress: Since the fitter is iterative, show convergence diagnostics. Useful if convergence is failing.
         initial_point: set the starting point for the iterative solver.
         fit_options: Additional keyword arguments to pass into the estimator.

From 96db288537542a8556f25e8cfa2aa3a0736ca702 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:43:36 +0100
Subject: [PATCH 20/25] Update ehrapy/tools/_sa.py

Co-authored-by: Lukas Heumos
---
 ehrapy/tools/_sa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index 2ceaf201..c0157cbd 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -601,7 +601,7 @@ def log_logistic_aft(
         fit_intercept: Whether to fit an intercept term in the model.
         penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates.
         l1_ratio: Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See penalizer above.
-        model_ancillary: set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization.
+        model_ancillary: Set the model instance to always model the ancillary parameter with the supplied Dataframe. This is useful for grid-search optimization.
         ancillary: Choose to model the ancillary parameters.
             If None or False, explicitly do not fit the ancillary parameters using any covariates.
            If True, model the ancillary parameters with the same covariates as ``df``.

From 568b84b2ddcdcb87fbb5bff4bf07ea16c052458c Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:43:46 +0100
Subject: [PATCH 21/25] Update ehrapy/tools/_sa.py

Co-authored-by: Lukas Heumos
---
 ehrapy/tools/_sa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index c0157cbd..d7c996c7 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -586,7 +586,7 @@ def log_logistic_aft(
     This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times.
     By modeling survival time as a function of predictors, the Log-Logistic AFT model enables researchers to explore how specific factors influence the acceleration or deceleration of failure times, providing valuable insights into the underlying mechanisms driving event occurrence.

-    The results will be stored in the uns slot of the AnnData object under the key 'log_logistic_aft' unless specified otherwise in the uns_key parameter.
+    The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'log_logistic_aft' unless specified otherwise in the `uns_key` parameter.
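A minimal doctest-style sketch of the storage behaviour this note documents, assuming the default key name given above and the `mort_day_censored`/`censor_flg` columns used in the docstring examples later in this series:

>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> adata = adata[:, ["mort_day_censored", "censor_flg"]]
>>> llf = ep.tl.log_logistic_aft(adata, duration_col="mort_day_censored", event_col="censor_flg")
>>> adata.uns["log_logistic_aft"]  # fitted summary table, stored under the documented default key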
     See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html

From cf00a3fa25cb37903fed103ea11fbb0ce438a832 Mon Sep 17 00:00:00 2001
From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com>
Date: Fri, 10 Jan 2025 14:43:28 +0100
Subject: [PATCH 22/25] renamed function to be clearer

---
 ehrapy/tools/_sa.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index d7c996c7..f3203a44 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -349,7 +349,7 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
     return dataframe


-def _regression_model_data_frame_preparation(adata: AnnData, duration_col: str, accept_zero_duration=True):
+def _build_model_input_dataframe(adata: AnnData, duration_col: str, accept_zero_duration=True):
     """Convenience function for regression models."""
     df = anndata_to_df(adata)
     df = df.dropna()
@@ -429,7 +429,7 @@ def cox_ph(
         >>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
         >>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
     """
-    df = _regression_model_data_frame_preparation(adata, duration_col)
+    df = _build_model_input_dataframe(adata, duration_col)
     cox_ph = CoxPHFitter(
         alpha=alpha,
         label=label,
@@ -531,7 +531,7 @@ def weibull_aft(
         >>> aft.print_summary()
     """
-    df = _regression_model_data_frame_preparation(adata, duration_col, accept_zero_duration=False)
+    df = _build_model_input_dataframe(adata, duration_col, accept_zero_duration=False)

     weibull_aft = WeibullAFTFitter(
         alpha=alpha,
@@ -627,7 +627,7 @@ def log_logistic_aft(
         >>> adata = adata[:, ["mort_day_censored", "censor_flg"]]
         >>> llf = ep.tl.log_logistic_aft(adata, duration_col="mort_day_censored", event_col="censor_flg")
     """
-    df = _regression_model_data_frame_preparation(adata, duration_col, accept_zero_duration=False)
+    df = _build_model_input_dataframe(adata, duration_col, accept_zero_duration=False)

     log_logistic_aft = LogLogisticAFTFitter(
         alpha=alpha,
@@ -673,10 +673,7 @@ def _univariate_model(
     censoring: Literal["right", "left"] = "right",
 ):
     """Convenience function for univariate models."""
-    df = anndata_to_df(adata)
-
-    if not accept_zero_duration:
-        df.loc[df[duration_col] == 0, duration_col] += 1e-5
+    df = _build_model_input_dataframe(adata, duration_col, accept_zero_duration)

     T = df[duration_col]
     E = df[event_col]

From b574978640ae3ae335da98ec60cb1648f31ea579 Mon Sep 17 00:00:00 2001
From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:35:36 +0100
Subject: [PATCH 23/25] Add uns_key parameter to Kaplan-Meier, Nelson-Aalen,
 and Weibull functions for customizable storage in AnnData object

---
 ehrapy/tools/_sa.py    | 23 ++++++++++++++++++++++-
 tests/tools/test_sa.py |  9 +++------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index f3203a44..659c5b35 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -199,6 +199,7 @@ def kaplan_meier(
     duration_col: str,
     event_col: str | None = None,
     *,
+    uns_key: str = "kaplan_meier",
     timeline: list[float] | None = None,
     entry: str | None = None,
     label: str | None = None,
@@ -212,6 +213,7 @@ def kaplan_meier(
     The Kaplan–Meier estimator, also known as the product limit estimator, is a non-parametric statistic used to estimate the survival function from lifetime data.
     In medical research, it is often used to measure the fraction of patients living for a certain amount of time after treatment.
+    The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'kaplan_meier' unless specified otherwise in the `uns_key` parameter.

     See https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
     https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#module-lifelines.fitters.kaplan_meier_fitter
@@ -222,6 +224,7 @@ def kaplan_meier(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
+        uns_key: The key to use for the uns slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".
@@ -249,6 +252,7 @@ def kaplan_meier(
         duration_col,
         event_col,
         KaplanMeierFitter,
+        uns_key,
         True,
         timeline,
         entry,
@@ -662,6 +666,7 @@ def _univariate_model(
     duration_col: str,
     event_col: str,
     model_class,
+    uns_key: str,
     accept_zero_duration=True,
     timeline: list[float] | None = None,
     entry: str | None = None,
@@ -694,6 +699,14 @@ def _univariate_model(
         fit_options=fit_options,
     )

+    if isinstance(model, NelsonAalenFitter) or isinstance(
+        model, KaplanMeierFitter
+    ):  # NelsonAalenFitter and KaplanMeierFitter have no summary attribute
+        summary = model.event_table
+    else:
+        summary = model.summary
+    adata.uns[uns_key] = summary
+
     return model

@@ -702,6 +715,7 @@ def nelson_aalen(
     duration_col: str,
     event_col: str | None = None,
     *,
+    uns_key: str = "nelson_aalen",
     timeline: list[float] | None = None,
     entry: str | None = None,
     label: str | None = None,
@@ -716,6 +730,7 @@ def nelson_aalen(
     The Nelson-Aalen estimator is a non-parametric method used in survival analysis to estimate the cumulative hazard function.
     This technique is particularly useful when dealing with censored data, as it accounts for the presence of individuals whose event times are unknown due to censoring.
     By estimating the cumulative hazard function, the Nelson-Aalen estimator allows researchers to assess the risk of an event occurring over time, providing valuable insights into the underlying dynamics of the survival process.
+    The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'nelson_aalen' unless specified otherwise in the `uns_key` parameter.

     See https://lifelines.readthedocs.io/en/latest/fitters/univariate/NelsonAalenFitter.html

     Args:
         adata: AnnData object.
         duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes.
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
+        uns_key: The key to use for the uns slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".
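The new `uns_key` argument and the `event_table` fallback introduced above can be sketched as follows (the key name "na_test" is hypothetical; `kaplan_meier` and `nelson_aalen` store the lifelines event table because those fitters expose no `summary` attribute):

>>> import ehrapy as ep
>>> import numpy as np
>>> adata = ep.dt.mimic_2(encoded=False)
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> naf = ep.tl.nelson_aalen(adata, "mort_day_censored", "censor_flg", uns_key="na_test")
>>> adata.uns["na_test"]  # event table written by _univariate_model under the custom key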
@@ -752,7 +768,8 @@ def nelson_aalen(
         duration_col,
         event_col,
         NelsonAalenFitter,
-        True,
+        uns_key=uns_key,
+        accept_zero_duration=True,
         timeline=timeline,
         entry=entry,
         label=label,
@@ -769,6 +786,7 @@ def weibull(
     duration_col: str,
     event_col: str,
     *,
+    uns_key: str = "weibull",
     timeline: list[float] | None = None,
     entry: str | None = None,
     label: str | None = None,
@@ -785,6 +803,7 @@ def weibull(
     By fitting the Weibull model to censored survival data, researchers can estimate these parameters and gain insights into the hazard rate over time, facilitating comparisons between different groups or treatments.
     This method provides a comprehensive framework for examining survival data and offers valuable insights into the factors influencing event occurrence dynamics.
+    The results will be stored in the `.uns` slot of the :class:`AnnData` object under the key 'weibull' unless specified otherwise in the `uns_key` parameter.

     See https://lifelines.readthedocs.io/en/latest/fitters/univariate/WeibullFitter.html

     Args:
         adata: AnnData object.
         duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
+        uns_key: The key to use for the uns slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".
@@ -818,6 +838,7 @@ def weibull(
         duration_col,
         event_col,
         WeibullFitter,
+        uns_key=uns_key,
         accept_zero_duration=False,
         timeline=timeline,
         entry=entry,
diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py
index b2d1f6e5..6f74ee48 100644
--- a/tests/tools/test_sa.py
+++ b/tests/tools/test_sa.py
@@ -89,17 +89,14 @@ def _sa_function_assert(self, model, model_class, adata=None):
         assert len(model.durations) == 1776
         assert sum(model.event_observed) == 497

-        if adata is not None:
+        if adata is not None:  # doing it this way, due to legacy kmf function
             model_summary = adata.uns.get("test")
             assert model_summary is not None
             assert model_summary.equals(model.summary)

-    def _sa_func_test(self, sa_function, sa_class, mimic_2_sa, regression=False):
+    def _sa_func_test(self, sa_function, sa_class, mimic_2_sa):
         adata, duration_col, event_col = mimic_2_sa
-        if regression:
-            sa = sa_function(adata, duration_col=duration_col, event_col=event_col, uns_key="test")
-        else:
-            sa = sa_function(adata, duration_col=duration_col, event_col=event_col)
+        sa = sa_function(adata, duration_col=duration_col, event_col=event_col, uns_key="test")

         self._sa_function_assert(sa, sa_class)

     def test_kmf(self, mimic_2_sa):
         # check for deprecation warning

From ace1baf974ab3888484ac9cfab5bd598dd7eac11 Mon Sep 17 00:00:00 2001
From: Carl Buchholz <32228189+aGuyLearning@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:38:27 +0100
Subject: [PATCH 24/25] Update test assertions in TestSA for event_table
 handling and pass adata to assertion method

---
 tests/tools/test_sa.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/tools/test_sa.py b/tests/tools/test_sa.py
index 6f74ee48..a383bfc0 100644
--- a/tests/tools/test_sa.py
+++ b/tests/tools/test_sa.py
@@ -92,13 +92,18 @@ def _sa_function_assert(self, model, model_class, adata=None):
         if adata is not None:  # doing it this way, due to legacy kmf function
             model_summary = adata.uns.get("test")
             assert model_summary is not None
-            assert model_summary.equals(model.summary)
+            if isinstance(model, KaplanMeierFitter) or isinstance(
+                model, NelsonAalenFitter
+            ):  # kmf and nelson_aalen have event_table
+                assert model_summary.equals(model.event_table)
+            else:
+                assert model_summary.equals(model.summary)

     def _sa_func_test(self, sa_function, sa_class, mimic_2_sa):
         adata, duration_col, event_col = mimic_2_sa
         sa = sa_function(adata, duration_col=duration_col, event_col=event_col, uns_key="test")
-        self._sa_function_assert(sa, sa_class)
+        self._sa_function_assert(sa, sa_class, adata)

     def test_kmf(self, mimic_2_sa):
         # check for deprecation warning

From 3de5678bc4f86dcf51e8d97b924718ad466e449d Mon Sep 17 00:00:00 2001
From: eroell
Date: Fri, 10 Jan 2025 16:22:47 +0100
Subject: [PATCH 25/25] uns to in doc

---
 ehrapy/tools/_sa.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index 659c5b35..e436e73d 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -224,7 +224,7 @@ def kaplan_meier(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".
@@ -403,7 +403,7 @@ def cox_ph(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         alpha: The alpha value in the confidence intervals.
         label: The name of the column of the estimate.
         baseline_estimation_method: The method used to estimate the baseline hazard. Options are 'breslow', 'spline', and 'piecewise'.
         penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates.
@@ -502,7 +502,7 @@ def weibull_aft(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         alpha: The alpha value in the confidence intervals.
         fit_intercept: Whether to fit an intercept term in the model.
         penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates.
@@ -600,7 +600,7 @@ def log_logistic_aft(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                   If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         alpha: The alpha value in the confidence intervals.
         fit_intercept: Whether to fit an intercept term in the model.
         penalizer: Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates.
@@ -739,7 +739,7 @@ def nelson_aalen(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".
@@ -812,7 +812,7 @@ def weibull(
         event_col: The name of the column in the AnnData object that specifies whether the event has been observed, or censored.
                    Column values are `True` if the event was observed, `False` if the event was lost (right-censored).
                    If left `None`, all individuals are assumed to be uncensored.
-        uns_key: The key to use for the uns slot in the AnnData object.
+        uns_key: The key to use for the `.uns` slot in the AnnData object.
         timeline: Return the best estimate at the values in timelines (positively increasing)
         entry: Relative time when a subject entered the study. This is useful for left-truncated (not left-censored) observations.
             If None, all members of the population entered study when they were "born".