@@ -1122,6 +1122,7 @@ def _compute_robust_vcov_numpy(
11221122# in the logistic model (predicted probabilities collapse to 0/1).
11231123_LOGIT_SEPARATION_COEF_THRESHOLD = 10
11241124_LOGIT_SEPARATION_PROB_THRESHOLD = 1e-5
1125+ _DEFAULT_EPV_THRESHOLD = 10
11251126
11261127
11271128def solve_logit (
@@ -1132,6 +1133,9 @@ def solve_logit(
11321133 check_separation : bool = True ,
11331134 rank_deficient_action : str = "warn" ,
11341135 weights : Optional [np .ndarray ] = None ,
1136+ epv_threshold : float = _DEFAULT_EPV_THRESHOLD ,
1137+ context_label : str = "" ,
1138+ diagnostics_out : Optional [dict ] = None ,
11351139) -> Tuple [np .ndarray , np .ndarray ]:
11361140 """
11371141 Fit logistic regression via IRLS (Fisher scoring).
@@ -1164,6 +1168,18 @@ def solve_logit(
11641168 maximum likelihood estimator, matching R's ``svyglm(family=binomial)``.
11651169 When None (default), behavior is identical to unweighted logistic
11661170 regression.
1171+ epv_threshold : float, default 10
1172+ Events Per Variable threshold. When the ratio of minority-class
1173+ observations to predictor variables (excluding intercept) falls
1174+ below this value, a warning is
1175+ emitted (or ValueError raised if ``rank_deficient_action="error"``).
1176+ Based on Peduzzi et al. (1996).
1177+ context_label : str, default ""
1178+ Optional label for warning messages (e.g., "cohort g=4") to help
1179+ users identify which logit estimation triggered the warning.
1180+ diagnostics_out : dict, optional
1181+ If provided, populated with EPV diagnostic info:
1182+ ``{"epv": float, "n_events": int, "k": int, "is_low": bool}``.
11671183
11681184 Returns
11691185 -------
@@ -1273,6 +1289,45 @@ def solve_logit(
12731289 kept_cols = np .arange (k )
12741290 X_solve = X_with_intercept
12751291
1292+ # Events Per Variable (EPV) check — Peduzzi et al. (1996)
1293+ # Use effective (positive-weight) sample when weights have zeros,
1294+ # since zero-weight rows don't contribute to the likelihood.
1295+ k_solve = X_solve .shape [1 ]
1296+ if weights is not None and np .any (weights == 0 ):
1297+ y_eff = y [weights > 0 ]
1298+ n_eff = len (y_eff )
1299+ else :
1300+ y_eff = y
1301+ n_eff = n
1302+ n_pos_y = int (np .sum (y_eff ))
1303+ n_neg_y = n_eff - n_pos_y
1304+ n_events = min (n_pos_y , n_neg_y )
1305+ # Peduzzi et al. (1996) define EPV using predictor variables, excluding
1306+ # the intercept. k_solve includes the intercept column, so use k_solve - 1.
1307+ n_predictors = k_solve - 1 # exclude intercept
1308+ epv = n_events / n_predictors if n_predictors > 0 else float ("inf" )
1309+
1310+ if diagnostics_out is not None :
1311+ diagnostics_out ["epv" ] = epv
1312+ diagnostics_out ["n_events" ] = n_events
1313+ diagnostics_out ["k" ] = n_predictors
1314+ diagnostics_out ["is_low" ] = epv < epv_threshold
1315+
1316+ if epv < epv_threshold :
1317+ ctx = f" for { context_label } " if context_label else ""
1318+ msg = (
1319+ f"Low Events Per Variable (EPV = { epv :.1f} ) in propensity score "
1320+ f"model{ ctx } . { n_events } minority-class observations for "
1321+ f"{ n_predictors } predictor variable(s). "
1322+ f"Peduzzi et al. (1996) recommend EPV >= { epv_threshold :.0f} . "
1323+ f"Estimates may be unreliable (overfitting, biased coefficients, "
1324+ f"inflated standard errors). "
1325+ f"Consider estimation_method='reg' to avoid propensity scores."
1326+ )
1327+ if rank_deficient_action == "error" :
1328+ raise ValueError (msg )
1329+ warnings .warn (msg , UserWarning , stacklevel = 2 )
1330+
12761331 # IRLS (Fisher scoring)
12771332 beta_solve = np .zeros (X_solve .shape [1 ])
12781333 converged = False
0 commit comments