Skip to content

Commit e1cd6f5

Browse files
authored
Merge pull request #251 from igerber/logit-assessment
Add EPV diagnostics for propensity score logit
2 parents 19bad25 + 67bc6db commit e1cd6f5

11 files changed

Lines changed: 1476 additions & 206 deletions

diff_diff/linalg.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,7 @@ def _compute_robust_vcov_numpy(
11221122
# in the logistic model (predicted probabilities collapse to 0/1).
11231123
_LOGIT_SEPARATION_COEF_THRESHOLD = 10
11241124
_LOGIT_SEPARATION_PROB_THRESHOLD = 1e-5
1125+
_DEFAULT_EPV_THRESHOLD = 10
11251126

11261127

11271128
def solve_logit(
@@ -1132,6 +1133,9 @@ def solve_logit(
11321133
check_separation: bool = True,
11331134
rank_deficient_action: str = "warn",
11341135
weights: Optional[np.ndarray] = None,
1136+
epv_threshold: float = _DEFAULT_EPV_THRESHOLD,
1137+
context_label: str = "",
1138+
diagnostics_out: Optional[dict] = None,
11351139
) -> Tuple[np.ndarray, np.ndarray]:
11361140
"""
11371141
Fit logistic regression via IRLS (Fisher scoring).
@@ -1164,6 +1168,18 @@ def solve_logit(
11641168
maximum likelihood estimator, matching R's ``svyglm(family=binomial)``.
11651169
When None (default), behavior is identical to unweighted logistic
11661170
regression.
1171+
epv_threshold : float, default 10
1172+
Events Per Variable threshold. When the ratio of minority-class
1173+
observations to predictor variables (excluding intercept) falls
1174+
below this value, a warning is
1175+
emitted (or ValueError raised if ``rank_deficient_action="error"``).
1176+
Based on Peduzzi et al. (1996).
1177+
context_label : str, default ""
1178+
Optional label for warning messages (e.g., "cohort g=4") to help
1179+
users identify which logit estimation triggered the warning.
1180+
diagnostics_out : dict, optional
1181+
If provided, populated with EPV diagnostic info:
1182+
``{"epv": float, "n_events": int, "k": int, "is_low": bool}``.
11671183
11681184
Returns
11691185
-------
@@ -1273,6 +1289,45 @@ def solve_logit(
12731289
kept_cols = np.arange(k)
12741290
X_solve = X_with_intercept
12751291

1292+
# Events Per Variable (EPV) check — Peduzzi et al. (1996)
1293+
# Use effective (positive-weight) sample when weights have zeros,
1294+
# since zero-weight rows don't contribute to the likelihood.
1295+
k_solve = X_solve.shape[1]
1296+
if weights is not None and np.any(weights == 0):
1297+
y_eff = y[weights > 0]
1298+
n_eff = len(y_eff)
1299+
else:
1300+
y_eff = y
1301+
n_eff = n
1302+
n_pos_y = int(np.sum(y_eff))
1303+
n_neg_y = n_eff - n_pos_y
1304+
n_events = min(n_pos_y, n_neg_y)
1305+
# Peduzzi et al. (1996) define EPV using predictor variables, excluding
1306+
# the intercept. k_solve includes the intercept column, so use k_solve - 1.
1307+
n_predictors = k_solve - 1 # exclude intercept
1308+
epv = n_events / n_predictors if n_predictors > 0 else float("inf")
1309+
1310+
if diagnostics_out is not None:
1311+
diagnostics_out["epv"] = epv
1312+
diagnostics_out["n_events"] = n_events
1313+
diagnostics_out["k"] = n_predictors
1314+
diagnostics_out["is_low"] = epv < epv_threshold
1315+
1316+
if epv < epv_threshold:
1317+
ctx = f" for {context_label}" if context_label else ""
1318+
msg = (
1319+
f"Low Events Per Variable (EPV = {epv:.1f}) in propensity score "
1320+
f"model{ctx}. {n_events} minority-class observations for "
1321+
f"{n_predictors} predictor variable(s). "
1322+
f"Peduzzi et al. (1996) recommend EPV >= {epv_threshold:.0f}. "
1323+
f"Estimates may be unreliable (overfitting, biased coefficients, "
1324+
f"inflated standard errors). "
1325+
f"Consider estimation_method='reg' to avoid propensity scores."
1326+
)
1327+
if rank_deficient_action == "error":
1328+
raise ValueError(msg)
1329+
warnings.warn(msg, UserWarning, stacklevel=2)
1330+
12761331
# IRLS (Fisher scoring)
12771332
beta_solve = np.zeros(X_solve.shape[1])
12781333
converged = False

0 commit comments

Comments
 (0)