Skip to content

Commit 00635ee

Browse files
igerberclaude
andcommitted
Fix EPV denominator: use predictor count excluding intercept (Peduzzi convention)
Peduzzi et al. (1996) define EPV using independent predictor variables, not including the intercept. Change denominator from k_solve (which includes the intercept column) to n_predictors = k_solve - 1. Also fix TripleDifference fallback warning to use correct API keyword estimation_method (not est_method). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f48c65b commit 00635ee

5 files changed

Lines changed: 16 additions & 13 deletions

File tree

diff_diff/linalg.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,20 +1301,23 @@ def solve_logit(
13011301
n_pos_y = int(np.sum(y_eff))
13021302
n_neg_y = n_eff - n_pos_y
13031303
n_events = min(n_pos_y, n_neg_y)
1304-
epv = n_events / k_solve if k_solve > 0 else float("inf")
1304+
# Peduzzi et al. (1996) define EPV using predictor variables, excluding
1305+
# the intercept. k_solve includes the intercept column, so use k_solve - 1.
1306+
n_predictors = k_solve - 1 # exclude intercept
1307+
epv = n_events / n_predictors if n_predictors > 0 else float("inf")
13051308

13061309
if diagnostics_out is not None:
13071310
diagnostics_out["epv"] = epv
13081311
diagnostics_out["n_events"] = n_events
1309-
diagnostics_out["k"] = k_solve
1312+
diagnostics_out["k"] = n_predictors
13101313
diagnostics_out["is_low"] = epv < epv_threshold
13111314

13121315
if epv < epv_threshold:
13131316
ctx = f" for {context_label}" if context_label else ""
13141317
msg = (
13151318
f"Low Events Per Variable (EPV = {epv:.1f}) in propensity score "
13161319
f"model{ctx}. {n_events} minority-class observations for "
1317-
f"{k_solve} parameters (including intercept). "
1320+
f"{n_predictors} predictor variable(s). "
13181321
f"Peduzzi et al. (1996) recommend EPV >= {epv_threshold:.0f}. "
13191322
f"Estimates may be unreliable (overfitting, biased coefficients, "
13201323
f"inflated standard errors). "

diff_diff/staggered.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ def diagnose_propensity(
456456
never_treated_mask = precomputed["never_treated_mask"]
457457
unit_cohorts = precomputed["unit_cohorts"]
458458
n_covariates = len(covariates)
459-
n_params = n_covariates + 1 # +1 for intercept
459+
n_params = n_covariates # predictor count, excluding intercept (Peduzzi convention)
460460

461461
rows = []
462462
for g in sorted(cohort_masks.keys()):

diff_diff/triple_diff.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1041,7 +1041,7 @@ def _estimate_ddd_decomposition(
10411041
f"Propensity score estimation failed for subgroup "
10421042
f"{j} vs 4; dropping covariates and using "
10431043
f"unconditional probability. "
1044-
f"Consider est_method='reg' to avoid propensity "
1044+
f"Consider estimation_method='reg' to avoid propensity "
10451045
f"scores entirely.",
10461046
UserWarning,
10471047
stacklevel=3,

docs/methodology/REGISTRY.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1:
405405
- Trimming: Propensity scores clipped to `[pscore_trim, 1-pscore_trim]` (default
406406
0.01) before weight computation. Warning emitted when scores are trimmed.
407407
- **Events Per Variable (EPV) diagnostics:** Per-cohort EPV =
408-
min(n_treated, n_control) / (n_covariates + 1) checked before IRLS.
408+
min(n_treated, n_control) / n_covariates checked before IRLS.
409409
Default threshold: 10 (Peduzzi et al. 1996). Warns when EPV < threshold;
410410
errors when `rank_deficient_action="error"`. Pre-estimation check via
411411
`diagnose_propensity()`. Results stored in `results.epv_diagnostics`.
@@ -1247,7 +1247,7 @@ has no additional effect.
12471247
function), emits UserWarning. When `rank_deficient_action="error"`, errors
12481248
are always re-raised regardless of `pscore_fallback`.
12491249
- **Events Per Variable (EPV) diagnostics:** Per-logit EPV =
1250-
min(n_subgroup_j, n_subgroup_4) / (n_covariates + 1) checked before IRLS.
1250+
min(n_subgroup_j, n_subgroup_4) / n_covariates checked before IRLS.
12511251
Default threshold: 10 (Peduzzi et al. 1996). Warns when EPV < threshold;
12521252
errors when `rank_deficient_action="error"`.
12531253
- **Note:** `pscore_fallback` default changed from unconditional to error.
@@ -1295,7 +1295,7 @@ has no additional effect.
12951295
- Warns if no valid comparison groups exist for a (g, t) pair (skips that pair)
12961296
- Propensity score overlap enforced by clipping at `pscore_trim` (default 0.01)
12971297
- **Events Per Variable (EPV) diagnostics:** Per-DiD EPV =
1298-
min(n_subgroup_j, n_subgroup_4) / (n_covariates + 1) checked before IRLS.
1298+
min(n_subgroup_j, n_subgroup_4) / n_covariates checked before IRLS.
12991299
Default threshold: 10 (Peduzzi et al. 1996). Warns when EPV < threshold;
13001300
errors when `rank_deficient_action="error"`.
13011301
- **Note:** When multiple comparison cohorts `g_c` contribute to the same

tests/test_linalg.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1772,8 +1772,8 @@ def test_epv_diagnostics_out_populated(self):
17721772
assert "k" in diag
17731773
assert "is_low" in diag
17741774
assert diag["n_events"] == 20 # minority class
1775-
assert diag["k"] == 5 # 4 covariates + intercept
1776-
assert abs(diag["epv"] - 4.0) < 0.01
1775+
assert diag["k"] == 4 # 4 predictor variables (excluding intercept)
1776+
assert abs(diag["epv"] - 5.0) < 0.01 # 20 events / 4 predictors
17771777
assert diag["is_low"] is True
17781778

17791779
def test_epv_uses_post_drop_k(self):
@@ -1794,9 +1794,9 @@ def test_epv_uses_post_drop_k(self):
17941794
solve_logit(X, y, diagnostics_out=diag, rank_deficient_action="silent")
17951795

17961796
# Should be 3 params (2 kept covariates + intercept), not 4
1797-
assert diag["k"] == 3
1797+
assert diag["k"] == 2 # 2 kept predictor variables (excluding intercept)
17981798
assert diag["n_events"] == 30
1799-
assert abs(diag["epv"] - 10.0) < 0.01
1799+
assert abs(diag["epv"] - 15.0) < 0.01 # 30 events / 2 predictors
18001800

18011801
def test_epv_uses_positive_weight_sample(self):
18021802
"""EPV computed on positive-weight sample, not padded rows."""
@@ -1828,7 +1828,7 @@ def test_epv_uses_positive_weight_sample(self):
18281828

18291829
# EPV should reflect the 10-event effective sample, not 260
18301830
assert diag["n_events"] == 10 # min(10, 190) from real sample
1831-
assert diag["epv"] == 10 / 5 # 10 events / 5 params = 2.0
1831+
assert diag["epv"] == 10 / 4 # 10 events / 4 predictors = 2.5
18321832
assert diag["is_low"] is True
18331833

18341834

0 commit comments

Comments
 (0)