From fa8be27b9dce9a3952d3e1e6d1b236ada13d3094 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 12:23:26 -0400 Subject: [PATCH 1/6] =?UTF-8?q?efficient=5Fdid:=20align=20REGISTRY=20note?= =?UTF-8?q?=20with=20last=5Fcohort=20=C3=97=20anticipation=20trim?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `control_group="last_cohort"` path in EfficientDiD.fit (line 470) trims periods at `last_g - anticipation`, excluding anticipation-contaminated periods from the pseudo-control's pre-treatment window. REGISTRY.md previously described only the `anticipation=0` case. Aligns both the Edge Cases bullet and the algorithm Note to the code's actual behavior, cross-references the interaction in the `control_group` and `anticipation` docstring entries, and adds a regression test asserting the trim cuts at `last_g - anticipation` rather than `last_g`. Closes Tier-A backlog item 2. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/efficient_did.py | 12 ++++++--- docs/methodology/REGISTRY.md | 4 +-- tests/test_efficient_did.py | 51 +++++++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py index 4dcb93e42..1f3e48cac 100644 --- a/diff_diff/efficient_did.py +++ b/diff_diff/efficient_did.py @@ -162,9 +162,11 @@ class EfficientDiD(EfficientDiDBootstrapMixin): Which units serve as the comparison group: ``"never_treated"`` requires a never-treated cohort (raises if none exist); ``"last_cohort"`` reclassifies the latest treatment - cohort as pseudo-never-treated and drops post-treatment periods - for that cohort. Distinct from CallawaySantAnna's - ``"not_yet_treated"`` — see REGISTRY.md for details. + cohort as pseudo-never-treated and drops periods at + ``t >= last_g - anticipation`` so the pseudo-control's + pre-treatment window excludes anticipation-contaminated periods. + Distinct from CallawaySantAnna's ``"not_yet_treated"`` — see + REGISTRY.md for details. n_bootstrap : int, default 0 Number of multiplier bootstrap iterations (0 = analytical only). bootstrap_weights : str, default ``"rademacher"`` @@ -173,7 +175,9 @@ class EfficientDiD(EfficientDiDBootstrapMixin): Random seed for reproducibility. anticipation : int, default 0 Number of anticipation periods (shifts the effective treatment - boundary forward by this amount). + boundary forward by this amount). When combined with + ``control_group="last_cohort"``, also trims the pseudo-control + period set at ``t >= last_g - anticipation`` (see REGISTRY.md). sieve_k_max : int or None Maximum polynomial degree for sieve ratio estimation. None = auto (``min(floor(n_gp^{1/5}), 5)``). Only used with covariates. diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index d8bc1d0ed..4e1495789 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -877,7 +877,7 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`. - **Near-zero propensity scores**: Ratio `p_g(X)/p_{g'}(X)` explodes. Overlap assumption (O) rules this out in population; implement trimming or warn on finite-sample instability - **Note:** When no sieve degree K succeeds for ratio estimation (basis dimension exceeds comparison group size, or all linear systems are singular), the estimator falls back to a constant ratio of 1 for all units with a UserWarning. The outcome regression adjustment remains active, so the generated outcomes (Eq 4.4) still incorporate covariate information via the m_hat terms. The DR property ensures consistency as long as the outcome regression is correctly specified. - **Note:** When no sieve degree K succeeds for inverse propensity estimation (algorithm step 4), the estimator falls back to unconditional n/n_group scaling with a UserWarning, which reduces to the unconditional Omega* approximation for the affected group. -- **All units eventually treated**: Last cohort serves as "never-treated" by dropping time periods from the last cohort's treatment onset onward. Use `control_group="last_cohort"` to enable; default `"never_treated"` raises ValueError if no never-treated units exist +- **All units eventually treated**: Last cohort serves as "never-treated" by dropping time periods from `last_g - anticipation` onward (where `last_g` is the latest treatment cohort). This excludes anticipation-contaminated periods from the pseudo-control's pre-treatment window. With `anticipation=0` (default), this equals `last_g`. Use `control_group="last_cohort"` to enable; default `"never_treated"` raises ValueError if no never-treated units exist - **Negative weights**: Explicitly stated as harmless for bias and beneficial for precision; arise from efficiency optimization under overidentification (Section 5.2) - **PT-Post regime (just-identified)**: Under PT-Post, EDiD automatically reduces to standard single-baseline estimator (Corollary 3.2). No downside to using EDiD -- it subsumes standard estimators - **Duplicate rows**: Duplicate `(unit, time)` entries are rejected with `ValueError`. The estimator requires exactly one observation per unit-period @@ -936,7 +936,7 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`. - **Note:** EfficientDiD covariates (DR path) with survey weights supported — WLS outcome regression, weighted sieve normal equations for propensity ratios/inverse propensities, survey-weighted Nadaraya-Watson kernel for conditional Omega*(X), and survey-weighted ATT averaging. Silverman bandwidth uses unweighted statistics (survey-weighted bandwidth deferred as second-order refinement). - **Note:** Cluster-robust SEs use the standard Liang-Zeger clustered sandwich estimator applied to EIF values: aggregate EIF within clusters, center, and compute variance with G/(G-1) small-sample correction. Cluster bootstrap generates multiplier weights at the cluster level (all units in a cluster share the same weight). Analytical clustered SEs are the default when `cluster` is set; cluster bootstrap is opt-in via `n_bootstrap > 0`. - **Note:** Hausman pretest operates on the post-treatment event-study vector ES(e) per Theorem A.1. Both PT-All and PT-Post fits are aggregated to ES(e) using cohort-size weights before computing the test statistic H = delta' V^{-1} delta where delta = ES_post - ES_all and V = Cov(ES_post) - Cov(ES_all). Covariance is computed from aggregated ES(e)-level EIF values. The variance-difference matrix V is inverted via Moore-Penrose pseudoinverse to handle finite-sample non-positive-definiteness. Effective rank of V (number of positive eigenvalues) is used as degrees of freedom. -- **Note:** Last-cohort-as-control (`control_group="last_cohort"`) reclassifies the latest treatment cohort as pseudo-never-treated and drops time periods at/after that cohort's treatment start. This is distinct from CallawaySantAnna's `not_yet_treated` option which dynamically selects not-yet-treated units per (g,t) pair. +- **Note:** Last-cohort-as-control (`control_group="last_cohort"`) reclassifies the latest treatment cohort as pseudo-never-treated and drops time periods at `t >= last_g - anticipation`, excluding anticipation-contaminated periods from the pseudo-control's pre-treatment window. This is distinct from CallawaySantAnna's `not_yet_treated` option which dynamically selects not-yet-treated units per (g,t) pair. --- diff --git a/tests/test_efficient_did.py b/tests/test_efficient_did.py index ddce55a63..3bd192fbc 100644 --- a/tests/test_efficient_did.py +++ b/tests/test_efficient_did.py @@ -1125,6 +1125,39 @@ def test_last_cohort_no_treated_raises(self): with pytest.raises(ValueError, match="No treated cohorts"): EfficientDiD(control_group="last_cohort").fit(df, "y", "unit", "time", "first_treat") + def test_last_cohort_with_anticipation_trims_at_last_g_minus_anticipation(self): + """last_cohort + anticipation>0 trims at `last_g - anticipation`, not `last_g`. + + Regression guard for PR #230 deferral: the code at efficient_did.py:470 uses + `effective_last = last_g - self.anticipation` so anticipation-contaminated periods + are excluded from the pseudo-control's pre-treatment window. If a future change + reverts to `t < last_g`, this test will catch it by checking the trimmed + `time_periods` set exposed on EfficientDiDResults. + """ + df = _make_staggered_panel( + n_per_group=60, + n_control=0, + groups=(3, 5, 7), + effects={3: 2.0, 5: 1.5, 7: 1.0}, + ) + # _make_staggered_panel default n_periods=7, last_g=7, times 1..7. + # anticipation=0: effective_last=7, time_periods=[1..6] + # anticipation=1: effective_last=6, time_periods=[1..5] + result_a0 = EfficientDiD( + pt_assumption="all", control_group="last_cohort", anticipation=0 + ).fit(df, "y", "unit", "time", "first_treat") + result_a1 = EfficientDiD( + pt_assumption="all", control_group="last_cohort", anticipation=1 + ).fit(df, "y", "unit", "time", "first_treat") + + assert max(result_a0.time_periods) == 6 + assert max(result_a1.time_periods) == 5 + assert len(result_a1.time_periods) == len(result_a0.time_periods) - 1 + assert np.isfinite(result_a0.overall_att) + assert np.isfinite(result_a1.overall_att) + assert 7 not in result_a0.groups + assert 7 not in result_a1.groups + def test_last_cohort_aggregate_event_study(self): """last_cohort with aggregate='event_study' should produce finite results.""" df = _make_staggered_panel( @@ -2084,9 +2117,7 @@ def test_ratio_sieve_partial_skip_warns(self): with pytest.warns(UserWarning) as caught: ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3) assert np.all(np.isfinite(ratio)) - partial_skip_msgs = [ - str(w.message) for w in caught if "skipped K=" in str(w.message) - ] + partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)] assert partial_skip_msgs, ( "Expected a partial-K-skip warning when some K's are rank deficient " "but at least one succeeds; got none." @@ -2106,9 +2137,7 @@ def test_inverse_propensity_sieve_partial_skip_warns(self): with pytest.warns(UserWarning) as caught: s_hat = estimate_inverse_propensity_sieve(X, mask, k_max=3) assert np.all(np.isfinite(s_hat)) - partial_skip_msgs = [ - str(w.message) for w in caught if "skipped K=" in str(w.message) - ] + partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)] assert partial_skip_msgs def test_ratio_sieve_no_warning_when_no_skips(self): @@ -2128,9 +2157,7 @@ def test_ratio_sieve_no_warning_when_no_skips(self): _w.simplefilter("always") ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3) assert np.all(np.isfinite(ratio)) - partial_skip_msgs = [ - str(w.message) for w in caught if "skipped K=" in str(w.message) - ] - assert partial_skip_msgs == [], ( - f"Unexpected partial-skip warning on clean data: {partial_skip_msgs}" - ) + partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)] + assert ( + partial_skip_msgs == [] + ), f"Unexpected partial-skip warning on clean data: {partial_skip_msgs}" From 77fe4daa05e3dabc0ea0acb19e2295bc8cc5b0a2 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 12:23:37 -0400 Subject: [PATCH 2/6] prep_dgp: add generate_ddd_panel_data for panel DDD power analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new public DGP function that generates a balanced panel of n_units observed over n_periods with two unit-level binary dimensions (`group`, `partition`, time-invariant) and a derived `post` indicator. The DDD-CPT identifying assumption holds because `group_partition_interaction` enters only as a unit-level (time-invariant) effect, leaving the triple interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. The existing cross-sectional `generate_ddd_data` remains unchanged. Compatible with `TripleDifference.fit(..., time="post")` directly; the binary 2×2×2 estimator surface is unchanged. Auto-routing of `power.simulate_power` to the panel DGP for `n_periods > 2` is deferred to a follow-up (TODO.md row added). Exports: top-level `diff_diff` and `diff_diff.prep` re-export; new autofunction stub in docs/api/prep.rst. Tests in tests/test_prep.py::TestGenerateDddPanelData (14 tests) including a deterministic recovery test (noise_sd=0, ATT recovery to ~1e-15) and a finite-sample recovery test. Closes Tier-A backlog item 3. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + diff_diff/__init__.py | 2 + diff_diff/prep.py | 1 + diff_diff/prep_dgp.py | 207 ++++++++++++++++++++++++++++- docs/api/prep.rst | 7 + tests/test_prep.py | 296 ++++++++++++++++++++++++++++++++++-------- 6 files changed, 459 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 964e35b9b..19003f9d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 14 tests). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction. - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422. diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index cee940432..a0f2a38ab 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -125,6 +125,7 @@ generate_continuous_did_data, generate_did_data, generate_ddd_data, + generate_ddd_panel_data, generate_event_study_data, generate_factor_data, generate_panel_data, @@ -409,6 +410,7 @@ "generate_staggered_data", "generate_factor_data", "generate_ddd_data", + "generate_ddd_panel_data", "generate_panel_data", "generate_event_study_data", "generate_staggered_ddd_data", diff --git a/diff_diff/prep.py b/diff_diff/prep.py index 3c47832c7..1f1048a6e 100644 --- a/diff_diff/prep.py +++ b/diff_diff/prep.py @@ -19,6 +19,7 @@ from diff_diff.prep_dgp import ( # noqa: F401 generate_continuous_did_data, generate_ddd_data, + generate_ddd_panel_data, generate_did_data, generate_event_study_data, generate_factor_data, diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py index 45377b87b..7e271c72f 100644 --- a/diff_diff/prep_dgp.py +++ b/diff_diff/prep_dgp.py @@ -1129,6 +1129,209 @@ def generate_staggered_ddd_data( return pd.DataFrame(records) +def generate_ddd_panel_data( + n_units: int = 200, + n_periods: int = 8, + treatment_period: int = 4, + group_frac: float = 0.5, + partition_frac: float = 0.5, + treatment_effect: float = 2.0, + group_effect: float = 2.0, + partition_effect: float = 1.0, + time_effect: float = 0.5, + group_time_interaction: float = 1.0, + partition_time_interaction: float = 0.5, + group_partition_interaction: float = 1.5, + unit_fe_sd: float = 1.5, + noise_sd: float = 1.0, + add_covariates: bool = False, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic panel data for Triple Difference (DDD) power analysis. + + Creates a balanced panel of n_units observed over n_periods with two + time-invariant binary dimensions (``group`` and ``partition``) and a + derived binary ``post`` indicator. The triple-interaction effect + (``group * partition * post``) is the identifying ATT under DDD-CPT. + + The DGP equation is:: + + Y_{i,t} = unit_fe_i + + group_i * group_effect + + partition_i * partition_effect + + post_t * time_effect + + (group_i * partition_i) * group_partition_interaction + + (group_i * post_t) * group_time_interaction + + (partition_i * post_t) * partition_time_interaction + + treatment_effect * group_i * partition_i * post_t + + epsilon_{i,t} + + where ``group_i`` and ``partition_i`` are unit-level (constant in t) + and ``post_t = 1[period >= treatment_period]``. DDD-CPT identification + holds because ``group_partition_interaction`` enters only as a + unit-level (time-invariant) effect, leaving the triple-interaction as + the sole source of differential group × partition trend. + + Unlike the cross-sectional ``generate_ddd_data``, this DGP provides + panel-realistic unit fixed effects and within-unit serial structure, + making it suitable for panel-aware power-analysis simulations or + sanity-checking estimators that ignore the panel dimension. + + Parameters + ---------- + n_units : int, default=200 + Number of units in the panel. + n_periods : int, default=8 + Number of time periods. + treatment_period : int, default=4 + Period (0-indexed) at which ``post`` switches from 0 to 1. + Must satisfy ``1 <= treatment_period < n_periods``. + group_frac : float, default=0.5 + Fraction of units with ``group=1``. Must be in ``(0, 1)``. + partition_frac : float, default=0.5 + Fraction of units with ``partition=1`` (assigned independently of + ``group``). Must be in ``(0, 1)``. + treatment_effect : float, default=2.0 + True ATT for the triple-interaction cell (group=1, partition=1, + post=1). + group_effect : float, default=2.0 + Main effect of ``group=1`` (unit-level). + partition_effect : float, default=1.0 + Main effect of ``partition=1`` (unit-level). + time_effect : float, default=0.5 + Main effect of ``post=1`` (time-level). + group_time_interaction : float, default=1.0 + Coefficient on ``group * post`` (differential trend for the group + dimension). + partition_time_interaction : float, default=0.5 + Coefficient on ``partition * post`` (differential trend for the + partition dimension). + group_partition_interaction : float, default=1.5 + Coefficient on the unit-level ``group * partition`` interaction. + Must be time-invariant for DDD-CPT to hold. + unit_fe_sd : float, default=1.5 + Standard deviation of the unit fixed effect. + noise_sd : float, default=1.0 + Standard deviation of the idiosyncratic noise term. + add_covariates : bool, default=False + If True, add unit-level covariates ``x1`` (continuous) and ``x2`` + (binary) that affect the outcome. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + pd.DataFrame + Long-format panel with columns: + + - ``unit``: integer unit ID. + - ``period``: integer time period (0-indexed). + - ``outcome``: outcome variable. + - ``group``: unit-level binary group indicator (time-invariant). + - ``partition``: unit-level binary partition indicator + (time-invariant, orthogonal to group). + - ``post``: binary indicator, ``1`` if ``period >= treatment_period``. + - ``treated``: ``group * partition * post`` (binary). + - ``true_effect``: ``treatment_effect`` when treated, else 0. + - ``x1``, ``x2``: optional unit-level covariates (only if + ``add_covariates=True``). + + Examples + -------- + Generate a balanced panel with default parameters: + + >>> data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42) + >>> data.shape + (1600, 8) + >>> data.groupby('unit')['period'].count().eq(8).all() + True + + Fit with TripleDifference (note ``time="post"``): + + >>> from diff_diff import TripleDifference + >>> result = TripleDifference().fit( + ... data, outcome='outcome', group='group', + ... partition='partition', time='post', + ... ) + """ + if not (1 <= treatment_period < n_periods): + raise ValueError( + f"treatment_period must satisfy 1 <= treatment_period < n_periods; " + f"got treatment_period={treatment_period}, n_periods={n_periods}." + ) + if not (0.0 < group_frac < 1.0): + raise ValueError(f"group_frac must be in (0, 1); got {group_frac}.") + if not (0.0 < partition_frac < 1.0): + raise ValueError(f"partition_frac must be in (0, 1); got {partition_frac}.") + if n_units < 4: + raise ValueError( + f"n_units must be >= 4 to populate all four (group, partition) cells; " + f"got {n_units}." + ) + + rng = np.random.default_rng(seed) + + n_group_1 = max(1, min(n_units - 1, int(round(n_units * group_frac)))) + n_partition_1 = max(1, min(n_units - 1, int(round(n_units * partition_frac)))) + group = np.zeros(n_units, dtype=int) + group_idx = rng.choice(n_units, size=n_group_1, replace=False) + group[group_idx] = 1 + partition = np.zeros(n_units, dtype=int) + partition_idx = rng.choice(n_units, size=n_partition_1, replace=False) + partition[partition_idx] = 1 + + unit_fe = rng.normal(0.0, unit_fe_sd, size=n_units) + + if add_covariates: + x1 = rng.normal(0.0, 1.0, size=n_units) + x2 = rng.choice([0, 1], size=n_units) + else: + x1 = None + x2 = None + + records = [] + for i in range(n_units): + g_i = int(group[i]) + p_i = int(partition[i]) + for t in range(n_periods): + post_t = 1 if t >= treatment_period else 0 + y = ( + unit_fe[i] + + g_i * group_effect + + p_i * partition_effect + + post_t * time_effect + + (g_i * p_i) * group_partition_interaction + + (g_i * post_t) * group_time_interaction + + (p_i * post_t) * partition_time_interaction + ) + treated = int(g_i == 1 and p_i == 1 and post_t == 1) + true_eff = treatment_effect if treated else 0.0 + if treated: + y += treatment_effect + if add_covariates: + y += 0.5 * x1[i] + 0.3 * x2[i] + y += rng.normal(0.0, noise_sd) + + row = { + "unit": i, + "period": t, + "outcome": float(y), + "group": g_i, + "partition": p_i, + "post": post_t, + "treated": treated, + "true_effect": float(true_eff), + } + if add_covariates: + row["x1"] = float(x1[i]) + row["x2"] = int(x2[i]) + + records.append(row) + + return pd.DataFrame(records) + + def _rank_pair_weights( unit_weight: np.ndarray, unit_stratum: np.ndarray, @@ -1432,9 +1635,7 @@ def generate_survey_did_data( raise ValueError("te_covariate_interaction requires add_covariates=True") if not np.isfinite(conditional_pt): - raise ValueError( - f"conditional_pt must be finite, got {conditional_pt}" - ) + raise ValueError(f"conditional_pt must be finite, got {conditional_pt}") if conditional_pt != 0.0 and not add_covariates: raise ValueError("conditional_pt requires add_covariates=True") if conditional_pt != 0.0: diff --git a/docs/api/prep.rst b/docs/api/prep.rst index 555100d6d..e01c80b4d 100644 --- a/docs/api/prep.rst +++ b/docs/api/prep.rst @@ -70,6 +70,13 @@ Generate synthetic Triple Difference data. .. autofunction:: diff_diff.generate_ddd_data +generate_ddd_panel_data +~~~~~~~~~~~~~~~~~~~~~~~ + +Generate synthetic panel-structured Triple Difference data for power analysis. + +.. autofunction:: diff_diff.generate_ddd_panel_data + generate_factor_data ~~~~~~~~~~~~~~~~~~~~ diff --git a/tests/test_prep.py b/tests/test_prep.py index 0e4bf5d39..696f507ce 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -792,9 +792,9 @@ def test_extreme_Y_scale_synthetic_weight_column(self): weights = result["synthetic_weight"].to_numpy() # Valid simplex: non-negative, sums to 1. assert np.all(weights >= 0), "synthetic_weight must be non-negative" - assert abs(weights.sum() - 1.0) < 1e-10, ( - f"synthetic_weight should sum to 1.0, got {weights.sum()}" - ) + assert ( + abs(weights.sum() - 1.0) < 1e-10 + ), f"synthetic_weight should sum to 1.0, got {weights.sum()}" # Non-degenerate: at least 2 controls receive non-trivial weight. # This guards the Rust-PGD collapse-to-one-vertex bug that # previously fired at Y ~ 1e9 under the deleted wrapper. @@ -833,9 +833,7 @@ def test_synthetic_weight_column_with_lambda_reg(self): for w, label in [(w_unreg, "unregularized"), (w_reg, "regularized")]: assert np.all(w >= 0), f"{label} weights must be non-negative" - assert abs(w.sum() - 1.0) < 1e-10, ( - f"{label} weights should sum to 1.0, got {w.sum()}" - ) + assert abs(w.sum() - 1.0) < 1e-10, f"{label} weights should sum to 1.0, got {w.sum()}" # Regularization should increase entropy (pull toward uniform) or at # least not collapse the simplex — a valid regularized solution must @@ -1138,6 +1136,178 @@ def test_reproducibility(self): pd.testing.assert_frame_equal(data1, data2) +class TestGenerateDddPanelData: + """Tests for generate_ddd_panel_data function (panel-structured DDD DGP).""" + + def test_shape(self): + """len(data) == n_units * n_periods (balanced panel).""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42) + assert len(data) == 200 * 8 + expected_cols = { + "unit", + "period", + "outcome", + "group", + "partition", + "post", + "treated", + "true_effect", + } + assert expected_cols.issubset(set(data.columns)) + + def test_balanced_cells(self): + """Each (group, partition) cell has roughly n_units * group_frac * partition_frac units.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data( + n_units=400, n_periods=8, group_frac=0.5, partition_frac=0.5, seed=42 + ) + # Cell counts measured at unit level (not row level) + unit_cells = data.groupby("unit")[["group", "partition"]].first() + cell_counts = unit_cells.groupby(["group", "partition"]).size() + assert len(cell_counts) == 4 + expected_per_cell = 400 * 0.5 * 0.5 # = 100 + for count in cell_counts.values: + assert abs(count - expected_per_cell) <= 2 + + def test_treatment_effect_location(self): + """true_effect != 0 only when group==1 AND partition==1 AND post==1.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data( + n_units=200, n_periods=8, treatment_period=4, treatment_effect=3.0, seed=42 + ) + treated_mask = (data["group"] == 1) & (data["partition"] == 1) & (data["post"] == 1) + assert (data.loc[treated_mask, "true_effect"] == 3.0).all() + assert (data.loc[~treated_mask, "true_effect"] == 0.0).all() + assert (data["treated"] == treated_mask.astype(int)).all() + + def test_panel_structure(self): + """Each unit observed in exactly n_periods periods.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data(n_units=100, n_periods=6, seed=42) + assert data.groupby("unit")["period"].count().eq(6).all() + + def test_post_derived_from_period(self): + """post is exactly 1[period >= treatment_period].""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=3, seed=42) + expected_post = (data["period"] >= 3).astype(int) + assert (data["post"] == expected_post).all() + + def test_group_partition_time_invariant(self): + """group and partition are unit-level (constant within unit) — DDD-CPT requirement.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42) + assert data.groupby("unit")["group"].nunique().eq(1).all() + assert data.groupby("unit")["partition"].nunique().eq(1).all() + + def test_treatment_period_validation(self): + """treatment_period must satisfy 1 <= treatment_period < n_periods.""" + from diff_diff.prep import generate_ddd_panel_data + + with pytest.raises(ValueError, match="treatment_period"): + generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=0, seed=42) + with pytest.raises(ValueError, match="treatment_period"): + generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=8, seed=42) + + def test_group_frac_validation(self): + """group_frac and partition_frac must be strictly in (0, 1).""" + from diff_diff.prep import generate_ddd_panel_data + + with pytest.raises(ValueError, match="group_frac"): + generate_ddd_panel_data(n_units=100, group_frac=0.0, seed=42) + with pytest.raises(ValueError, match="group_frac"): + generate_ddd_panel_data(n_units=100, group_frac=1.0, seed=42) + with pytest.raises(ValueError, match="partition_frac"): + generate_ddd_panel_data(n_units=100, partition_frac=0.0, seed=42) + + def test_n_units_validation(self): + """n_units must be >= 4.""" + from diff_diff.prep import generate_ddd_panel_data + + with pytest.raises(ValueError, match="n_units"): + generate_ddd_panel_data(n_units=3, seed=42) + + def test_reproducibility(self): + """Same seed produces identical DataFrames.""" + from diff_diff.prep import generate_ddd_panel_data + + data1 = generate_ddd_panel_data(n_units=100, n_periods=6, seed=123) + data2 = generate_ddd_panel_data(n_units=100, n_periods=6, seed=123) + pd.testing.assert_frame_equal(data1, data2) + + def test_ddd_effect_recovery_deterministic(self): + """noise_sd=0 deterministic recovery within 1e-6 (catches DGP transcription bugs).""" + from diff_diff import TripleDifference + from diff_diff.prep import generate_ddd_panel_data + + true_effect = 2.0 + data = generate_ddd_panel_data( + n_units=400, + n_periods=8, + treatment_period=4, + treatment_effect=true_effect, + noise_sd=0.0, + seed=42, + ) + result = TripleDifference().fit( + data, + outcome="outcome", + group="group", + partition="partition", + time="post", + ) + assert abs(result.att - true_effect) < 1e-6 + + def test_ddd_effect_recovery(self): + """Finite-sample recovery within tolerance under default noise.""" + from diff_diff import TripleDifference + from diff_diff.prep import generate_ddd_panel_data + + true_effect = 2.0 + data = generate_ddd_panel_data( + n_units=1000, + n_periods=8, + treatment_period=4, + treatment_effect=true_effect, + seed=42, + ) + result = TripleDifference().fit( + data, + outcome="outcome", + group="group", + partition="partition", + time="post", + ) + assert abs(result.att - true_effect) < 0.7 + + def test_covariates_added(self): + """add_covariates=True adds x1 and x2 columns.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data( + n_units=100, n_periods=6, treatment_period=3, add_covariates=True, seed=42 + ) + assert "x1" in data.columns + assert "x2" in data.columns + + def test_covariates_omitted(self): + """add_covariates=False omits x1 and x2 columns.""" + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data( + n_units=100, n_periods=6, treatment_period=3, add_covariates=False, seed=42 + ) + assert "x1" not in data.columns + assert "x2" not in data.columns + + class TestGeneratePanelData: """Tests for generate_panel_data function.""" @@ -2095,20 +2265,27 @@ def test_conditional_pt_requires_both_groups(self): # Zero never-treated (exact) with pytest.raises(ValueError, match="conditional_pt requires at least one"): generate_survey_did_data( - add_covariates=True, conditional_pt=0.3, - never_treated_frac=0.0, seed=42, + add_covariates=True, + conditional_pt=0.3, + never_treated_frac=0.0, + seed=42, ) # Small fraction that floors to zero never-treated units with pytest.raises(ValueError, match="conditional_pt requires at least one"): generate_survey_did_data( - n_units=50, add_covariates=True, conditional_pt=0.3, - never_treated_frac=0.01, seed=42, + n_units=50, + add_covariates=True, + conditional_pt=0.3, + never_treated_frac=0.01, + seed=42, ) # All never-treated (no ever-treated units) with pytest.raises(ValueError, match="conditional_pt requires at least one"): generate_survey_did_data( - add_covariates=True, conditional_pt=0.3, - never_treated_frac=1.0, seed=42, + add_covariates=True, + conditional_pt=0.3, + never_treated_frac=1.0, + seed=42, ) def test_conditional_pt_requires_covariates(self): @@ -2123,13 +2300,9 @@ def test_conditional_pt_nonfinite_rejected(self): from diff_diff.prep_dgp import generate_survey_did_data with pytest.raises(ValueError, match="conditional_pt must be finite"): - generate_survey_did_data( - add_covariates=True, conditional_pt=np.inf, seed=42 - ) + generate_survey_did_data(add_covariates=True, conditional_pt=np.inf, seed=42) with pytest.raises(ValueError, match="conditional_pt must be finite"): - generate_survey_did_data( - add_covariates=True, conditional_pt=np.nan, seed=42 - ) + generate_survey_did_data(add_covariates=True, conditional_pt=np.nan, seed=42) def test_conditional_pt_x1_distribution_shift(self): """Treated units should have higher x1 when conditional_pt is active.""" @@ -2295,9 +2468,7 @@ def test_conditional_pt_crosssection_conditional_did(self): uncond_did = abs(beta_uncond[3]) # Conditional DID: add x1 and x1*post - X_cond = np.column_stack([ - np.ones(n), treated, post, treated_post, x1, x1_post - ]) + X_cond = np.column_stack([np.ones(n), treated, post, treated_post, x1, x1_post]) beta_cond = np.linalg.lstsq(X_cond, y, rcond=None)[0] cond_did = abs(beta_cond[3]) @@ -2311,9 +2482,7 @@ def test_conditional_pt_backward_compatible(self): """conditional_pt=0.0 should produce identical output to default.""" from diff_diff.prep_dgp import generate_survey_did_data - df_default = generate_survey_did_data( - n_units=100, add_covariates=True, seed=99 - ) + df_default = generate_survey_did_data(n_units=100, add_covariates=True, seed=99) df_explicit = generate_survey_did_data( n_units=100, add_covariates=True, conditional_pt=0.0, seed=99 ) @@ -2380,9 +2549,7 @@ def test_conditional_pt_panel_and_crosssection(self): p1 = df[df["period"] == 1] x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].mean() x1_control = p1.loc[p1["first_treat"] == 0, "x1"].mean() - assert x1_treated > x1_control, ( - f"panel={panel_mode}: treated x1 not shifted" - ) + assert x1_treated > x1_control, f"panel={panel_mode}: treated x1 not shifted" class TestAggregateSurvey: @@ -3355,9 +3522,7 @@ def test_pweight_values(self, micro_data, design): for state in panel["state"].unique(): state_rows = panel[panel["state"] == state] expected_weight = state_rows["cell_sum_w"].mean() - np.testing.assert_allclose( - state_rows["y_weight"].values, expected_weight, rtol=1e-10 - ) + np.testing.assert_allclose(state_rows["y_weight"].values, expected_weight, rtol=1e-10) # Must be constant within unit assert state_rows["y_weight"].nunique() == 1 @@ -3459,15 +3624,21 @@ def test_pweight_callaway_santanna_integration(self): n_resp = rng.randint(15, 25) # varying respondents per cell te = 2.0 if (first_treats[geo] > 0 and period >= first_treats[geo]) else 0.0 for _ in range(n_resp): - rows.append({ - "geo": geo, "period": period, - "wt": rng.uniform(0.5, 3.0), - "y": rng.normal(10 + te, 2), - }) + rows.append( + { + "geo": geo, + "period": period, + "wt": rng.uniform(0.5, 3.0), + "y": rng.normal(10 + te, 2), + } + ) micro = pd.DataFrame(rows) design = SurveyDesign(weights="wt") panel, stage2 = aggregate_survey( - micro, by=["geo", "period"], outcomes="y", survey_design=design, + micro, + by=["geo", "period"], + outcomes="y", + survey_design=design, ) assert stage2.weight_type == "pweight" @@ -3475,14 +3646,18 @@ def test_pweight_callaway_santanna_integration(self): # but y_weight must be unit-constant for geo in panel["geo"].unique(): geo_rows = panel[panel["geo"] == geo] - assert geo_rows["y_weight"].nunique() == 1, ( - f"Geo {geo}: y_weight not constant within unit" - ) + assert ( + geo_rows["y_weight"].nunique() == 1 + ), f"Geo {geo}: y_weight not constant within unit" panel["first_treat"] = panel["geo"].map(first_treats) result = CallawaySantAnna().fit( - panel, outcome="y_mean", unit="geo", time="period", - first_treat="first_treat", survey_design=stage2, + panel, + outcome="y_mean", + unit="geo", + time="period", + first_treat="first_treat", + survey_design=stage2, ) assert np.isfinite(result.overall_att), f"ATT not finite: {result.overall_att}" assert result.overall_se > 0, f"SE not positive: {result.overall_se}" @@ -3546,7 +3721,10 @@ def test_pweight_retains_zero_precision_geo(self): # Pweight mode: state 0 retained (cell_sum_w > 0 despite NaN precision) panel_p, _ = aggregate_survey( - data, by=["state", "period"], outcomes="y", survey_design=design, + data, + by=["state", "period"], + outcomes="y", + survey_design=design, second_stage_weights="pweight", ) assert 0 in panel_p["state"].values @@ -3555,7 +3733,10 @@ def test_pweight_retains_zero_precision_geo(self): # Aweight mode: state 0 dropped (all precision NaN -> weight 0) with pytest.warns(UserWarning, match="zero total weight"): panel_a, _ = aggregate_survey( - data, by=["state", "period"], outcomes="y", survey_design=design, + data, + by=["state", "period"], + outcomes="y", + survey_design=design, second_stage_weights="aweight", ) assert 0 not in panel_a["state"].values @@ -3584,9 +3765,7 @@ def _build_microdata(self, mode, seed=42): n = len(state) wt = rng.uniform(0.5, 2.5, n) y = rng.normal(5.0, 1.5, n) - df_base = pd.DataFrame( - {"state": state, "year": year, "wt": wt, "y": y} - ) + df_base = pd.DataFrame({"state": state, "year": year, "wt": wt, "y": y}) if mode == "stratified_fpc": df = df_base.copy() @@ -3643,7 +3822,10 @@ def _build_microdata(self, mode, seed=42): df["psu"] = psu policy = mode.split("_", 1)[1] sd = SurveyDesign( - weights="wt", strata="stratum", psu="psu", lonely_psu=policy, + weights="wt", + strata="stratum", + psu="psu", + lonely_psu=policy, ) return df, sd @@ -3660,8 +3842,10 @@ def _assert_panels_equivalent(p_fast, p_legacy, outcome="y"): nan_a, nan_b = np.isnan(a), np.isnan(b) assert np.array_equal(nan_a, nan_b), f"NaN pattern mismatch in {col}" np.testing.assert_allclose( - a[~nan_a], b[~nan_b], - atol=1e-14, rtol=1e-14, + a[~nan_a], + b[~nan_b], + atol=1e-14, + rtol=1e-14, err_msg=f"{col} diverges between fast and legacy paths", ) @@ -3685,16 +3869,24 @@ def test_fast_path_equals_legacy(self, mode, monkeypatch): data, sd = self._build_microdata(mode) panel_fast, _ = aggregate_survey( - data, by=["state", "year"], outcomes="y", survey_design=sd, + data, + by=["state", "year"], + outcomes="y", + survey_design=sd, ) # Force the legacy code path by disabling the scaffolding precompute. # _cell_mean_variance falls back to compute_survey_if_variance when # scaffolding is None. monkeypatch.setattr( - prep, "_precompute_psu_scaffolding", lambda resolved: None, + prep, + "_precompute_psu_scaffolding", + lambda resolved: None, ) panel_legacy, _ = aggregate_survey( - data, by=["state", "year"], outcomes="y", survey_design=sd, + data, + by=["state", "year"], + outcomes="y", + survey_design=sd, ) self._assert_panels_equivalent(panel_fast, panel_legacy) From a897e3a9046e94e5b0e924f04d7aeae6e5cde9bc Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 12:23:50 -0400 Subject: [PATCH 3/6] trop: extract shared data-setup helper from fit() and _fit_global() `TROP.fit()` (local path) and `_fit_global()` previously duplicated ~85 lines of data-setup logic each: panel pivoting, absorbing-state validation, treated/control unit identification, first-treatment-period detection, and pre/post period counting. The duplicated blocks were near-identical line-by-line, differing only in which index mappings the caller built (local built all four; global built only the forward maps). Extracts `_setup_trop_data(...)` in trop_local.py (alongside the existing `_validate_and_pivot_treatment` helper). Both callers now invoke it and unpack only the fields they consume. The helper returns all four index mappings (`unit_to_idx`, `period_to_idx`, `idx_to_unit`, `idx_to_period`) uniformly, eliminating a class of subtle drift bug; `_fit_global` gains two unused locals as a trade-off. The global-method-specific staggered-adoption check stays in `_fit_global` as a post-helper validation (it depends on estimator semantics, not data preparation). Bootstrap-loop dedup (~40 LoC across `_bootstrap_variance` / `_bootstrap_variance_global`) is deferred to a follow-up (TODO.md row added). Adds a parity regression test `TestTROPModuleSplit::test_setup_trop_data_internal_contract` that round-trip-verifies the index mappings, shape consistency, and treated/control partition disjointness. Behavior-preserving: TROP test suite (84 non-slow tests) is the safety net. Closes Tier-A backlog item 4. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/trop.py | 115 +++++++----------------------------- diff_diff/trop_global.py | 93 ++++++----------------------- diff_diff/trop_local.py | 123 +++++++++++++++++++++++++++++++++++++++ tests/test_trop.py | 43 ++++++++++++++ 4 files changed, 206 insertions(+), 168 deletions(-) diff --git a/diff_diff/trop.py b/diff_diff/trop.py index d4bd1840a..5c60dc15f 100644 --- a/diff_diff/trop.py +++ b/diff_diff/trop.py @@ -31,7 +31,7 @@ _rust_loocv_grid_search, ) from diff_diff.trop_global import TROPGlobalMixin -from diff_diff.trop_local import TROPLocalMixin, _validate_and_pivot_treatment +from diff_diff.trop_local import TROPLocalMixin, _setup_trop_data from diff_diff.trop_results import ( _LAMBDA_INF, _PrecomputedStructures, @@ -432,7 +432,6 @@ def fit( # Resolve survey design from diff_diff.survey import ( - _extract_unit_survey_weights, _resolve_survey_for_fit, _validate_unit_constant_survey, ) @@ -470,94 +469,21 @@ def fit( ) # Below is the local method (default) - # Get unique units and periods - all_units = sorted(data[unit].unique()) - - # Extract unit-level survey weights - if resolved_survey is not None: - unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units) - else: - unit_weight_arr = None - all_periods = sorted(data[time].unique()) - - n_units = len(all_units) - n_periods = len(all_periods) - - # Create mappings - unit_to_idx = {u: i for i, u in enumerate(all_units)} - period_to_idx = {p: i for i, p in enumerate(all_periods)} - idx_to_unit = {i: u for u, i in unit_to_idx.items()} - idx_to_period = {i: p for p, i in period_to_idx.items()} - - # Create outcome matrix Y (n_periods x n_units) and treatment matrix D - # Vectorized: use pivot for O(1) reshaping instead of O(n) iterrows loop - Y = ( - data.pivot(index=time, columns=unit, values=outcome) - .reindex(index=all_periods, columns=all_units) - .values + _ctx = _setup_trop_data( + data, outcome, treatment, unit, time, resolved_survey, survey_design ) - - # For D matrix, validate observed treatment and handle unbalanced panels - D, missing_mask = _validate_and_pivot_treatment( - data, time, unit, treatment, all_periods, all_units - ) - - # Validate D is monotonic non-decreasing per unit (absorbing state) - # D[t, i] must satisfy: once D=1, it must stay 1 for all subsequent periods - # Issue 3 fix (round 10): Check each unit's OBSERVED D sequence for monotonicity - # This catches 1->0 violations that span missing period gaps - # Example: D[2]=1, missing [3,4], D[5]=0 is a real violation even though - # adjacent period transitions don't show it (the gap hides the transition) - violating_units = [] - for unit_idx in range(n_units): - # Get observed D values for this unit (where not missing) - observed_mask = ~missing_mask[:, unit_idx] - observed_d = D[observed_mask, unit_idx] - - # Check if observed sequence is monotonically non-decreasing - if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0): - violating_units.append(all_units[unit_idx]) - - if violating_units: - raise ValueError( - f"Treatment indicator is not an absorbing state for units: {violating_units}. " - f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). " - f"If this is event-study style data, convert to absorbing state: " - f"D[t, i] = 1 for all t >= first treatment period." - ) - - # Identify treated observations - treated_mask = D == 1 - n_treated_obs = np.sum(treated_mask) - - if n_treated_obs == 0: - raise ValueError("No treated observations found") - - # Identify treated and control units - unit_ever_treated = np.any(D == 1, axis=0) - treated_unit_idx = np.where(unit_ever_treated)[0] - control_unit_idx = np.where(~unit_ever_treated)[0] - - if len(control_unit_idx) == 0: - raise ValueError("No control units found") - - # Determine pre/post periods from treatment indicator D - # D matrix is the sole input for treatment timing per the paper - first_treat_period = None - for t in range(n_periods): - if np.any(D[t, :] == 1): - first_treat_period = t - break - if first_treat_period is None: - raise ValueError("Could not infer post-treatment periods from D matrix") - - n_pre_periods = first_treat_period - # Count periods where D=1 is actually observed (matches docstring) - # Per docstring: "Number of post-treatment periods (periods with D=1 observations)" - n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1))) - - if n_pre_periods < 2: - raise ValueError("Need at least 2 pre-treatment periods") + n_units = _ctx["n_units"] + n_periods = _ctx["n_periods"] + idx_to_unit = _ctx["idx_to_unit"] + idx_to_period = _ctx["idx_to_period"] + unit_weight_arr = _ctx["unit_weight_arr"] + Y = _ctx["Y"] + D = _ctx["D"] + n_treated_obs = _ctx["n_treated_obs"] + treated_unit_idx = _ctx["treated_unit_idx"] + control_unit_idx = _ctx["control_unit_idx"] + n_pre_periods = _ctx["n_pre_periods"] + n_post_periods = _ctx["n_post_periods"] # Step 1: Grid search with LOOCV for tuning parameters best_lambda = None @@ -769,7 +695,12 @@ def fit( # Fit model with these weights n_fits_attempted += 1 alpha_hat, beta_hat, L_hat = self._estimate_model( - Y, control_mask, weight_matrix, lambda_nn, n_units, n_periods, + Y, + control_mask, + weight_matrix, + lambda_nn, + n_units, + n_periods, _nonconvergence_tracker=nonconverg_tracker, ) @@ -907,9 +838,7 @@ def set_params(self, **params) -> "TROP": """Set estimator parameters.""" for key, value in params.items(): if key == "method" and value not in ("local", "global"): - raise ValueError( - f"method must be one of ('local', 'global'), got '{value}'" - ) + raise ValueError(f"method must be one of ('local', 'global'), got '{value}'") if hasattr(self, key): setattr(self, key, value) else: diff --git a/diff_diff/trop_global.py b/diff_diff/trop_global.py index 568e0eebd..ec7402e70 100644 --- a/diff_diff/trop_global.py +++ b/diff_diff/trop_global.py @@ -28,7 +28,7 @@ stratified_bootstrap_indices, warn_bootstrap_failure_rate, ) -from diff_diff.trop_local import _soft_threshold_svd, _validate_and_pivot_treatment +from diff_diff.trop_local import _setup_trop_data, _soft_threshold_svd from diff_diff.trop_results import TROPResults from diff_diff.utils import safe_inference, warn_if_not_converged @@ -584,82 +584,25 @@ def _fit_global( across units, use `method="local"` which computes observation-specific weights that naturally handle heterogeneous timing. """ - # Data setup (same as local method) - all_units = sorted(data[unit].unique()) - all_periods = sorted(data[time].unique()) - - # Extract per-unit survey weights for weighted ATT aggregation - if resolved_survey is not None: - from diff_diff.survey import _extract_unit_survey_weights - - unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units) - else: - unit_weight_arr = None - - n_units = len(all_units) - n_periods = len(all_periods) - - idx_to_unit = {i: u for i, u in enumerate(all_units)} - idx_to_period = {i: p for i, p in enumerate(all_periods)} - - # Create matrices - Y = ( - data.pivot(index=time, columns=unit, values=outcome) - .reindex(index=all_periods, columns=all_units) - .values - ) - - D, missing_mask = _validate_and_pivot_treatment( - data, time, unit, treatment, all_periods, all_units + # Data setup (shared with local method via _setup_trop_data helper). + _ctx = _setup_trop_data( + data, outcome, treatment, unit, time, resolved_survey, survey_design ) - - # Validate absorbing state - violating_units = [] - for unit_idx in range(n_units): - observed_mask = ~missing_mask[:, unit_idx] - observed_d = D[observed_mask, unit_idx] - if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0): - violating_units.append(all_units[unit_idx]) - - if violating_units: - raise ValueError( - f"Treatment indicator is not an absorbing state for units: {violating_units}. " - f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). " - f"If this is event-study style data, convert to absorbing state: " - f"D[t, i] = 1 for all t >= first treatment period." - ) - - # Identify treated observations - treated_mask = D == 1 - n_treated_obs = np.sum(treated_mask) - - if n_treated_obs == 0: - raise ValueError("No treated observations found") - - # Identify treated and control units - unit_ever_treated = np.any(D == 1, axis=0) - treated_unit_idx = np.where(unit_ever_treated)[0] - control_unit_idx = np.where(~unit_ever_treated)[0] - - if len(control_unit_idx) == 0: - raise ValueError("No control units found") - - # Determine pre/post periods - first_treat_period = None - for t in range(n_periods): - if np.any(D[t, :] == 1): - first_treat_period = t - break - - if first_treat_period is None: - raise ValueError("Could not infer post-treatment periods from D matrix") - - n_pre_periods = first_treat_period + n_units = _ctx["n_units"] + n_periods = _ctx["n_periods"] + idx_to_unit = _ctx["idx_to_unit"] + idx_to_period = _ctx["idx_to_period"] + unit_weight_arr = _ctx["unit_weight_arr"] + Y = _ctx["Y"] + D = _ctx["D"] + missing_mask = _ctx["missing_mask"] + n_treated_obs = _ctx["n_treated_obs"] + treated_unit_idx = _ctx["treated_unit_idx"] + control_unit_idx = _ctx["control_unit_idx"] + first_treat_period = _ctx["first_treat_period"] + n_pre_periods = _ctx["n_pre_periods"] + n_post_periods = _ctx["n_post_periods"] treated_periods = n_periods - first_treat_period - n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1))) - - if n_pre_periods < 2: - raise ValueError("Need at least 2 pre-treatment periods") # Check for staggered adoption (global method requires simultaneous treatment) # Use only observed periods (skip missing) to avoid false positives on unbalanced panels diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py index 6efa3d7a3..64b633d1b 100644 --- a/diff_diff/trop_local.py +++ b/diff_diff/trop_local.py @@ -71,6 +71,129 @@ def _validate_and_pivot_treatment(data, time, unit, treatment, all_periods, all_ return D, missing_mask +def _setup_trop_data(data, outcome, treatment, unit, time, resolved_survey, survey_design): + """Shared data setup for TROP local and global fit paths. + + Performs panel pivoting (long → wide), absorbing-state validation, + treated/control unit identification, first-treatment-period detection, + and pre/post period counting. Returns a dict so both callers can + unpack only the fields they need. + + The global-method-specific staggered-adoption check stays in + `_fit_global` as a post-helper validation because it depends on + estimator semantics (global method requires simultaneous treatment), + not data preparation. + + Note: the same dict-shaped contract is also relied on by the bootstrap + paths (`_bootstrap_variance` / `_bootstrap_variance_global`) which + currently rebuild similar state per draw. Future contract changes to + this helper must be checked against both `_fit_*` and `_bootstrap_*` + call sites for cross-file sync. + + Returns + ------- + dict + With keys: + ``all_units, all_periods, n_units, n_periods, unit_to_idx, + period_to_idx, idx_to_unit, idx_to_period, unit_weight_arr, + Y, D, missing_mask, treated_mask, n_treated_obs, + treated_unit_idx, control_unit_idx, first_treat_period, + n_pre_periods, n_post_periods``. + """ + all_units = sorted(data[unit].unique()) + all_periods = sorted(data[time].unique()) + + if resolved_survey is not None: + from diff_diff.survey import _extract_unit_survey_weights + + unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units) + else: + unit_weight_arr = None + + n_units = len(all_units) + n_periods = len(all_periods) + + unit_to_idx = {u: i for i, u in enumerate(all_units)} + period_to_idx = {p: i for i, p in enumerate(all_periods)} + idx_to_unit = {i: u for u, i in unit_to_idx.items()} + idx_to_period = {i: p for p, i in period_to_idx.items()} + + Y = ( + data.pivot(index=time, columns=unit, values=outcome) + .reindex(index=all_periods, columns=all_units) + .values + ) + + D, missing_mask = _validate_and_pivot_treatment( + data, time, unit, treatment, all_periods, all_units + ) + + violating_units = [] + for unit_idx in range(n_units): + observed_mask = ~missing_mask[:, unit_idx] + observed_d = D[observed_mask, unit_idx] + if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0): + violating_units.append(all_units[unit_idx]) + + if violating_units: + raise ValueError( + f"Treatment indicator is not an absorbing state for units: {violating_units}. " + f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). " + f"If this is event-study style data, convert to absorbing state: " + f"D[t, i] = 1 for all t >= first treatment period." + ) + + treated_mask = D == 1 + n_treated_obs = int(np.sum(treated_mask)) + + if n_treated_obs == 0: + raise ValueError("No treated observations found") + + unit_ever_treated = np.any(D == 1, axis=0) + treated_unit_idx = np.where(unit_ever_treated)[0] + control_unit_idx = np.where(~unit_ever_treated)[0] + + if len(control_unit_idx) == 0: + raise ValueError("No control units found") + + first_treat_period = None + for t in range(n_periods): + if np.any(D[t, :] == 1): + first_treat_period = t + break + + if first_treat_period is None: + raise ValueError("Could not infer post-treatment periods from D matrix") + + n_pre_periods = first_treat_period + n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1))) + + if n_pre_periods < 2: + raise ValueError("Need at least 2 pre-treatment periods") + + return { + "all_units": all_units, + "all_periods": all_periods, + "n_units": n_units, + "n_periods": n_periods, + "unit_to_idx": unit_to_idx, + "period_to_idx": period_to_idx, + "idx_to_unit": idx_to_unit, + "idx_to_period": idx_to_period, + "unit_weight_arr": unit_weight_arr, + "Y": Y, + "D": D, + "missing_mask": missing_mask, + "treated_mask": treated_mask, + "n_treated_obs": n_treated_obs, + "treated_unit_idx": treated_unit_idx, + "control_unit_idx": control_unit_idx, + "first_treat_period": first_treat_period, + "n_pre_periods": n_pre_periods, + "n_post_periods": n_post_periods, + } + + # Module-level convergence tolerance for SVD singular value truncation. # Singular values below this threshold after soft-thresholding are treated # as zero to improve numerical stability. diff --git a/tests/test_trop.py b/tests/test_trop.py index aa5de0939..5cf10cac0 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -4146,6 +4146,49 @@ def test_method_dispatch_local_does_not_use_fit_global(self): trop_est.fit(df, "outcome", "treated", "unit", "time") mock_fg.assert_not_called() + def test_setup_trop_data_internal_contract(self): + """`_setup_trop_data` returns a self-consistent state dict used by both fit paths. + + Regression guard for the Wave 4 refactor: both `TROP.fit()` local path and + `_fit_global()` now consume `_setup_trop_data`'s dict. If a future contract + change drops or renames a field, this catches it. + """ + from diff_diff.trop_local import _setup_trop_data + + df = self._make_panel() + ctx = _setup_trop_data( + df, + outcome="outcome", + treatment="treated", + unit="unit", + time="time", + resolved_survey=None, + survey_design=None, + ) + n_units = ctx["n_units"] + n_periods = ctx["n_periods"] + # Dimensions are consistent across return fields. + assert ctx["Y"].shape == (n_periods, n_units) + assert ctx["D"].shape == (n_periods, n_units) + assert ctx["missing_mask"].shape == (n_periods, n_units) + assert ctx["treated_mask"].shape == (n_periods, n_units) + assert len(ctx["all_units"]) == n_units + assert len(ctx["all_periods"]) == n_periods + # Round-trip both mapping pairs (the local path historically built both + # forward and inverse maps; helper now returns all four uniformly so + # global path gains parity). + for i in range(n_units): + assert ctx["unit_to_idx"][ctx["idx_to_unit"][i]] == i + for t in range(n_periods): + assert ctx["period_to_idx"][ctx["idx_to_period"][t]] == t + # first_treat_period derivation matches the canonical "first row of D + # with any treated cell" expression used pre-refactor. + assert ctx["first_treat_period"] == int(np.argmax(np.any(ctx["D"] == 1, axis=1))) + assert ctx["n_pre_periods"] == ctx["first_treat_period"] + # Treated/control unit partition is complete and disjoint. + assert len(ctx["treated_unit_idx"]) + len(ctx["control_unit_idx"]) == n_units + assert len(set(ctx["treated_unit_idx"]) & set(ctx["control_unit_idx"])) == 0 + class TestSilentWarningAudit: """Tests for UserWarning emissions added by the silent warning audit.""" From 347340b9e3560f4063b94249138b8c76d80f3a0c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 12:23:56 -0400 Subject: [PATCH 4/6] todo: drain items 2/3/4 from Tier A backlog Removes three Methodology/Correctness rows (EfficientDiD anticipation trim, generate_ddd_panel_data, TROP fit/_fit_global dedup) and the three corresponding Tier A bullets, addressed in this PR. Adds two follow-up rows for the deferred scope (TROP bootstrap-loop dedup, TripleDifference power auto-routing). Co-Authored-By: Claude Opus 4.7 (1M context) --- TODO.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/TODO.md b/TODO.md index d585f5135..45d4f6587 100644 --- a/TODO.md +++ b/TODO.md @@ -82,11 +82,10 @@ Deferred items from PR reviews that were not addressed before merge. | CallawaySantAnna: consider materializing NaN entries for non-estimable (g,t) cells in group_time_effects dict (currently omitted with consolidated warning); would require updating downstream consumers (event study, balance_e, aggregation) | `staggered.py` | #256 | Low | | ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails) | | Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium | -| EfficientDiD `control_group="last_cohort"` trims at `last_g - anticipation` but REGISTRY says `t >= last_g`. With `anticipation=0` (default) these are identical. With `anticipation>0`, code is arguably more conservative (excludes anticipation-contaminated periods). Either align REGISTRY with code or change code to `t < last_g` — needs design decision. | `efficient_did.py` | #230 | Low | -| TripleDifference power: `generate_ddd_data` is a fixed 2×2×2 cross-sectional DGP — no multi-period or unbalanced-group support. Add a `generate_ddd_panel_data` for panel DDD power analysis. | `prep_dgp.py`, `power.py` | #208 | Low | | Survey design resolution/collapse patterns are inconsistent across panel estimators — ContinuousDiD rebuilds unit-level design in SE code, EfficientDiD builds once in fit(), StackedDiD re-resolves on stacked data; extract shared helpers for panel-to-unit collapse, post-filter re-resolution, and metadata recomputation | `continuous_did.py`, `efficient_did.py`, `stacked_did.py` | #226 | Low | | Survey-weighted Silverman bandwidth in EfficientDiD conditional Omega* — `_silverman_bandwidth()` uses unweighted mean/std for bandwidth selection; survey-weighted statistics would better reflect the population distribution but is a second-order refinement | `efficient_did_covariates.py` | — | Low | -| TROP: `fit()` and `_fit_global()` share ~150 lines of near-identical data setup (panel pivoting, absorbing-state validation, first-treatment detection, effective rank, NaN warnings). Both bootstrap methods also duplicate the stratified resampling loop. Extract shared helpers to eliminate cross-file sync risk. | `trop.py`, `trop_global.py`, `trop_local.py` | — | Low | +| TROP: extend Wave 4's `_setup_trop_data` helper to also cover the duplicated bootstrap resampling loop in `_bootstrap_variance` / `_bootstrap_variance_global` (~40 LoC dedup; mirrors the data-setup helper pattern with a `fit_callable` parameter for the per-draw refit step). | `trop_local.py`, `trop_global.py` | follow-up | Low | +| TripleDifference power auto-routing: `power.simulate_power` ignores `n_periods` for DDD because `_ddd_dgp_kwargs` is hard-coded to the cross-sectional `generate_ddd_data`. Now that `generate_ddd_panel_data` exists (Wave 4), add a new `_EstimatorProfile` registry entry (or extend the existing one) to route to the panel DGP when `n_periods > 2`. | `power.py`, `prep_dgp.py` | follow-up | Low | | StaggeredTripleDifference R cross-validation: CSV fixtures not committed (gitignored); tests skip without local R + triplediff. Commit fixtures or generate deterministically. | `tests/test_methodology_staggered_triple_diff.py` | #245 | Medium | | StaggeredTripleDifference R parity: benchmark only tests no-covariate path (xformla=~1). Add covariate-adjusted scenarios and aggregation SE parity assertions. | `benchmarks/R/benchmark_staggered_triplediff.R` | #245 | Medium | | StaggeredTripleDifference: per-cohort group-effect SEs include WIF (conservative vs R's wif=NULL). Documented in REGISTRY. Could override mixin for exact R match. | `staggered_triple_diff.py` | #245 | Low | @@ -170,9 +169,6 @@ Ordered paydown view across the tables above. Tier A → D is by effort × risk, #### Tier A — Quick wins (≤1 day, ≤3 CI rounds expected) -- EfficientDiD `control_group="last_cohort"` REGISTRY-vs-code alignment with `anticipation>0` (`efficient_did.py`, one design decision) -- TripleDifference: add `generate_ddd_panel_data` for panel DDD power analysis (`prep_dgp.py`, `power.py`) -- TROP: extract shared data-setup helper between `fit()` and `_fit_global()` (~150 LoC dedup; `trop.py`, `trop_global.py`, `trop_local.py`) - WooldridgeDiD: optional efficiency hint when method/outcome pairing is sub-optimal (NOT a canonical-link violation per W2023 Prop 3.1 — see Methodology/Correctness row for the corrected framing) (SyntheticDiD `placebo_effects` → `variance_effects` rename moved to Tier B — the user-facing field rename + one-release deprecation alias is too large for ≤1 day / ≤3 CI rounds.) From 4aa3009308294665d03d16544ce453dccc7bbf2b Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 12:45:00 -0400 Subject: [PATCH 5/6] prep_dgp: guarantee all (group, partition) cells in generate_ddd_panel_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The R1 review identified that independent marginal sampling of `group` and `partition` could leave one of the four (group, partition) cells empty under valid inputs — e.g., `n_units=4, group_frac=partition_frac=0.25` rounds to `n_group_1=1` and `n_p1_g1=0`, so the (1, 1) cell collapses before `TripleDifference.fit`'s 2x2x2 cell-presence check. Switches to stratified allocation: assign `group` to its requested fraction, then within each group stratum, draw `partition=1` at `partition_frac`. Adds a targeted `ValueError` when the rounded cell counts would leave any (group, partition) cell empty (with the four cell counts in the error message so users can pick a feasible config). Adds two regression tests: - `test_infeasible_cell_counts_raise` exercises both the `n_units=4` small-marginal case and an `n_units=10, group_frac=0.1` case. - `test_smallest_feasible_config_populates_all_cells` verifies the smallest feasible config (`n_units=4, fracs=0.5`) yields one unit per cell and that `TripleDifference.fit(..., time="post")` succeeds on it (the contract the docstring advertises). Updates the `group_frac` / `partition_frac` docstring entries to describe the stratified allocation guarantee, and the `[Unreleased]` CHANGELOG entry to mention the cell-coverage invariant. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- diff_diff/prep_dgp.py | 49 +++++++++++++++++++++++++++++++++------ tests/test_prep.py | 53 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19003f9d5..956365c86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 14 tests). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. +- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction. - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422. diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py index 7e271c72f..a178b0d02 100644 --- a/diff_diff/prep_dgp.py +++ b/diff_diff/prep_dgp.py @@ -1188,10 +1188,16 @@ def generate_ddd_panel_data( Period (0-indexed) at which ``post`` switches from 0 to 1. Must satisfy ``1 <= treatment_period < n_periods``. group_frac : float, default=0.5 - Fraction of units with ``group=1``. Must be in ``(0, 1)``. + Fraction of units with ``group=1``. Must be in ``(0, 1)``. The + partition split is then drawn stratified-by-group at the requested + ``partition_frac`` so every (group, partition) cell receives at + least one unit; a ``ValueError`` is raised when the rounded cell + counts would leave any cell empty. partition_frac : float, default=0.5 - Fraction of units with ``partition=1`` (assigned independently of - ``group``). Must be in ``(0, 1)``. + Fraction of units with ``partition=1`` within each ``group`` + stratum. Must be in ``(0, 1)``. The stratified allocation is what + makes TripleDifference.fit's 2x2x2 surface populated for any valid + ``(n_units, group_frac, partition_frac)``. treatment_effect : float, default=2.0 True ATT for the triple-interaction cell (group=1, partition=1, post=1). @@ -1270,16 +1276,45 @@ def generate_ddd_panel_data( f"got {n_units}." ) + # Stratified allocation that guarantees every (group, partition) cell receives + # at least one unit. Independent marginal sampling could collapse a cell when + # rounded marginals are small (e.g., n_units=4, group_frac=partition_frac=0.25 + # yields n_group_1=1 with no room for both partition values within group=1). + # TripleDifference.fit requires all 8 G×P×T cells, so we validate first. + n_group_1 = int(round(n_units * group_frac)) + n_group_0 = n_units - n_group_1 + n_p1_g0 = int(round(n_group_0 * partition_frac)) + n_p1_g1 = int(round(n_group_1 * partition_frac)) + n_p0_g0 = n_group_0 - n_p1_g0 + n_p0_g1 = n_group_1 - n_p1_g1 + cell_counts = { + (0, 0): n_p0_g0, + (0, 1): n_p1_g0, + (1, 0): n_p0_g1, + (1, 1): n_p1_g1, + } + if min(cell_counts.values()) < 1: + raise ValueError( + f"generate_ddd_panel_data requires every (group, partition) cell to " + f"contain at least one unit so TripleDifference.fit's 2x2x2 surface is " + f"populated. With n_units={n_units}, group_frac={group_frac}, " + f"partition_frac={partition_frac}, the rounded cell counts are " + f"(group, partition): (0,0)={n_p0_g0}, (0,1)={n_p1_g0}, " + f"(1,0)={n_p0_g1}, (1,1)={n_p1_g1}. Increase n_units or move " + f"group_frac/partition_frac closer to 0.5 so each cell receives " + f">=1 unit." + ) + rng = np.random.default_rng(seed) - n_group_1 = max(1, min(n_units - 1, int(round(n_units * group_frac)))) - n_partition_1 = max(1, min(n_units - 1, int(round(n_units * partition_frac)))) group = np.zeros(n_units, dtype=int) group_idx = rng.choice(n_units, size=n_group_1, replace=False) group[group_idx] = 1 partition = np.zeros(n_units, dtype=int) - partition_idx = rng.choice(n_units, size=n_partition_1, replace=False) - partition[partition_idx] = 1 + g0_units = np.where(group == 0)[0] + g1_units = np.where(group == 1)[0] + partition[rng.choice(g0_units, size=n_p1_g0, replace=False)] = 1 + partition[rng.choice(g1_units, size=n_p1_g1, replace=False)] = 1 unit_fe = rng.normal(0.0, unit_fe_sd, size=n_units) diff --git a/tests/test_prep.py b/tests/test_prep.py index 696f507ce..38607850e 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -1234,6 +1234,59 @@ def test_n_units_validation(self): with pytest.raises(ValueError, match="n_units"): generate_ddd_panel_data(n_units=3, seed=42) + def test_infeasible_cell_counts_raise(self): + """Configs that round to empty (group, partition) cells fail fast.""" + from diff_diff.prep import generate_ddd_panel_data + + # n_units=4, group_frac=0.25 → n_group_1=1; partition_frac=0.25 inside + # the 1-unit group=1 stratum rounds to 0, leaving the (1, 1) cell empty. + with pytest.raises(ValueError, match=r"every \(group, partition\) cell"): + generate_ddd_panel_data( + n_units=4, + group_frac=0.25, + partition_frac=0.25, + seed=42, + ) + # n_units=10, group_frac=0.1 → n_group_1=1 again; same failure mode for + # the (1, 1) cell. + with pytest.raises(ValueError, match=r"every \(group, partition\) cell"): + generate_ddd_panel_data( + n_units=10, + group_frac=0.1, + partition_frac=0.5, + seed=42, + ) + + def test_smallest_feasible_config_populates_all_cells(self): + """n_units=4 with fracs=0.5 yields one unit per (group, partition) cell.""" + from diff_diff import TripleDifference + from diff_diff.prep import generate_ddd_panel_data + + data = generate_ddd_panel_data( + n_units=4, + n_periods=4, + treatment_period=2, + group_frac=0.5, + partition_frac=0.5, + noise_sd=0.0, + seed=42, + ) + # All four (group, partition) cells must receive exactly one unit. + unit_cells = data.groupby("unit")[["group", "partition"]].first() + cell_counts = unit_cells.groupby(["group", "partition"]).size() + assert len(cell_counts) == 4 + assert (cell_counts == 1).all() + # TripleDifference.fit (which requires all 8 G×P×T cells) must succeed + # on the smallest feasible config — validates the public contract + # advertised in the docstring's compatibility paragraph. + TripleDifference().fit( + data, + outcome="outcome", + group="group", + partition="partition", + time="post", + ) + def test_reproducibility(self): """Same seed produces identical DataFrames.""" from diff_diff.prep import generate_ddd_panel_data From f564ce902ee82456c32d10a2cbf45d05bac4eb8c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 13:14:15 -0400 Subject: [PATCH 6/6] prep_dgp: document and test cluster="unit" pattern for panel DGP R3 review flagged that `generate_ddd_panel_data` advertises direct `TripleDifference().fit(..., time="post")` usage, but `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator whose default analytical SE treats each row as iid (df = n_obs - 8). With the new panel-shaped output (unit FE + within-unit serial correlation), unclustered SE understates variance and overstates power. Updates: - DGP docstring gains a `.. warning::` block stating the panel + cluster requirement; the `Examples` block now demonstrates `TripleDifference(cluster="unit").fit(...)` as the recommended pattern. - REGISTRY.md `## TripleDifference` SE section gains a "Note (panel-shaped input, `generate_ddd_panel_data`)" paragraph documenting the repeated-cross-section semantics and the cluster contract. - CHANGELOG `[Unreleased]` entry tightened to mention the cluster requirement explicitly. - New test `test_recommended_clustered_panel_path` locks the documented pattern: clustered fit succeeds, `n_clusters == n_units`, ATT point estimate is invariant to clustering, but SE differs between clustered and unclustered fits (within-unit correlation is non-trivial). Point estimate semantics unchanged. Fix is documentation + invariance test only. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- diff_diff/prep_dgp.py | 19 ++++++++++++++-- docs/methodology/REGISTRY.md | 10 ++++++++ tests/test_prep.py | 44 ++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 956365c86..223abd121 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. +- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference(cluster="unit").fit(..., time="post")` (the cluster kwarg is required because `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator and unclustered SE on panel-generated rows understates variance under within-unit serial correlation; the point estimate `att` is invariant to clustering — see the new `TripleDifference` REGISTRY note on panel-shaped input). Users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction. - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422. diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py index a178b0d02..36f2656d1 100644 --- a/diff_diff/prep_dgp.py +++ b/diff_diff/prep_dgp.py @@ -1178,6 +1178,19 @@ def generate_ddd_panel_data( making it suitable for panel-aware power-analysis simulations or sanity-checking estimators that ignore the panel dimension. + .. warning:: + + ``TripleDifference`` is a repeated-cross-section ``panel=FALSE`` + estimator: its analytical default treats each row as an + independent observation (df = n_obs - 8). When fitting against + ``generate_ddd_panel_data`` output, the within-unit serial + correlation makes unclustered SEs anti-conservative — they + understate sampling variability and overstate power. Always pass + ``cluster="unit"`` (Liang-Zeger CR1) when fitting on + panel-generated data; the point estimate ``att`` is invariant to + clustering but the inference contract is not. See the + ``TripleDifference`` REGISTRY entry for the clustering contract. + Parameters ---------- n_units : int, default=200 @@ -1253,10 +1266,12 @@ def generate_ddd_panel_data( >>> data.groupby('unit')['period'].count().eq(8).all() True - Fit with TripleDifference (note ``time="post"``): + Fit with TripleDifference. Note ``time="post"`` (the derived binary + indicator) and ``cluster="unit"`` (required for valid inference on + panel-generated data; see the warning above): >>> from diff_diff import TripleDifference - >>> result = TripleDifference().fit( + >>> result = TripleDifference(cluster="unit").fit( ... data, outcome='outcome', group='group', ... partition='partition', time='post', ... ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 4e1495789..6d64c37d0 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1747,6 +1747,16 @@ finite-sample adjustment. Note: IF-based SEs are inherently heteroskedasticity-robust; the `robust` parameter has no additional effect. +**Note (panel-shaped input, `generate_ddd_panel_data`):** `TripleDifference` is the +repeated-cross-section `panel=FALSE` implementation: the individual-level default SE +treats each row as an independent observation (df = n_obs - 8). When fitting against +panel-shaped data with repeated unit rows and within-unit serial correlation +(e.g., `generate_ddd_panel_data` output), unclustered SE understates sampling +variability and overstates power. Pass `cluster="unit"` to invoke the Liang-Zeger +CR1 path so the influence functions are aggregated within unit before variance +computation. The point estimate `att` is invariant to clustering, only the inference +contract changes. + *Edge cases:* - Propensity scores near 0/1: trimmed at `pscore_trim` (default 0.01) - Empty cells: raises ValueError with diagnostic message diff --git a/tests/test_prep.py b/tests/test_prep.py index 38607850e..408adc36a 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -1287,6 +1287,50 @@ def test_smallest_feasible_config_populates_all_cells(self): time="post", ) + def test_recommended_clustered_panel_path(self): + """Documented clustered-by-unit path produces n_clusters == n_units inference. + + Locks the docstring's recommended pattern (`cluster="unit"` for panel data) + against silent regression: TripleDifference is the repeated-cross-section + `panel=FALSE` estimator, so unclustered SE on panel-generated rows + understates variance. Clustering by `unit` aggregates per-row IFs within + unit before variance computation (Liang-Zeger CR1). + """ + from diff_diff import TripleDifference + from diff_diff.prep import generate_ddd_panel_data + + n_units = 200 + data = generate_ddd_panel_data( + n_units=n_units, + n_periods=8, + treatment_period=4, + seed=42, + ) + result = TripleDifference(cluster="unit").fit( + data, + outcome="outcome", + group="group", + partition="partition", + time="post", + ) + assert np.isfinite(result.att) + assert np.isfinite(result.se) + # Cluster-robust path records the number of clusters used. + assert result.n_clusters == n_units + # Unclustered fit on the same data MUST produce a different SE (clustering + # is materially different when within-unit serial correlation exists). + unclustered = TripleDifference().fit( + data, + outcome="outcome", + group="group", + partition="partition", + time="post", + ) + # Point estimate is invariant to clustering. + np.testing.assert_allclose(unclustered.att, result.att, atol=1e-10) + # SE differs because panel rows are not iid. + assert unclustered.se != result.se + def test_reproducibility(self): """Same seed produces identical DataFrames.""" from diff_diff.prep import generate_ddd_panel_data