From fa8be27b9dce9a3952d3e1e6d1b236ada13d3094 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 12:23:26 -0400
Subject: [PATCH 1/6] =?UTF-8?q?efficient=5Fdid:=20align=20REGISTRY=20note?=
 =?UTF-8?q?=20with=20last=5Fcohort=20=C3=97=20anticipation=20trim?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `control_group="last_cohort"` path in EfficientDiD.fit (line 470) trims
periods at `last_g - anticipation`, excluding anticipation-contaminated
periods from the pseudo-control's pre-treatment window. REGISTRY.md
previously described only the `anticipation=0` case. Aligns both the
Edge Cases bullet and the algorithm Note to the code's actual behavior,
cross-references the interaction in the `control_group` and `anticipation`
docstring entries, and adds a regression test asserting the trim cuts at
`last_g - anticipation` rather than `last_g`.

Closes Tier-A backlog item 2.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/efficient_did.py   | 12 ++++++---
 docs/methodology/REGISTRY.md |  4 +--
 tests/test_efficient_did.py  | 51 +++++++++++++++++++++++++++---------
 3 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
index 4dcb93e42..1f3e48cac 100644
--- a/diff_diff/efficient_did.py
+++ b/diff_diff/efficient_did.py
@@ -162,9 +162,11 @@ class EfficientDiD(EfficientDiDBootstrapMixin):
         Which units serve as the comparison group:
         ``"never_treated"`` requires a never-treated cohort (raises if
         none exist); ``"last_cohort"`` reclassifies the latest treatment
-        cohort as pseudo-never-treated and drops post-treatment periods
-        for that cohort.  Distinct from CallawaySantAnna's
-        ``"not_yet_treated"`` — see REGISTRY.md for details.
+        cohort as pseudo-never-treated and drops periods at
+        ``t >= last_g - anticipation`` so the pseudo-control's
+        pre-treatment window excludes anticipation-contaminated periods.
+        Distinct from CallawaySantAnna's ``"not_yet_treated"`` — see
+        REGISTRY.md for details.
     n_bootstrap : int, default 0
         Number of multiplier bootstrap iterations (0 = analytical only).
     bootstrap_weights : str, default ``"rademacher"``
@@ -173,7 +175,9 @@ class EfficientDiD(EfficientDiDBootstrapMixin):
         Random seed for reproducibility.
     anticipation : int, default 0
         Number of anticipation periods (shifts the effective treatment
-        boundary forward by this amount).
+        boundary forward by this amount). When combined with
+        ``control_group="last_cohort"``, also trims the pseudo-control
+        period set at ``t >= last_g - anticipation`` (see REGISTRY.md).
     sieve_k_max : int or None
         Maximum polynomial degree for sieve ratio estimation. None = auto
         (``min(floor(n_gp^{1/5}), 5)``). Only used with covariates.
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index d8bc1d0ed..4e1495789 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -877,7 +877,7 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`.
 - **Near-zero propensity scores**: Ratio `p_g(X)/p_{g'}(X)` explodes. Overlap assumption (O) rules this out in population; implement trimming or warn on finite-sample instability
 - **Note:** When no sieve degree K succeeds for ratio estimation (basis dimension exceeds comparison group size, or all linear systems are singular), the estimator falls back to a constant ratio of 1 for all units with a UserWarning. The outcome regression adjustment remains active, so the generated outcomes (Eq 4.4) still incorporate covariate information via the m_hat terms. The DR property ensures consistency as long as the outcome regression is correctly specified.
 - **Note:** When no sieve degree K succeeds for inverse propensity estimation (algorithm step 4), the estimator falls back to unconditional n/n_group scaling with a UserWarning, which reduces to the unconditional Omega* approximation for the affected group.
-- **All units eventually treated**: Last cohort serves as "never-treated" by dropping time periods from the last cohort's treatment onset onward. Use `control_group="last_cohort"` to enable; default `"never_treated"` raises ValueError if no never-treated units exist
+- **All units eventually treated**: Last cohort serves as "never-treated" by dropping time periods from `last_g - anticipation` onward (where `last_g` is the latest treatment cohort). This excludes anticipation-contaminated periods from the pseudo-control's pre-treatment window. With `anticipation=0` (default), this equals `last_g`. Use `control_group="last_cohort"` to enable; default `"never_treated"` raises ValueError if no never-treated units exist
 - **Negative weights**: Explicitly stated as harmless for bias and beneficial for precision; arise from efficiency optimization under overidentification (Section 5.2)
 - **PT-Post regime (just-identified)**: Under PT-Post, EDiD automatically reduces to standard single-baseline estimator (Corollary 3.2). No downside to using EDiD -- it subsumes standard estimators
 - **Duplicate rows**: Duplicate `(unit, time)` entries are rejected with `ValueError`. The estimator requires exactly one observation per unit-period
@@ -936,7 +936,7 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`.
 - **Note:** EfficientDiD covariates (DR path) with survey weights supported — WLS outcome regression, weighted sieve normal equations for propensity ratios/inverse propensities, survey-weighted Nadaraya-Watson kernel for conditional Omega*(X), and survey-weighted ATT averaging. Silverman bandwidth uses unweighted statistics (survey-weighted bandwidth deferred as second-order refinement).
 - **Note:** Cluster-robust SEs use the standard Liang-Zeger clustered sandwich estimator applied to EIF values: aggregate EIF within clusters, center, and compute variance with G/(G-1) small-sample correction. Cluster bootstrap generates multiplier weights at the cluster level (all units in a cluster share the same weight). Analytical clustered SEs are the default when `cluster` is set; cluster bootstrap is opt-in via `n_bootstrap > 0`.
 - **Note:** Hausman pretest operates on the post-treatment event-study vector ES(e) per Theorem A.1. Both PT-All and PT-Post fits are aggregated to ES(e) using cohort-size weights before computing the test statistic H = delta' V^{-1} delta where delta = ES_post - ES_all and V = Cov(ES_post) - Cov(ES_all). Covariance is computed from aggregated ES(e)-level EIF values. The variance-difference matrix V is inverted via Moore-Penrose pseudoinverse to handle finite-sample non-positive-definiteness. Effective rank of V (number of positive eigenvalues) is used as degrees of freedom.
-- **Note:** Last-cohort-as-control (`control_group="last_cohort"`) reclassifies the latest treatment cohort as pseudo-never-treated and drops time periods at/after that cohort's treatment start. This is distinct from CallawaySantAnna's `not_yet_treated` option which dynamically selects not-yet-treated units per (g,t) pair.
+- **Note:** Last-cohort-as-control (`control_group="last_cohort"`) reclassifies the latest treatment cohort as pseudo-never-treated and drops time periods at `t >= last_g - anticipation`, excluding anticipation-contaminated periods from the pseudo-control's pre-treatment window. This is distinct from CallawaySantAnna's `not_yet_treated` option which dynamically selects not-yet-treated units per (g,t) pair.
 
 ---
 
diff --git a/tests/test_efficient_did.py b/tests/test_efficient_did.py
index ddce55a63..3bd192fbc 100644
--- a/tests/test_efficient_did.py
+++ b/tests/test_efficient_did.py
@@ -1125,6 +1125,39 @@ def test_last_cohort_no_treated_raises(self):
         with pytest.raises(ValueError, match="No treated cohorts"):
             EfficientDiD(control_group="last_cohort").fit(df, "y", "unit", "time", "first_treat")
 
+    def test_last_cohort_with_anticipation_trims_at_last_g_minus_anticipation(self):
+        """last_cohort + anticipation>0 trims at `last_g - anticipation`, not `last_g`.
+
+        Regression guard for PR #230 deferral: the code at efficient_did.py:470 uses
+        `effective_last = last_g - self.anticipation` so anticipation-contaminated periods
+        are excluded from the pseudo-control's pre-treatment window. If a future change
+        reverts to `t < last_g`, this test will catch it by checking the trimmed
+        `time_periods` set exposed on EfficientDiDResults.
+        """
+        df = _make_staggered_panel(
+            n_per_group=60,
+            n_control=0,
+            groups=(3, 5, 7),
+            effects={3: 2.0, 5: 1.5, 7: 1.0},
+        )
+        # _make_staggered_panel default n_periods=7, last_g=7, times 1..7.
+        # anticipation=0: effective_last=7, time_periods=[1..6]
+        # anticipation=1: effective_last=6, time_periods=[1..5]
+        result_a0 = EfficientDiD(
+            pt_assumption="all", control_group="last_cohort", anticipation=0
+        ).fit(df, "y", "unit", "time", "first_treat")
+        result_a1 = EfficientDiD(
+            pt_assumption="all", control_group="last_cohort", anticipation=1
+        ).fit(df, "y", "unit", "time", "first_treat")
+
+        assert max(result_a0.time_periods) == 6
+        assert max(result_a1.time_periods) == 5
+        assert len(result_a1.time_periods) == len(result_a0.time_periods) - 1
+        assert np.isfinite(result_a0.overall_att)
+        assert np.isfinite(result_a1.overall_att)
+        assert 7 not in result_a0.groups
+        assert 7 not in result_a1.groups
+
     def test_last_cohort_aggregate_event_study(self):
         """last_cohort with aggregate='event_study' should produce finite results."""
         df = _make_staggered_panel(
@@ -2084,9 +2117,7 @@ def test_ratio_sieve_partial_skip_warns(self):
         with pytest.warns(UserWarning) as caught:
             ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3)
         assert np.all(np.isfinite(ratio))
-        partial_skip_msgs = [
-            str(w.message) for w in caught if "skipped K=" in str(w.message)
-        ]
+        partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)]
         assert partial_skip_msgs, (
             "Expected a partial-K-skip warning when some K's are rank deficient "
             "but at least one succeeds; got none."
@@ -2106,9 +2137,7 @@ def test_inverse_propensity_sieve_partial_skip_warns(self):
         with pytest.warns(UserWarning) as caught:
             s_hat = estimate_inverse_propensity_sieve(X, mask, k_max=3)
         assert np.all(np.isfinite(s_hat))
-        partial_skip_msgs = [
-            str(w.message) for w in caught if "skipped K=" in str(w.message)
-        ]
+        partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)]
         assert partial_skip_msgs
 
     def test_ratio_sieve_no_warning_when_no_skips(self):
@@ -2128,9 +2157,7 @@ def test_ratio_sieve_no_warning_when_no_skips(self):
             _w.simplefilter("always")
             ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3)
         assert np.all(np.isfinite(ratio))
-        partial_skip_msgs = [
-            str(w.message) for w in caught if "skipped K=" in str(w.message)
-        ]
-        assert partial_skip_msgs == [], (
-            f"Unexpected partial-skip warning on clean data: {partial_skip_msgs}"
-        )
+        partial_skip_msgs = [str(w.message) for w in caught if "skipped K=" in str(w.message)]
+        assert (
+            partial_skip_msgs == []
+        ), f"Unexpected partial-skip warning on clean data: {partial_skip_msgs}"

From 77fe4daa05e3dabc0ea0acb19e2295bc8cc5b0a2 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 12:23:37 -0400
Subject: [PATCH 2/6] prep_dgp: add generate_ddd_panel_data for panel DDD power
 analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new public DGP function that generates a balanced panel of n_units
observed over n_periods with two unit-level binary dimensions (`group`,
`partition`, time-invariant) and a derived `post` indicator. The DDD-CPT
identifying assumption holds because `group_partition_interaction` enters
only as a unit-level (time-invariant) effect, leaving the triple
interaction `treatment_effect * group * partition * post` as the sole
source of differential group × partition trend.

The existing cross-sectional `generate_ddd_data` remains unchanged.
Compatible with `TripleDifference.fit(..., time="post")` directly; the
binary 2×2×2 estimator surface is unchanged. Auto-routing of
`power.simulate_power` to the panel DGP for `n_periods > 2` is deferred
to a follow-up (TODO.md row added).

Exports: top-level `diff_diff` and `diff_diff.prep` re-export; new
autofunction stub in docs/api/prep.rst. Tests in
tests/test_prep.py::TestGenerateDddPanelData (14 tests) including a
deterministic recovery test (noise_sd=0, ATT recovery to ~1e-15) and a
finite-sample recovery test.

Closes Tier-A backlog item 3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md          |   1 +
 diff_diff/__init__.py |   2 +
 diff_diff/prep.py     |   1 +
 diff_diff/prep_dgp.py | 207 ++++++++++++++++++++++++++++-
 docs/api/prep.rst     |   7 +
 tests/test_prep.py    | 296 ++++++++++++++++++++++++++++++++++--------
 6 files changed, 459 insertions(+), 55 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 964e35b9b..19003f9d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 14 tests). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md.
 - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row).
 - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_<ring_label>"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction.
 - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422.
diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py
index cee940432..a0f2a38ab 100644
--- a/diff_diff/__init__.py
+++ b/diff_diff/__init__.py
@@ -125,6 +125,7 @@
     generate_continuous_did_data,
     generate_did_data,
     generate_ddd_data,
+    generate_ddd_panel_data,
     generate_event_study_data,
     generate_factor_data,
     generate_panel_data,
@@ -409,6 +410,7 @@
     "generate_staggered_data",
     "generate_factor_data",
     "generate_ddd_data",
+    "generate_ddd_panel_data",
     "generate_panel_data",
     "generate_event_study_data",
     "generate_staggered_ddd_data",
diff --git a/diff_diff/prep.py b/diff_diff/prep.py
index 3c47832c7..1f1048a6e 100644
--- a/diff_diff/prep.py
+++ b/diff_diff/prep.py
@@ -19,6 +19,7 @@
 from diff_diff.prep_dgp import (  # noqa: F401
     generate_continuous_did_data,
     generate_ddd_data,
+    generate_ddd_panel_data,
     generate_did_data,
     generate_event_study_data,
     generate_factor_data,
diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py
index 45377b87b..7e271c72f 100644
--- a/diff_diff/prep_dgp.py
+++ b/diff_diff/prep_dgp.py
@@ -1129,6 +1129,209 @@ def generate_staggered_ddd_data(
     return pd.DataFrame(records)
 
 
+def generate_ddd_panel_data(
+    n_units: int = 200,
+    n_periods: int = 8,
+    treatment_period: int = 4,
+    group_frac: float = 0.5,
+    partition_frac: float = 0.5,
+    treatment_effect: float = 2.0,
+    group_effect: float = 2.0,
+    partition_effect: float = 1.0,
+    time_effect: float = 0.5,
+    group_time_interaction: float = 1.0,
+    partition_time_interaction: float = 0.5,
+    group_partition_interaction: float = 1.5,
+    unit_fe_sd: float = 1.5,
+    noise_sd: float = 1.0,
+    add_covariates: bool = False,
+    seed: Optional[int] = None,
+) -> pd.DataFrame:
+    """
+    Generate synthetic panel data for Triple Difference (DDD) power analysis.
+
+    Creates a balanced panel of n_units observed over n_periods with two
+    time-invariant binary dimensions (``group`` and ``partition``) and a
+    derived binary ``post`` indicator. The triple-interaction effect
+    (``group * partition * post``) is the identifying ATT under DDD-CPT.
+
+    The DGP equation is::
+
+        Y_{i,t} = unit_fe_i
+                + group_i        * group_effect
+                + partition_i    * partition_effect
+                + post_t         * time_effect
+                + (group_i * partition_i)  * group_partition_interaction
+                + (group_i * post_t)       * group_time_interaction
+                + (partition_i * post_t)   * partition_time_interaction
+                + treatment_effect * group_i * partition_i * post_t
+                + epsilon_{i,t}
+
+    where ``group_i`` and ``partition_i`` are unit-level (constant in t)
+    and ``post_t = 1[period >= treatment_period]``. DDD-CPT identification
+    holds because ``group_partition_interaction`` enters only as a
+    unit-level (time-invariant) effect, leaving the triple-interaction as
+    the sole source of differential group × partition trend.
+
+    Unlike the cross-sectional ``generate_ddd_data``, this DGP provides
+    panel-realistic unit fixed effects and within-unit serial structure,
+    making it suitable for panel-aware power-analysis simulations or
+    sanity-checking estimators that ignore the panel dimension.
+
+    Parameters
+    ----------
+    n_units : int, default=200
+        Number of units in the panel.
+    n_periods : int, default=8
+        Number of time periods.
+    treatment_period : int, default=4
+        Period (0-indexed) at which ``post`` switches from 0 to 1.
+        Must satisfy ``1 <= treatment_period < n_periods``.
+    group_frac : float, default=0.5
+        Fraction of units with ``group=1``. Must be in ``(0, 1)``.
+    partition_frac : float, default=0.5
+        Fraction of units with ``partition=1`` (assigned independently of
+        ``group``). Must be in ``(0, 1)``.
+    treatment_effect : float, default=2.0
+        True ATT for the triple-interaction cell (group=1, partition=1,
+        post=1).
+    group_effect : float, default=2.0
+        Main effect of ``group=1`` (unit-level).
+    partition_effect : float, default=1.0
+        Main effect of ``partition=1`` (unit-level).
+    time_effect : float, default=0.5
+        Main effect of ``post=1`` (time-level).
+    group_time_interaction : float, default=1.0
+        Coefficient on ``group * post`` (differential trend for the group
+        dimension).
+    partition_time_interaction : float, default=0.5
+        Coefficient on ``partition * post`` (differential trend for the
+        partition dimension).
+    group_partition_interaction : float, default=1.5
+        Coefficient on the unit-level ``group * partition`` interaction.
+        Must be time-invariant for DDD-CPT to hold.
+    unit_fe_sd : float, default=1.5
+        Standard deviation of the unit fixed effect.
+    noise_sd : float, default=1.0
+        Standard deviation of the idiosyncratic noise term.
+    add_covariates : bool, default=False
+        If True, add unit-level covariates ``x1`` (continuous) and ``x2``
+        (binary) that affect the outcome.
+    seed : int, optional
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    pd.DataFrame
+        Long-format panel with columns:
+
+        - ``unit``: integer unit ID.
+        - ``period``: integer time period (0-indexed).
+        - ``outcome``: outcome variable.
+        - ``group``: unit-level binary group indicator (time-invariant).
+        - ``partition``: unit-level binary partition indicator
+          (time-invariant, orthogonal to group).
+        - ``post``: binary indicator, ``1`` if ``period >= treatment_period``.
+        - ``treated``: ``group * partition * post`` (binary).
+        - ``true_effect``: ``treatment_effect`` when treated, else 0.
+        - ``x1``, ``x2``: optional unit-level covariates (only if
+          ``add_covariates=True``).
+
+    Examples
+    --------
+    Generate a balanced panel with default parameters:
+
+    >>> data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42)
+    >>> data.shape
+    (1600, 8)
+    >>> data.groupby('unit')['period'].count().eq(8).all()
+    True
+
+    Fit with TripleDifference (note ``time="post"``):
+
+    >>> from diff_diff import TripleDifference
+    >>> result = TripleDifference().fit(
+    ...     data, outcome='outcome', group='group',
+    ...     partition='partition', time='post',
+    ... )
+    """
+    if not (1 <= treatment_period < n_periods):
+        raise ValueError(
+            f"treatment_period must satisfy 1 <= treatment_period < n_periods; "
+            f"got treatment_period={treatment_period}, n_periods={n_periods}."
+        )
+    if not (0.0 < group_frac < 1.0):
+        raise ValueError(f"group_frac must be in (0, 1); got {group_frac}.")
+    if not (0.0 < partition_frac < 1.0):
+        raise ValueError(f"partition_frac must be in (0, 1); got {partition_frac}.")
+    if n_units < 4:
+        raise ValueError(
+            f"n_units must be >= 4 to populate all four (group, partition) cells; "
+            f"got {n_units}."
+        )
+
+    rng = np.random.default_rng(seed)
+
+    n_group_1 = max(1, min(n_units - 1, int(round(n_units * group_frac))))
+    n_partition_1 = max(1, min(n_units - 1, int(round(n_units * partition_frac))))
+    group = np.zeros(n_units, dtype=int)
+    group_idx = rng.choice(n_units, size=n_group_1, replace=False)
+    group[group_idx] = 1
+    partition = np.zeros(n_units, dtype=int)
+    partition_idx = rng.choice(n_units, size=n_partition_1, replace=False)
+    partition[partition_idx] = 1
+
+    unit_fe = rng.normal(0.0, unit_fe_sd, size=n_units)
+
+    if add_covariates:
+        x1 = rng.normal(0.0, 1.0, size=n_units)
+        x2 = rng.choice([0, 1], size=n_units)
+    else:
+        x1 = None
+        x2 = None
+
+    records = []
+    for i in range(n_units):
+        g_i = int(group[i])
+        p_i = int(partition[i])
+        for t in range(n_periods):
+            post_t = 1 if t >= treatment_period else 0
+            y = (
+                unit_fe[i]
+                + g_i * group_effect
+                + p_i * partition_effect
+                + post_t * time_effect
+                + (g_i * p_i) * group_partition_interaction
+                + (g_i * post_t) * group_time_interaction
+                + (p_i * post_t) * partition_time_interaction
+            )
+            treated = int(g_i == 1 and p_i == 1 and post_t == 1)
+            true_eff = treatment_effect if treated else 0.0
+            if treated:
+                y += treatment_effect
+            if add_covariates:
+                y += 0.5 * x1[i] + 0.3 * x2[i]
+            y += rng.normal(0.0, noise_sd)
+
+            row = {
+                "unit": i,
+                "period": t,
+                "outcome": float(y),
+                "group": g_i,
+                "partition": p_i,
+                "post": post_t,
+                "treated": treated,
+                "true_effect": float(true_eff),
+            }
+            if add_covariates:
+                row["x1"] = float(x1[i])
+                row["x2"] = int(x2[i])
+
+            records.append(row)
+
+    return pd.DataFrame(records)
+
+
 def _rank_pair_weights(
     unit_weight: np.ndarray,
     unit_stratum: np.ndarray,
@@ -1432,9 +1635,7 @@ def generate_survey_did_data(
         raise ValueError("te_covariate_interaction requires add_covariates=True")
 
     if not np.isfinite(conditional_pt):
-        raise ValueError(
-            f"conditional_pt must be finite, got {conditional_pt}"
-        )
+        raise ValueError(f"conditional_pt must be finite, got {conditional_pt}")
     if conditional_pt != 0.0 and not add_covariates:
         raise ValueError("conditional_pt requires add_covariates=True")
     if conditional_pt != 0.0:
diff --git a/docs/api/prep.rst b/docs/api/prep.rst
index 555100d6d..e01c80b4d 100644
--- a/docs/api/prep.rst
+++ b/docs/api/prep.rst
@@ -70,6 +70,13 @@ Generate synthetic Triple Difference data.
 
 .. autofunction:: diff_diff.generate_ddd_data
 
+generate_ddd_panel_data
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Generate synthetic panel-structured Triple Difference data for power analysis.
+
+.. autofunction:: diff_diff.generate_ddd_panel_data
+
 generate_factor_data
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/tests/test_prep.py b/tests/test_prep.py
index 0e4bf5d39..696f507ce 100644
--- a/tests/test_prep.py
+++ b/tests/test_prep.py
@@ -792,9 +792,9 @@ def test_extreme_Y_scale_synthetic_weight_column(self):
         weights = result["synthetic_weight"].to_numpy()
         # Valid simplex: non-negative, sums to 1.
         assert np.all(weights >= 0), "synthetic_weight must be non-negative"
-        assert abs(weights.sum() - 1.0) < 1e-10, (
-            f"synthetic_weight should sum to 1.0, got {weights.sum()}"
-        )
+        assert (
+            abs(weights.sum() - 1.0) < 1e-10
+        ), f"synthetic_weight should sum to 1.0, got {weights.sum()}"
         # Non-degenerate: at least 2 controls receive non-trivial weight.
         # This guards the Rust-PGD collapse-to-one-vertex bug that
         # previously fired at Y ~ 1e9 under the deleted wrapper.
@@ -833,9 +833,7 @@ def test_synthetic_weight_column_with_lambda_reg(self):
 
         for w, label in [(w_unreg, "unregularized"), (w_reg, "regularized")]:
             assert np.all(w >= 0), f"{label} weights must be non-negative"
-            assert abs(w.sum() - 1.0) < 1e-10, (
-                f"{label} weights should sum to 1.0, got {w.sum()}"
-            )
+            assert abs(w.sum() - 1.0) < 1e-10, f"{label} weights should sum to 1.0, got {w.sum()}"
 
         # Regularization should increase entropy (pull toward uniform) or at
         # least not collapse the simplex — a valid regularized solution must
@@ -1138,6 +1136,178 @@ def test_reproducibility(self):
         pd.testing.assert_frame_equal(data1, data2)
 
 
+class TestGenerateDddPanelData:
+    """Tests for generate_ddd_panel_data function (panel-structured DDD DGP)."""
+
+    def test_shape(self):
+        """len(data) == n_units * n_periods (balanced panel)."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42)
+        assert len(data) == 200 * 8
+        expected_cols = {
+            "unit",
+            "period",
+            "outcome",
+            "group",
+            "partition",
+            "post",
+            "treated",
+            "true_effect",
+        }
+        assert expected_cols.issubset(set(data.columns))
+
+    def test_balanced_cells(self):
+        """Each (group, partition) cell has roughly n_units * group_frac * partition_frac units."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(
+            n_units=400, n_periods=8, group_frac=0.5, partition_frac=0.5, seed=42
+        )
+        # Cell counts measured at unit level (not row level)
+        unit_cells = data.groupby("unit")[["group", "partition"]].first()
+        cell_counts = unit_cells.groupby(["group", "partition"]).size()
+        assert len(cell_counts) == 4
+        expected_per_cell = 400 * 0.5 * 0.5  # = 100
+        for count in cell_counts.values:
+            assert abs(count - expected_per_cell) <= 2
+
+    def test_treatment_effect_location(self):
+        """true_effect != 0 only when group==1 AND partition==1 AND post==1."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(
+            n_units=200, n_periods=8, treatment_period=4, treatment_effect=3.0, seed=42
+        )
+        treated_mask = (data["group"] == 1) & (data["partition"] == 1) & (data["post"] == 1)
+        assert (data.loc[treated_mask, "true_effect"] == 3.0).all()
+        assert (data.loc[~treated_mask, "true_effect"] == 0.0).all()
+        assert (data["treated"] == treated_mask.astype(int)).all()
+
+    def test_panel_structure(self):
+        """Each unit observed in exactly n_periods periods."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(n_units=100, n_periods=6, seed=42)
+        assert data.groupby("unit")["period"].count().eq(6).all()
+
+    def test_post_derived_from_period(self):
+        """post is exactly 1[period >= treatment_period]."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=3, seed=42)
+        expected_post = (data["period"] >= 3).astype(int)
+        assert (data["post"] == expected_post).all()
+
+    def test_group_partition_time_invariant(self):
+        """group and partition are unit-level (constant within unit) — DDD-CPT requirement."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(n_units=200, n_periods=8, seed=42)
+        assert data.groupby("unit")["group"].nunique().eq(1).all()
+        assert data.groupby("unit")["partition"].nunique().eq(1).all()
+
+    def test_treatment_period_validation(self):
+        """treatment_period must satisfy 1 <= treatment_period < n_periods."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        with pytest.raises(ValueError, match="treatment_period"):
+            generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=0, seed=42)
+        with pytest.raises(ValueError, match="treatment_period"):
+            generate_ddd_panel_data(n_units=100, n_periods=8, treatment_period=8, seed=42)
+
+    def test_group_frac_validation(self):
+        """group_frac and partition_frac must be strictly in (0, 1)."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        with pytest.raises(ValueError, match="group_frac"):
+            generate_ddd_panel_data(n_units=100, group_frac=0.0, seed=42)
+        with pytest.raises(ValueError, match="group_frac"):
+            generate_ddd_panel_data(n_units=100, group_frac=1.0, seed=42)
+        with pytest.raises(ValueError, match="partition_frac"):
+            generate_ddd_panel_data(n_units=100, partition_frac=0.0, seed=42)
+
+    def test_n_units_validation(self):
+        """n_units must be >= 4."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        with pytest.raises(ValueError, match="n_units"):
+            generate_ddd_panel_data(n_units=3, seed=42)
+
+    def test_reproducibility(self):
+        """Same seed produces identical DataFrames."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data1 = generate_ddd_panel_data(n_units=100, n_periods=6, seed=123)
+        data2 = generate_ddd_panel_data(n_units=100, n_periods=6, seed=123)
+        pd.testing.assert_frame_equal(data1, data2)
+
+    def test_ddd_effect_recovery_deterministic(self):
+        """noise_sd=0 deterministic recovery within 1e-6 (catches DGP transcription bugs)."""
+        from diff_diff import TripleDifference
+        from diff_diff.prep import generate_ddd_panel_data
+
+        true_effect = 2.0
+        data = generate_ddd_panel_data(
+            n_units=400,
+            n_periods=8,
+            treatment_period=4,
+            treatment_effect=true_effect,
+            noise_sd=0.0,
+            seed=42,
+        )
+        result = TripleDifference().fit(
+            data,
+            outcome="outcome",
+            group="group",
+            partition="partition",
+            time="post",
+        )
+        assert abs(result.att - true_effect) < 1e-6
+
+    def test_ddd_effect_recovery(self):
+        """Finite-sample recovery within tolerance under default noise."""
+        from diff_diff import TripleDifference
+        from diff_diff.prep import generate_ddd_panel_data
+
+        true_effect = 2.0
+        data = generate_ddd_panel_data(
+            n_units=1000,
+            n_periods=8,
+            treatment_period=4,
+            treatment_effect=true_effect,
+            seed=42,
+        )
+        result = TripleDifference().fit(
+            data,
+            outcome="outcome",
+            group="group",
+            partition="partition",
+            time="post",
+        )
+        assert abs(result.att - true_effect) < 0.7
+
+    def test_covariates_added(self):
+        """add_covariates=True adds x1 and x2 columns."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(
+            n_units=100, n_periods=6, treatment_period=3, add_covariates=True, seed=42
+        )
+        assert "x1" in data.columns
+        assert "x2" in data.columns
+
+    def test_covariates_omitted(self):
+        """add_covariates=False omits x1 and x2 columns."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(
+            n_units=100, n_periods=6, treatment_period=3, add_covariates=False, seed=42
+        )
+        assert "x1" not in data.columns
+        assert "x2" not in data.columns
+
+
 class TestGeneratePanelData:
     """Tests for generate_panel_data function."""
 
@@ -2095,20 +2265,27 @@ def test_conditional_pt_requires_both_groups(self):
         # Zero never-treated (exact)
         with pytest.raises(ValueError, match="conditional_pt requires at least one"):
             generate_survey_did_data(
-                add_covariates=True, conditional_pt=0.3,
-                never_treated_frac=0.0, seed=42,
+                add_covariates=True,
+                conditional_pt=0.3,
+                never_treated_frac=0.0,
+                seed=42,
             )
         # Small fraction that floors to zero never-treated units
         with pytest.raises(ValueError, match="conditional_pt requires at least one"):
             generate_survey_did_data(
-                n_units=50, add_covariates=True, conditional_pt=0.3,
-                never_treated_frac=0.01, seed=42,
+                n_units=50,
+                add_covariates=True,
+                conditional_pt=0.3,
+                never_treated_frac=0.01,
+                seed=42,
             )
         # All never-treated (no ever-treated units)
         with pytest.raises(ValueError, match="conditional_pt requires at least one"):
             generate_survey_did_data(
-                add_covariates=True, conditional_pt=0.3,
-                never_treated_frac=1.0, seed=42,
+                add_covariates=True,
+                conditional_pt=0.3,
+                never_treated_frac=1.0,
+                seed=42,
             )
 
     def test_conditional_pt_requires_covariates(self):
@@ -2123,13 +2300,9 @@ def test_conditional_pt_nonfinite_rejected(self):
         from diff_diff.prep_dgp import generate_survey_did_data
 
         with pytest.raises(ValueError, match="conditional_pt must be finite"):
-            generate_survey_did_data(
-                add_covariates=True, conditional_pt=np.inf, seed=42
-            )
+            generate_survey_did_data(add_covariates=True, conditional_pt=np.inf, seed=42)
         with pytest.raises(ValueError, match="conditional_pt must be finite"):
-            generate_survey_did_data(
-                add_covariates=True, conditional_pt=np.nan, seed=42
-            )
+            generate_survey_did_data(add_covariates=True, conditional_pt=np.nan, seed=42)
 
     def test_conditional_pt_x1_distribution_shift(self):
         """Treated units should have higher x1 when conditional_pt is active."""
@@ -2295,9 +2468,7 @@ def test_conditional_pt_crosssection_conditional_did(self):
         uncond_did = abs(beta_uncond[3])
 
         # Conditional DID: add x1 and x1*post
-        X_cond = np.column_stack([
-            np.ones(n), treated, post, treated_post, x1, x1_post
-        ])
+        X_cond = np.column_stack([np.ones(n), treated, post, treated_post, x1, x1_post])
         beta_cond = np.linalg.lstsq(X_cond, y, rcond=None)[0]
         cond_did = abs(beta_cond[3])
 
@@ -2311,9 +2482,7 @@ def test_conditional_pt_backward_compatible(self):
         """conditional_pt=0.0 should produce identical output to default."""
         from diff_diff.prep_dgp import generate_survey_did_data
 
-        df_default = generate_survey_did_data(
-            n_units=100, add_covariates=True, seed=99
-        )
+        df_default = generate_survey_did_data(n_units=100, add_covariates=True, seed=99)
         df_explicit = generate_survey_did_data(
             n_units=100, add_covariates=True, conditional_pt=0.0, seed=99
         )
@@ -2380,9 +2549,7 @@ def test_conditional_pt_panel_and_crosssection(self):
             p1 = df[df["period"] == 1]
             x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].mean()
             x1_control = p1.loc[p1["first_treat"] == 0, "x1"].mean()
-            assert x1_treated > x1_control, (
-                f"panel={panel_mode}: treated x1 not shifted"
-            )
+            assert x1_treated > x1_control, f"panel={panel_mode}: treated x1 not shifted"
 
 
 class TestAggregateSurvey:
@@ -3355,9 +3522,7 @@ def test_pweight_values(self, micro_data, design):
         for state in panel["state"].unique():
             state_rows = panel[panel["state"] == state]
             expected_weight = state_rows["cell_sum_w"].mean()
-            np.testing.assert_allclose(
-                state_rows["y_weight"].values, expected_weight, rtol=1e-10
-            )
+            np.testing.assert_allclose(state_rows["y_weight"].values, expected_weight, rtol=1e-10)
             # Must be constant within unit
             assert state_rows["y_weight"].nunique() == 1
 
@@ -3459,15 +3624,21 @@ def test_pweight_callaway_santanna_integration(self):
                 n_resp = rng.randint(15, 25)  # varying respondents per cell
                 te = 2.0 if (first_treats[geo] > 0 and period >= first_treats[geo]) else 0.0
                 for _ in range(n_resp):
-                    rows.append({
-                        "geo": geo, "period": period,
-                        "wt": rng.uniform(0.5, 3.0),
-                        "y": rng.normal(10 + te, 2),
-                    })
+                    rows.append(
+                        {
+                            "geo": geo,
+                            "period": period,
+                            "wt": rng.uniform(0.5, 3.0),
+                            "y": rng.normal(10 + te, 2),
+                        }
+                    )
         micro = pd.DataFrame(rows)
         design = SurveyDesign(weights="wt")
         panel, stage2 = aggregate_survey(
-            micro, by=["geo", "period"], outcomes="y", survey_design=design,
+            micro,
+            by=["geo", "period"],
+            outcomes="y",
+            survey_design=design,
         )
         assert stage2.weight_type == "pweight"
 
@@ -3475,14 +3646,18 @@ def test_pweight_callaway_santanna_integration(self):
         # but y_weight must be unit-constant
         for geo in panel["geo"].unique():
             geo_rows = panel[panel["geo"] == geo]
-            assert geo_rows["y_weight"].nunique() == 1, (
-                f"Geo {geo}: y_weight not constant within unit"
-            )
+            assert (
+                geo_rows["y_weight"].nunique() == 1
+            ), f"Geo {geo}: y_weight not constant within unit"
 
         panel["first_treat"] = panel["geo"].map(first_treats)
         result = CallawaySantAnna().fit(
-            panel, outcome="y_mean", unit="geo", time="period",
-            first_treat="first_treat", survey_design=stage2,
+            panel,
+            outcome="y_mean",
+            unit="geo",
+            time="period",
+            first_treat="first_treat",
+            survey_design=stage2,
         )
         assert np.isfinite(result.overall_att), f"ATT not finite: {result.overall_att}"
         assert result.overall_se > 0, f"SE not positive: {result.overall_se}"
@@ -3546,7 +3721,10 @@ def test_pweight_retains_zero_precision_geo(self):
 
         # Pweight mode: state 0 retained (cell_sum_w > 0 despite NaN precision)
         panel_p, _ = aggregate_survey(
-            data, by=["state", "period"], outcomes="y", survey_design=design,
+            data,
+            by=["state", "period"],
+            outcomes="y",
+            survey_design=design,
             second_stage_weights="pweight",
         )
         assert 0 in panel_p["state"].values
@@ -3555,7 +3733,10 @@ def test_pweight_retains_zero_precision_geo(self):
         # Aweight mode: state 0 dropped (all precision NaN -> weight 0)
         with pytest.warns(UserWarning, match="zero total weight"):
             panel_a, _ = aggregate_survey(
-                data, by=["state", "period"], outcomes="y", survey_design=design,
+                data,
+                by=["state", "period"],
+                outcomes="y",
+                survey_design=design,
                 second_stage_weights="aweight",
             )
         assert 0 not in panel_a["state"].values
@@ -3584,9 +3765,7 @@ def _build_microdata(self, mode, seed=42):
         n = len(state)
         wt = rng.uniform(0.5, 2.5, n)
         y = rng.normal(5.0, 1.5, n)
-        df_base = pd.DataFrame(
-            {"state": state, "year": year, "wt": wt, "y": y}
-        )
+        df_base = pd.DataFrame({"state": state, "year": year, "wt": wt, "y": y})
 
         if mode == "stratified_fpc":
             df = df_base.copy()
@@ -3643,7 +3822,10 @@ def _build_microdata(self, mode, seed=42):
             df["psu"] = psu
             policy = mode.split("_", 1)[1]
             sd = SurveyDesign(
-                weights="wt", strata="stratum", psu="psu", lonely_psu=policy,
+                weights="wt",
+                strata="stratum",
+                psu="psu",
+                lonely_psu=policy,
             )
             return df, sd
 
@@ -3660,8 +3842,10 @@ def _assert_panels_equivalent(p_fast, p_legacy, outcome="y"):
             nan_a, nan_b = np.isnan(a), np.isnan(b)
             assert np.array_equal(nan_a, nan_b), f"NaN pattern mismatch in {col}"
             np.testing.assert_allclose(
-                a[~nan_a], b[~nan_b],
-                atol=1e-14, rtol=1e-14,
+                a[~nan_a],
+                b[~nan_b],
+                atol=1e-14,
+                rtol=1e-14,
                 err_msg=f"{col} diverges between fast and legacy paths",
             )
 
@@ -3685,16 +3869,24 @@ def test_fast_path_equals_legacy(self, mode, monkeypatch):
 
         data, sd = self._build_microdata(mode)
         panel_fast, _ = aggregate_survey(
-            data, by=["state", "year"], outcomes="y", survey_design=sd,
+            data,
+            by=["state", "year"],
+            outcomes="y",
+            survey_design=sd,
         )
         # Force the legacy code path by disabling the scaffolding precompute.
         # _cell_mean_variance falls back to compute_survey_if_variance when
         # scaffolding is None.
         monkeypatch.setattr(
-            prep, "_precompute_psu_scaffolding", lambda resolved: None,
+            prep,
+            "_precompute_psu_scaffolding",
+            lambda resolved: None,
         )
         panel_legacy, _ = aggregate_survey(
-            data, by=["state", "year"], outcomes="y", survey_design=sd,
+            data,
+            by=["state", "year"],
+            outcomes="y",
+            survey_design=sd,
         )
         self._assert_panels_equivalent(panel_fast, panel_legacy)
 

From a897e3a9046e94e5b0e924f04d7aeae6e5cde9bc Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 12:23:50 -0400
Subject: [PATCH 3/6] trop: extract shared data-setup helper from fit() and
 _fit_global()

`TROP.fit()` (local path) and `_fit_global()` previously duplicated ~85
lines of data-setup logic each: panel pivoting, absorbing-state
validation, treated/control unit identification, first-treatment-period
detection, and pre/post period counting. The duplicated blocks were
near-identical line-by-line, differing only in which index mappings
the caller built (local built all four; global built only the forward
maps).

Extracts `_setup_trop_data(...)` in trop_local.py (alongside the existing
`_validate_and_pivot_treatment` helper). Both callers now invoke it and
unpack only the fields they consume. The helper returns all four index
mappings (`unit_to_idx`, `period_to_idx`, `idx_to_unit`, `idx_to_period`)
uniformly, eliminating a class of subtle drift bug; `_fit_global` gains
two unused locals as a trade-off.

The global-method-specific staggered-adoption check stays in
`_fit_global` as a post-helper validation (it depends on estimator
semantics, not data preparation). Bootstrap-loop dedup (~40 LoC across
`_bootstrap_variance` / `_bootstrap_variance_global`) is deferred to a
follow-up (TODO.md row added).

Adds a parity regression test `TestTROPModuleSplit::test_setup_trop_data_internal_contract`
that round-trip-verifies the index mappings, shape consistency, and
treated/control partition disjointness.

Behavior-preserving: TROP test suite (84 non-slow tests) is the safety
net.

Closes Tier-A backlog item 4.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/trop.py        | 115 +++++++-----------------------------
 diff_diff/trop_global.py |  93 ++++++-----------------------
 diff_diff/trop_local.py  | 123 +++++++++++++++++++++++++++++++++++++++
 tests/test_trop.py       |  43 ++++++++++++++
 4 files changed, 206 insertions(+), 168 deletions(-)

diff --git a/diff_diff/trop.py b/diff_diff/trop.py
index d4bd1840a..5c60dc15f 100644
--- a/diff_diff/trop.py
+++ b/diff_diff/trop.py
@@ -31,7 +31,7 @@
     _rust_loocv_grid_search,
 )
 from diff_diff.trop_global import TROPGlobalMixin
-from diff_diff.trop_local import TROPLocalMixin, _validate_and_pivot_treatment
+from diff_diff.trop_local import TROPLocalMixin, _setup_trop_data
 from diff_diff.trop_results import (
     _LAMBDA_INF,
     _PrecomputedStructures,
@@ -432,7 +432,6 @@ def fit(
 
         # Resolve survey design
         from diff_diff.survey import (
-            _extract_unit_survey_weights,
             _resolve_survey_for_fit,
             _validate_unit_constant_survey,
         )
@@ -470,94 +469,21 @@ def fit(
             )
 
         # Below is the local method (default)
-        # Get unique units and periods
-        all_units = sorted(data[unit].unique())
-
-        # Extract unit-level survey weights
-        if resolved_survey is not None:
-            unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units)
-        else:
-            unit_weight_arr = None
-        all_periods = sorted(data[time].unique())
-
-        n_units = len(all_units)
-        n_periods = len(all_periods)
-
-        # Create mappings
-        unit_to_idx = {u: i for i, u in enumerate(all_units)}
-        period_to_idx = {p: i for i, p in enumerate(all_periods)}
-        idx_to_unit = {i: u for u, i in unit_to_idx.items()}
-        idx_to_period = {i: p for p, i in period_to_idx.items()}
-
-        # Create outcome matrix Y (n_periods x n_units) and treatment matrix D
-        # Vectorized: use pivot for O(1) reshaping instead of O(n) iterrows loop
-        Y = (
-            data.pivot(index=time, columns=unit, values=outcome)
-            .reindex(index=all_periods, columns=all_units)
-            .values
+        _ctx = _setup_trop_data(
+            data, outcome, treatment, unit, time, resolved_survey, survey_design
         )
-
-        # For D matrix, validate observed treatment and handle unbalanced panels
-        D, missing_mask = _validate_and_pivot_treatment(
-            data, time, unit, treatment, all_periods, all_units
-        )
-
-        # Validate D is monotonic non-decreasing per unit (absorbing state)
-        # D[t, i] must satisfy: once D=1, it must stay 1 for all subsequent periods
-        # Issue 3 fix (round 10): Check each unit's OBSERVED D sequence for monotonicity
-        # This catches 1->0 violations that span missing period gaps
-        # Example: D[2]=1, missing [3,4], D[5]=0 is a real violation even though
-        # adjacent period transitions don't show it (the gap hides the transition)
-        violating_units = []
-        for unit_idx in range(n_units):
-            # Get observed D values for this unit (where not missing)
-            observed_mask = ~missing_mask[:, unit_idx]
-            observed_d = D[observed_mask, unit_idx]
-
-            # Check if observed sequence is monotonically non-decreasing
-            if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0):
-                violating_units.append(all_units[unit_idx])
-
-        if violating_units:
-            raise ValueError(
-                f"Treatment indicator is not an absorbing state for units: {violating_units}. "
-                f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). "
-                f"If this is event-study style data, convert to absorbing state: "
-                f"D[t, i] = 1 for all t >= first treatment period."
-            )
-
-        # Identify treated observations
-        treated_mask = D == 1
-        n_treated_obs = np.sum(treated_mask)
-
-        if n_treated_obs == 0:
-            raise ValueError("No treated observations found")
-
-        # Identify treated and control units
-        unit_ever_treated = np.any(D == 1, axis=0)
-        treated_unit_idx = np.where(unit_ever_treated)[0]
-        control_unit_idx = np.where(~unit_ever_treated)[0]
-
-        if len(control_unit_idx) == 0:
-            raise ValueError("No control units found")
-
-        # Determine pre/post periods from treatment indicator D
-        # D matrix is the sole input for treatment timing per the paper
-        first_treat_period = None
-        for t in range(n_periods):
-            if np.any(D[t, :] == 1):
-                first_treat_period = t
-                break
-        if first_treat_period is None:
-            raise ValueError("Could not infer post-treatment periods from D matrix")
-
-        n_pre_periods = first_treat_period
-        # Count periods where D=1 is actually observed (matches docstring)
-        # Per docstring: "Number of post-treatment periods (periods with D=1 observations)"
-        n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1)))
-
-        if n_pre_periods < 2:
-            raise ValueError("Need at least 2 pre-treatment periods")
+        n_units = _ctx["n_units"]
+        n_periods = _ctx["n_periods"]
+        idx_to_unit = _ctx["idx_to_unit"]
+        idx_to_period = _ctx["idx_to_period"]
+        unit_weight_arr = _ctx["unit_weight_arr"]
+        Y = _ctx["Y"]
+        D = _ctx["D"]
+        n_treated_obs = _ctx["n_treated_obs"]
+        treated_unit_idx = _ctx["treated_unit_idx"]
+        control_unit_idx = _ctx["control_unit_idx"]
+        n_pre_periods = _ctx["n_pre_periods"]
+        n_post_periods = _ctx["n_post_periods"]
 
         # Step 1: Grid search with LOOCV for tuning parameters
         best_lambda = None
@@ -769,7 +695,12 @@ def fit(
             # Fit model with these weights
             n_fits_attempted += 1
             alpha_hat, beta_hat, L_hat = self._estimate_model(
-                Y, control_mask, weight_matrix, lambda_nn, n_units, n_periods,
+                Y,
+                control_mask,
+                weight_matrix,
+                lambda_nn,
+                n_units,
+                n_periods,
                 _nonconvergence_tracker=nonconverg_tracker,
             )
 
@@ -907,9 +838,7 @@ def set_params(self, **params) -> "TROP":
         """Set estimator parameters."""
         for key, value in params.items():
             if key == "method" and value not in ("local", "global"):
-                raise ValueError(
-                    f"method must be one of ('local', 'global'), got '{value}'"
-                )
+                raise ValueError(f"method must be one of ('local', 'global'), got '{value}'")
             if hasattr(self, key):
                 setattr(self, key, value)
             else:
diff --git a/diff_diff/trop_global.py b/diff_diff/trop_global.py
index 568e0eebd..ec7402e70 100644
--- a/diff_diff/trop_global.py
+++ b/diff_diff/trop_global.py
@@ -28,7 +28,7 @@
     stratified_bootstrap_indices,
     warn_bootstrap_failure_rate,
 )
-from diff_diff.trop_local import _soft_threshold_svd, _validate_and_pivot_treatment
+from diff_diff.trop_local import _setup_trop_data, _soft_threshold_svd
 from diff_diff.trop_results import TROPResults
 from diff_diff.utils import safe_inference, warn_if_not_converged
 
@@ -584,82 +584,25 @@ def _fit_global(
         across units, use `method="local"` which computes observation-specific
         weights that naturally handle heterogeneous timing.
         """
-        # Data setup (same as local method)
-        all_units = sorted(data[unit].unique())
-        all_periods = sorted(data[time].unique())
-
-        # Extract per-unit survey weights for weighted ATT aggregation
-        if resolved_survey is not None:
-            from diff_diff.survey import _extract_unit_survey_weights
-
-            unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units)
-        else:
-            unit_weight_arr = None
-
-        n_units = len(all_units)
-        n_periods = len(all_periods)
-
-        idx_to_unit = {i: u for i, u in enumerate(all_units)}
-        idx_to_period = {i: p for i, p in enumerate(all_periods)}
-
-        # Create matrices
-        Y = (
-            data.pivot(index=time, columns=unit, values=outcome)
-            .reindex(index=all_periods, columns=all_units)
-            .values
-        )
-
-        D, missing_mask = _validate_and_pivot_treatment(
-            data, time, unit, treatment, all_periods, all_units
+        # Data setup (shared with local method via _setup_trop_data helper).
+        _ctx = _setup_trop_data(
+            data, outcome, treatment, unit, time, resolved_survey, survey_design
         )
-
-        # Validate absorbing state
-        violating_units = []
-        for unit_idx in range(n_units):
-            observed_mask = ~missing_mask[:, unit_idx]
-            observed_d = D[observed_mask, unit_idx]
-            if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0):
-                violating_units.append(all_units[unit_idx])
-
-        if violating_units:
-            raise ValueError(
-                f"Treatment indicator is not an absorbing state for units: {violating_units}. "
-                f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). "
-                f"If this is event-study style data, convert to absorbing state: "
-                f"D[t, i] = 1 for all t >= first treatment period."
-            )
-
-        # Identify treated observations
-        treated_mask = D == 1
-        n_treated_obs = np.sum(treated_mask)
-
-        if n_treated_obs == 0:
-            raise ValueError("No treated observations found")
-
-        # Identify treated and control units
-        unit_ever_treated = np.any(D == 1, axis=0)
-        treated_unit_idx = np.where(unit_ever_treated)[0]
-        control_unit_idx = np.where(~unit_ever_treated)[0]
-
-        if len(control_unit_idx) == 0:
-            raise ValueError("No control units found")
-
-        # Determine pre/post periods
-        first_treat_period = None
-        for t in range(n_periods):
-            if np.any(D[t, :] == 1):
-                first_treat_period = t
-                break
-
-        if first_treat_period is None:
-            raise ValueError("Could not infer post-treatment periods from D matrix")
-
-        n_pre_periods = first_treat_period
+        n_units = _ctx["n_units"]
+        n_periods = _ctx["n_periods"]
+        idx_to_unit = _ctx["idx_to_unit"]
+        idx_to_period = _ctx["idx_to_period"]
+        unit_weight_arr = _ctx["unit_weight_arr"]
+        Y = _ctx["Y"]
+        D = _ctx["D"]
+        missing_mask = _ctx["missing_mask"]
+        n_treated_obs = _ctx["n_treated_obs"]
+        treated_unit_idx = _ctx["treated_unit_idx"]
+        control_unit_idx = _ctx["control_unit_idx"]
+        first_treat_period = _ctx["first_treat_period"]
+        n_pre_periods = _ctx["n_pre_periods"]
+        n_post_periods = _ctx["n_post_periods"]
         treated_periods = n_periods - first_treat_period
-        n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1)))
-
-        if n_pre_periods < 2:
-            raise ValueError("Need at least 2 pre-treatment periods")
 
         # Check for staggered adoption (global method requires simultaneous treatment)
         # Use only observed periods (skip missing) to avoid false positives on unbalanced panels
diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py
index 6efa3d7a3..64b633d1b 100644
--- a/diff_diff/trop_local.py
+++ b/diff_diff/trop_local.py
@@ -71,6 +71,129 @@ def _validate_and_pivot_treatment(data, time, unit, treatment, all_periods, all_
     return D, missing_mask
 
 
+def _setup_trop_data(data, outcome, treatment, unit, time, resolved_survey, survey_design):
+    """Shared data setup for TROP local and global fit paths.
+
+    Performs panel pivoting (long → wide), absorbing-state validation,
+    treated/control unit identification, first-treatment-period detection,
+    and pre/post period counting. Returns a dict so both callers can
+    unpack only the fields they need.
+
+    The global-method-specific staggered-adoption check stays in
+    `_fit_global` as a post-helper validation because it depends on
+    estimator semantics (global method requires simultaneous treatment),
+    not data preparation.
+
+    Note: the same dict-shaped contract is also relied on by the bootstrap
+    paths (`_bootstrap_variance` / `_bootstrap_variance_global`) which
+    currently rebuild similar state per draw. Future contract changes to
+    this helper must be checked against both `_fit_*` and `_bootstrap_*`
+    call sites for cross-file sync.
+
+    Returns
+    -------
+    dict
+        With keys:
+        ``all_units, all_periods, n_units, n_periods, unit_to_idx,
+        period_to_idx, idx_to_unit, idx_to_period, unit_weight_arr,
+        Y, D, missing_mask, treated_mask, n_treated_obs,
+        treated_unit_idx, control_unit_idx, first_treat_period,
+        n_pre_periods, n_post_periods``.
+    """
+    all_units = sorted(data[unit].unique())
+    all_periods = sorted(data[time].unique())
+
+    if resolved_survey is not None:
+        from diff_diff.survey import _extract_unit_survey_weights
+
+        unit_weight_arr = _extract_unit_survey_weights(data, unit, survey_design, all_units)
+    else:
+        unit_weight_arr = None
+
+    n_units = len(all_units)
+    n_periods = len(all_periods)
+
+    unit_to_idx = {u: i for i, u in enumerate(all_units)}
+    period_to_idx = {p: i for i, p in enumerate(all_periods)}
+    idx_to_unit = {i: u for u, i in unit_to_idx.items()}
+    idx_to_period = {i: p for p, i in period_to_idx.items()}
+
+    Y = (
+        data.pivot(index=time, columns=unit, values=outcome)
+        .reindex(index=all_periods, columns=all_units)
+        .values
+    )
+
+    D, missing_mask = _validate_and_pivot_treatment(
+        data, time, unit, treatment, all_periods, all_units
+    )
+
+    violating_units = []
+    for unit_idx in range(n_units):
+        observed_mask = ~missing_mask[:, unit_idx]
+        observed_d = D[observed_mask, unit_idx]
+        if len(observed_d) > 1 and np.any(np.diff(observed_d) < 0):
+            violating_units.append(all_units[unit_idx])
+
+    if violating_units:
+        raise ValueError(
+            f"Treatment indicator is not an absorbing state for units: {violating_units}. "
+            f"D[t, unit] must be monotonic non-decreasing (once treated, always treated). "
+            f"If this is event-study style data, convert to absorbing state: "
+            f"D[t, i] = 1 for all t >= first treatment period."
+        )
+
+    treated_mask = D == 1
+    n_treated_obs = int(np.sum(treated_mask))
+
+    if n_treated_obs == 0:
+        raise ValueError("No treated observations found")
+
+    unit_ever_treated = np.any(D == 1, axis=0)
+    treated_unit_idx = np.where(unit_ever_treated)[0]
+    control_unit_idx = np.where(~unit_ever_treated)[0]
+
+    if len(control_unit_idx) == 0:
+        raise ValueError("No control units found")
+
+    first_treat_period = None
+    for t in range(n_periods):
+        if np.any(D[t, :] == 1):
+            first_treat_period = t
+            break
+
+    if first_treat_period is None:
+        raise ValueError("Could not infer post-treatment periods from D matrix")
+
+    n_pre_periods = first_treat_period
+    n_post_periods = int(np.sum(np.any(D[first_treat_period:, :] == 1, axis=1)))
+
+    if n_pre_periods < 2:
+        raise ValueError("Need at least 2 pre-treatment periods")
+
+    return {
+        "all_units": all_units,
+        "all_periods": all_periods,
+        "n_units": n_units,
+        "n_periods": n_periods,
+        "unit_to_idx": unit_to_idx,
+        "period_to_idx": period_to_idx,
+        "idx_to_unit": idx_to_unit,
+        "idx_to_period": idx_to_period,
+        "unit_weight_arr": unit_weight_arr,
+        "Y": Y,
+        "D": D,
+        "missing_mask": missing_mask,
+        "treated_mask": treated_mask,
+        "n_treated_obs": n_treated_obs,
+        "treated_unit_idx": treated_unit_idx,
+        "control_unit_idx": control_unit_idx,
+        "first_treat_period": first_treat_period,
+        "n_pre_periods": n_pre_periods,
+        "n_post_periods": n_post_periods,
+    }
+
+
 # Module-level convergence tolerance for SVD singular value truncation.
 # Singular values below this threshold after soft-thresholding are treated
 # as zero to improve numerical stability.
diff --git a/tests/test_trop.py b/tests/test_trop.py
index aa5de0939..5cf10cac0 100644
--- a/tests/test_trop.py
+++ b/tests/test_trop.py
@@ -4146,6 +4146,49 @@ def test_method_dispatch_local_does_not_use_fit_global(self):
                 trop_est.fit(df, "outcome", "treated", "unit", "time")
             mock_fg.assert_not_called()
 
+    def test_setup_trop_data_internal_contract(self):
+        """`_setup_trop_data` returns a self-consistent state dict used by both fit paths.
+
+        Regression guard for the Wave 4 refactor: both `TROP.fit()` local path and
+        `_fit_global()` now consume `_setup_trop_data`'s dict. If a future contract
+        change drops or renames a field, this catches it.
+        """
+        from diff_diff.trop_local import _setup_trop_data
+
+        df = self._make_panel()
+        ctx = _setup_trop_data(
+            df,
+            outcome="outcome",
+            treatment="treated",
+            unit="unit",
+            time="time",
+            resolved_survey=None,
+            survey_design=None,
+        )
+        n_units = ctx["n_units"]
+        n_periods = ctx["n_periods"]
+        # Dimensions are consistent across return fields.
+        assert ctx["Y"].shape == (n_periods, n_units)
+        assert ctx["D"].shape == (n_periods, n_units)
+        assert ctx["missing_mask"].shape == (n_periods, n_units)
+        assert ctx["treated_mask"].shape == (n_periods, n_units)
+        assert len(ctx["all_units"]) == n_units
+        assert len(ctx["all_periods"]) == n_periods
+        # Round-trip both mapping pairs (the local path historically built both
+        # forward and inverse maps; helper now returns all four uniformly so
+        # global path gains parity).
+        for i in range(n_units):
+            assert ctx["unit_to_idx"][ctx["idx_to_unit"][i]] == i
+        for t in range(n_periods):
+            assert ctx["period_to_idx"][ctx["idx_to_period"][t]] == t
+        # first_treat_period derivation matches the canonical "first row of D
+        # with any treated cell" expression used pre-refactor.
+        assert ctx["first_treat_period"] == int(np.argmax(np.any(ctx["D"] == 1, axis=1)))
+        assert ctx["n_pre_periods"] == ctx["first_treat_period"]
+        # Treated/control unit partition is complete and disjoint.
+        assert len(ctx["treated_unit_idx"]) + len(ctx["control_unit_idx"]) == n_units
+        assert len(set(ctx["treated_unit_idx"]) & set(ctx["control_unit_idx"])) == 0
+
 
 class TestSilentWarningAudit:
     """Tests for UserWarning emissions added by the silent warning audit."""

From 347340b9e3560f4063b94249138b8c76d80f3a0c Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 12:23:56 -0400
Subject: [PATCH 4/6] todo: drain items 2/3/4 from Tier A backlog

Removes three Methodology/Correctness rows (EfficientDiD anticipation
trim, generate_ddd_panel_data, TROP fit/_fit_global dedup) and the
three corresponding Tier A bullets, addressed in this PR. Adds two
follow-up rows for the deferred scope (TROP bootstrap-loop dedup,
TripleDifference power auto-routing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 TODO.md | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/TODO.md b/TODO.md
index d585f5135..45d4f6587 100644
--- a/TODO.md
+++ b/TODO.md
@@ -82,11 +82,10 @@ Deferred items from PR reviews that were not addressed before merge.
 | CallawaySantAnna: consider materializing NaN entries for non-estimable (g,t) cells in group_time_effects dict (currently omitted with consolidated warning); would require updating downstream consumers (event study, balance_e, aggregation) | `staggered.py` | #256 | Low |
 | ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails) |
 | Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium |
-| EfficientDiD `control_group="last_cohort"` trims at `last_g - anticipation` but REGISTRY says `t >= last_g`. With `anticipation=0` (default) these are identical. With `anticipation>0`, code is arguably more conservative (excludes anticipation-contaminated periods). Either align REGISTRY with code or change code to `t < last_g` — needs design decision. | `efficient_did.py` | #230 | Low |
-| TripleDifference power: `generate_ddd_data` is a fixed 2×2×2 cross-sectional DGP — no multi-period or unbalanced-group support. Add a `generate_ddd_panel_data` for panel DDD power analysis. | `prep_dgp.py`, `power.py` | #208 | Low |
 | Survey design resolution/collapse patterns are inconsistent across panel estimators — ContinuousDiD rebuilds unit-level design in SE code, EfficientDiD builds once in fit(), StackedDiD re-resolves on stacked data; extract shared helpers for panel-to-unit collapse, post-filter re-resolution, and metadata recomputation | `continuous_did.py`, `efficient_did.py`, `stacked_did.py` | #226 | Low |
 | Survey-weighted Silverman bandwidth in EfficientDiD conditional Omega* — `_silverman_bandwidth()` uses unweighted mean/std for bandwidth selection; survey-weighted statistics would better reflect the population distribution but is a second-order refinement | `efficient_did_covariates.py` | — | Low |
-| TROP: `fit()` and `_fit_global()` share ~150 lines of near-identical data setup (panel pivoting, absorbing-state validation, first-treatment detection, effective rank, NaN warnings). Both bootstrap methods also duplicate the stratified resampling loop. Extract shared helpers to eliminate cross-file sync risk. | `trop.py`, `trop_global.py`, `trop_local.py` | — | Low |
+| TROP: extend Wave 4's `_setup_trop_data` helper to also cover the duplicated bootstrap resampling loop in `_bootstrap_variance` / `_bootstrap_variance_global` (~40 LoC dedup; mirrors the data-setup helper pattern with a `fit_callable` parameter for the per-draw refit step). | `trop_local.py`, `trop_global.py` | follow-up | Low |
+| TripleDifference power auto-routing: `power.simulate_power` ignores `n_periods` for DDD because `_ddd_dgp_kwargs` is hard-coded to the cross-sectional `generate_ddd_data`. Now that `generate_ddd_panel_data` exists (Wave 4), add a new `_EstimatorProfile` registry entry (or extend the existing one) to route to the panel DGP when `n_periods > 2`. | `power.py`, `prep_dgp.py` | follow-up | Low |
 | StaggeredTripleDifference R cross-validation: CSV fixtures not committed (gitignored); tests skip without local R + triplediff. Commit fixtures or generate deterministically. | `tests/test_methodology_staggered_triple_diff.py` | #245 | Medium |
 | StaggeredTripleDifference R parity: benchmark only tests no-covariate path (xformla=~1). Add covariate-adjusted scenarios and aggregation SE parity assertions. | `benchmarks/R/benchmark_staggered_triplediff.R` | #245 | Medium |
 | StaggeredTripleDifference: per-cohort group-effect SEs include WIF (conservative vs R's wif=NULL). Documented in REGISTRY. Could override mixin for exact R match. | `staggered_triple_diff.py` | #245 | Low |
@@ -170,9 +169,6 @@ Ordered paydown view across the tables above. Tier A → D is by effort × risk,
 
 #### Tier A — Quick wins (≤1 day, ≤3 CI rounds expected)
 
-- EfficientDiD `control_group="last_cohort"` REGISTRY-vs-code alignment with `anticipation>0` (`efficient_did.py`, one design decision)
-- TripleDifference: add `generate_ddd_panel_data` for panel DDD power analysis (`prep_dgp.py`, `power.py`)
-- TROP: extract shared data-setup helper between `fit()` and `_fit_global()` (~150 LoC dedup; `trop.py`, `trop_global.py`, `trop_local.py`)
 - WooldridgeDiD: optional efficiency hint when method/outcome pairing is sub-optimal (NOT a canonical-link violation per W2023 Prop 3.1 — see Methodology/Correctness row for the corrected framing)
 
 (SyntheticDiD `placebo_effects` → `variance_effects` rename moved to Tier B — the user-facing field rename + one-release deprecation alias is too large for ≤1 day / ≤3 CI rounds.)

From 4aa3009308294665d03d16544ce453dccc7bbf2b Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 12:45:00 -0400
Subject: [PATCH 5/6] prep_dgp: guarantee all (group, partition) cells in
 generate_ddd_panel_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The R1 review identified that independent marginal sampling of `group` and
`partition` could leave one of the four (group, partition) cells empty
under valid inputs — e.g., `n_units=4, group_frac=partition_frac=0.25`
rounds to `n_group_1=1` and `n_p1_g1=0`, so the (1, 1) cell collapses
before `TripleDifference.fit`'s 2x2x2 cell-presence check.

Switches to stratified allocation: assign `group` to its requested
fraction, then within each group stratum, draw `partition=1` at
`partition_frac`. Adds a targeted `ValueError` when the rounded cell
counts would leave any (group, partition) cell empty (with the four
cell counts in the error message so users can pick a feasible config).

Adds two regression tests:
- `test_infeasible_cell_counts_raise` exercises both the `n_units=4`
  small-marginal case and an `n_units=10, group_frac=0.1` case.
- `test_smallest_feasible_config_populates_all_cells` verifies the
  smallest feasible config (`n_units=4, fracs=0.5`) yields one unit per
  cell and that `TripleDifference.fit(..., time="post")` succeeds on
  it (the contract the docstring advertises).

Updates the `group_frac` / `partition_frac` docstring entries to describe
the stratified allocation guarantee, and the `[Unreleased]` CHANGELOG
entry to mention the cell-coverage invariant.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md          |  2 +-
 diff_diff/prep_dgp.py | 49 +++++++++++++++++++++++++++++++++------
 tests/test_prep.py    | 53 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19003f9d5..956365c86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
-- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 14 tests). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md.
+- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md.
 - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row).
 - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_<ring_label>"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction.
 - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422.
diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py
index 7e271c72f..a178b0d02 100644
--- a/diff_diff/prep_dgp.py
+++ b/diff_diff/prep_dgp.py
@@ -1188,10 +1188,16 @@ def generate_ddd_panel_data(
         Period (0-indexed) at which ``post`` switches from 0 to 1.
         Must satisfy ``1 <= treatment_period < n_periods``.
     group_frac : float, default=0.5
-        Fraction of units with ``group=1``. Must be in ``(0, 1)``.
+        Fraction of units with ``group=1``. Must be in ``(0, 1)``. The
+        partition split is then drawn stratified-by-group at the requested
+        ``partition_frac`` so every (group, partition) cell receives at
+        least one unit; a ``ValueError`` is raised when the rounded cell
+        counts would leave any cell empty.
     partition_frac : float, default=0.5
-        Fraction of units with ``partition=1`` (assigned independently of
-        ``group``). Must be in ``(0, 1)``.
+        Fraction of units with ``partition=1`` within each ``group``
+        stratum. Must be in ``(0, 1)``. The stratified allocation is what
+        makes TripleDifference.fit's 2x2x2 surface populated for any valid
+        ``(n_units, group_frac, partition_frac)``.
     treatment_effect : float, default=2.0
         True ATT for the triple-interaction cell (group=1, partition=1,
         post=1).
@@ -1270,16 +1276,45 @@ def generate_ddd_panel_data(
             f"got {n_units}."
         )
 
+    # Stratified allocation that guarantees every (group, partition) cell receives
+    # at least one unit. Independent marginal sampling could collapse a cell when
+    # rounded marginals are small (e.g., n_units=4, group_frac=partition_frac=0.25
+    # yields n_group_1=1 with no room for both partition values within group=1).
+    # TripleDifference.fit requires all 8 G×P×T cells, so we validate first.
+    n_group_1 = int(round(n_units * group_frac))
+    n_group_0 = n_units - n_group_1
+    n_p1_g0 = int(round(n_group_0 * partition_frac))
+    n_p1_g1 = int(round(n_group_1 * partition_frac))
+    n_p0_g0 = n_group_0 - n_p1_g0
+    n_p0_g1 = n_group_1 - n_p1_g1
+    cell_counts = {
+        (0, 0): n_p0_g0,
+        (0, 1): n_p1_g0,
+        (1, 0): n_p0_g1,
+        (1, 1): n_p1_g1,
+    }
+    if min(cell_counts.values()) < 1:
+        raise ValueError(
+            f"generate_ddd_panel_data requires every (group, partition) cell to "
+            f"contain at least one unit so TripleDifference.fit's 2x2x2 surface is "
+            f"populated. With n_units={n_units}, group_frac={group_frac}, "
+            f"partition_frac={partition_frac}, the rounded cell counts are "
+            f"(group, partition): (0,0)={n_p0_g0}, (0,1)={n_p1_g0}, "
+            f"(1,0)={n_p0_g1}, (1,1)={n_p1_g1}. Increase n_units or move "
+            f"group_frac/partition_frac closer to 0.5 so each cell receives "
+            f">=1 unit."
+        )
+
     rng = np.random.default_rng(seed)
 
-    n_group_1 = max(1, min(n_units - 1, int(round(n_units * group_frac))))
-    n_partition_1 = max(1, min(n_units - 1, int(round(n_units * partition_frac))))
     group = np.zeros(n_units, dtype=int)
     group_idx = rng.choice(n_units, size=n_group_1, replace=False)
     group[group_idx] = 1
     partition = np.zeros(n_units, dtype=int)
-    partition_idx = rng.choice(n_units, size=n_partition_1, replace=False)
-    partition[partition_idx] = 1
+    g0_units = np.where(group == 0)[0]
+    g1_units = np.where(group == 1)[0]
+    partition[rng.choice(g0_units, size=n_p1_g0, replace=False)] = 1
+    partition[rng.choice(g1_units, size=n_p1_g1, replace=False)] = 1
 
     unit_fe = rng.normal(0.0, unit_fe_sd, size=n_units)
 
diff --git a/tests/test_prep.py b/tests/test_prep.py
index 696f507ce..38607850e 100644
--- a/tests/test_prep.py
+++ b/tests/test_prep.py
@@ -1234,6 +1234,59 @@ def test_n_units_validation(self):
         with pytest.raises(ValueError, match="n_units"):
             generate_ddd_panel_data(n_units=3, seed=42)
 
+    def test_infeasible_cell_counts_raise(self):
+        """Configs that round to empty (group, partition) cells fail fast."""
+        from diff_diff.prep import generate_ddd_panel_data
+
+        # n_units=4, group_frac=0.25 → n_group_1=1; partition_frac=0.25 inside
+        # the 1-unit group=1 stratum rounds to 0, leaving the (1, 1) cell empty.
+        with pytest.raises(ValueError, match=r"every \(group, partition\) cell"):
+            generate_ddd_panel_data(
+                n_units=4,
+                group_frac=0.25,
+                partition_frac=0.25,
+                seed=42,
+            )
+        # n_units=10, group_frac=0.1 → n_group_1=1 again; same failure mode for
+        # the (1, 1) cell.
+        with pytest.raises(ValueError, match=r"every \(group, partition\) cell"):
+            generate_ddd_panel_data(
+                n_units=10,
+                group_frac=0.1,
+                partition_frac=0.5,
+                seed=42,
+            )
+
+    def test_smallest_feasible_config_populates_all_cells(self):
+        """n_units=4 with fracs=0.5 yields one unit per (group, partition) cell."""
+        from diff_diff import TripleDifference
+        from diff_diff.prep import generate_ddd_panel_data
+
+        data = generate_ddd_panel_data(
+            n_units=4,
+            n_periods=4,
+            treatment_period=2,
+            group_frac=0.5,
+            partition_frac=0.5,
+            noise_sd=0.0,
+            seed=42,
+        )
+        # All four (group, partition) cells must receive exactly one unit.
+        unit_cells = data.groupby("unit")[["group", "partition"]].first()
+        cell_counts = unit_cells.groupby(["group", "partition"]).size()
+        assert len(cell_counts) == 4
+        assert (cell_counts == 1).all()
+        # TripleDifference.fit (which requires all 8 G×P×T cells) must succeed
+        # on the smallest feasible config — validates the public contract
+        # advertised in the docstring's compatibility paragraph.
+        TripleDifference().fit(
+            data,
+            outcome="outcome",
+            group="group",
+            partition="partition",
+            time="post",
+        )
+
     def test_reproducibility(self):
         """Same seed produces identical DataFrames."""
         from diff_diff.prep import generate_ddd_panel_data

From f564ce902ee82456c32d10a2cbf45d05bac4eb8c Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 13:14:15 -0400
Subject: [PATCH 6/6] prep_dgp: document and test cluster="unit" pattern for
 panel DGP

R3 review flagged that `generate_ddd_panel_data` advertises direct
`TripleDifference().fit(..., time="post")` usage, but `TripleDifference`
is the repeated-cross-section `panel=FALSE` estimator whose default
analytical SE treats each row as iid (df = n_obs - 8). With the new
panel-shaped output (unit FE + within-unit serial correlation),
unclustered SE understates variance and overstates power.

Updates:
- DGP docstring gains a `.. warning::` block stating the panel + cluster
  requirement; the `Examples` block now demonstrates
  `TripleDifference(cluster="unit").fit(...)` as the recommended pattern.
- REGISTRY.md `## TripleDifference` SE section gains a "Note (panel-shaped
  input, `generate_ddd_panel_data`)" paragraph documenting the
  repeated-cross-section semantics and the cluster contract.
- CHANGELOG `[Unreleased]` entry tightened to mention the cluster
  requirement explicitly.
- New test `test_recommended_clustered_panel_path` locks the documented
  pattern: clustered fit succeeds, `n_clusters == n_units`, ATT point
  estimate is invariant to clustering, but SE differs between clustered
  and unclustered fits (within-unit correlation is non-trivial).

Point estimate semantics unchanged. Fix is documentation + invariance
test only.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                 |  2 +-
 diff_diff/prep_dgp.py        | 19 ++++++++++++++--
 docs/methodology/REGISTRY.md | 10 ++++++++
 tests/test_prep.py           | 44 ++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 956365c86..223abd121 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
-- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference.fit(..., time="post")` directly — users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md.
+- **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference(cluster="unit").fit(..., time="post")` (the cluster kwarg is required because `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator and unclustered SE on panel-generated rows understates variance under within-unit serial correlation; the point estimate `att` is invariant to clustering — see the new `TripleDifference` REGISTRY note on panel-shaped input). Users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md.
 - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete (R parity goldens pending)**). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (~24 tests across 6 classes: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` skips with a pointer when goldens missing; `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); JSON goldens deferred until `bacondecomp` R package is installed (parity tests skip cleanly with an explicit pointer). (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row).
 - **`SpilloverDiD` — ring-indicator spillover-aware DiD (Butts 2021).** New standalone estimator at `diff_diff/spillover.py` implementing two-stage Gardner methodology with ring-indicator covariates that identify direct effect on treated (`tau_total`) alongside per-ring spillover effects on near-control units (`delta_j`). Documented synthesis of ingredients (no single published software covers the exact recipe — `did2s` implements Gardner two-stage without rings; the Butts ring estimator has no R/Stata package): Butts (2021) Section 5 / Table 2 identification, Gardner (2022) two-stage residualize-then-fit, and the Conley spatial-HAC vcov shipped in 3.3.3. Handles both panel non-staggered (Equations 5/6/8) and Section 5 staggered timing in one estimator — non-staggered is the special case where all treated units share an onset time. **API:** `SpilloverDiD(rings=[0, 50, 100, 200], conley_coords=("lat","lon"), ...).fit(data, outcome="y", unit="unit", time="t", treatment="D")` (binary D auto-converted to `first_treat`) or `.fit(..., first_treat="first_treat")` (Gardner convention). Result: `SpilloverDiDResults(DiDResults)` with `.att` = `tau_total`, `.spillover_effects` (per-ring `pd.DataFrame` with `coef`/`se`/`t_stat`/`p_value`/`ci_low`/`ci_high`), `.ring_breakpoints`, `.d_bar`, `.n_units_ever_in_ring`, `.n_far_away_obs`, `.is_staggered`. `.coefficients` exposes all `(1+K)` stage-2 entries (`"treatment"` + `"_spillover_<ring_label>"`) plus an `"ATT"` alias keyed to vcov columns. **Methodology spec (committed):** stage-2 regressor is the time-varying `(1 - D_it) * Ring_{it,j}` form (paper page 12's `S_it = S_i * 1{t >= t_treat}` notation; Section 5 Table 2's `S^k_{it}` / `Ring^k_{it,j}`). Reading the literal unit-static `(1 - D_it) * S_i` from Equation 5 is algebraically rank-deficient under TWFE (`(1-D_it) * S_i = S_i - D_it`, with `S_i` absorbed by `mu_i`, leaving `-D_it`); only the time-varying form supports the paper's identification (Proposition 2.3). Stage-1 subsample uses Butts' STRICTER `Omega_0 = {D_it = 0 AND S_it = 0}` (untreated AND unexposed), not TwoStageDiD's `{D_it = 0}` alone — this prevents spillover-contaminated near-controls in pre/post periods from biasing the time FE. **Gardner identity (non-staggered):** a 20-seed deterministic regression test pins `SpilloverDiD.att` against a direct single-stage TWFE ring regression on the full sample (`y ~ mu_i + lambda_t + tau * D_it + sum_j delta_j * (1 - D_it) * Ring_{it,j}`) at `atol=1e-10` — empirically bit-identical, so the reported non-staggered `tau_total` IS the Butts Eqs. 4-6 estimator. **Identification-check policy (period strict, unit warn-and-drop, plus connectivity):** every period must have at least one Omega_0 row (hard `ValueError` — dropping a period removes all units' cross-time identification). Units lacking Omega_0 rows (e.g. baseline-treated units with `D_it = 1` at every observed `t`) are warned-and-dropped: their unit FE is NaN, residualization writes NaN on their rows, and the downstream finite-mask path excludes them from stage 2 — mirrors `TwoStageDiD`'s always-treated convention. Additionally, the supported-units bipartite graph (units linked by shared Omega_0 periods) must form a single connected component; `K > 1` components raise `ValueError` because the FE solver would return only component-specific constants and residualization would silently mix them across components (defense-in-depth — under absorbing treatment the disconnected case may be unreachable through the upstream validators, but the check future-proofs Wave B follow-ups). **Public API restrictions (Wave B MVP):** `covariates=` raises `NotImplementedError` because Gardner-style two-stage requires covariate effects estimated on the untreated-and-unexposed subsample at stage 1 (appending raw covariates only at stage 2 silently biases `tau_total` / `delta_j` on panels with time-varying covariates); non-absorbing / reversible treatment patterns (e.g. `[0, 1, 0]`) raise `ValueError` rather than being silently coerced into "treated from first 1 onward"; non-constant `first_treat` values across rows of the same unit raise `ValueError`; `conley_coords` is required on every fit path (not just `vcov_type="conley"`) because ring construction always uses it. **Far-away control identification:** uses CURRENT-period untreated status (`D_it = 0`) rather than never-treated-only, so all-eventually-treated staggered designs (no never-treated units) can identify the counterfactual via not-yet-treated far-away rows. **Variance (Wave B MVP):** stage-2 OLS variance via `solve_ols` (HC1 / Conley / cluster paths all flow through). The Gardner GMM first-stage uncertainty correction is NOT applied at stage 2 in this PR (documented limitation; planned follow-up extends `two_stage.py::_compute_gmm_variance` to accept a Conley kernel matrix in place of HC1's identity at the influence-function outer-product step). **Deferred features (planned follow-ups):** `event_study=True` per-event-time × ring coefficients (Butts Table 2), `survey_design=` integration, `ring_method="count"` (count-of-treated-in-ring), data-driven `d_bar` selection (Butts 2021b / Butts 2023 JUE Insight), Gardner GMM first-stage correction at stage 2, sparse staggered ring-distance path. **Tests:** `tests/test_spillover.py` (157 tests across ring-construction primitives, validators, fit integration, raw-data invariant, identification MC — non-staggered DGP at 50 seeds + 200-seed `@pytest.mark.slow` variant recovers both `tau_total` and `delta_1`; staggered DGP at 30 seeds anchors both `tau_total` and `delta_1` — Conley plumbing (verifies `solve_ols` is called with `vcov_type="conley"` + Conley kwargs, no silent HC1 fallback), Gardner identity bit-identity, coefficients-vs-vcov alignment, warn-and-drop, rank_deficient_action validation, Omega_0 bipartite-graph connectivity, anticipation behavior on both fit paths). DGP factories `tests/_dgp_utils.py::generate_butts_nonstaggered_dgp` / `generate_butts_staggered_dgp` satisfy Butts Assumptions 1/3/5/7 by construction.
 - **`ChaisemartinDHaultfoeuille.predict_het` × `placebo`: R-parity on both global and per-path surfaces.** R-verified — `did_multiplegt_dyn(predict_het, placebo)` emits heterogeneity OLS results on backward (placebo) horizons via R's `DIDmultiplegtDYN:::did_multiplegt_main` placebo block (`effect = matrix(-i, ...)` rbind site); the same block runs per-by_level under `did_multiplegt_dyn(by_path, predict_het, placebo)`, so both global `res$results$predict_het` and per-by_level `res$by_level_i$results$predict_het` slots emit backward rows. R's predict_het syntax with `placebo > 0` requires the `c(-1)` sentinel in the horizon vector to trigger "compute heterogeneity for ALL forward (1..effects) AND ALL placebo (1..placebo) positions" — passing positive-only horizons errors with "specified numbers in predict_het that exceed the number of placebos". Python mirrors via `_compute_heterogeneity_test(..., placebo=L_max)` (set automatically from `self.placebo` at both global and per-path call sites in `fit()`) — the function iterates forward (1..L_max) and backward (-1..-L_max) horizons in a single loop with an explicit `out_idx < 0` eligibility guard for backward horizons whose `F_g` is too small (would otherwise silently misread `N_mat` via numpy negative indexing). `results.heterogeneity_effects` uses negative-int keys for backward horizons; `path_heterogeneity_effects` does the same per path. Placebo rows in `to_dataframe(level="by_path")` have non-NaN `het_*` columns when `placebo=True` and `heterogeneity=` are both set. **Survey gate (warn + skip):** `survey_design + placebo + heterogeneity` emits a `UserWarning` at fit-time and falls back to forward-horizon-only heterogeneity on both surfaces — the Binder TSL cell-period allocator's REGISTRY justification is tied to **post-period** attribution; backward-horizon attribution puts ψ_g mass on a pre-period cell, a separate library-extension claim that needs its own derivation. Forward-horizon `predict_het + survey_design` continues to work unchanged on both global and per-path surfaces. The function-level `_compute_heterogeneity_test` keeps a per-iteration `NotImplementedError` backstop for direct callers that bypass fit(). Pre-period allocator derivation deferred to a follow-up methodology PR (tracked in TODO.md). R parity confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityHeterogeneityWithPlacebo` (scenario 23, `multi_path_reversible_predict_het_with_placebo_global`, `placebo=2, effects=3, no by_path`) and `::TestDCDHDynRParityByPathHeterogeneityWithPlacebo` (scenario 22, same DGP plus `by_path=3`); pinned at `BETA_RTOL=1e-6` / `SE_RTOL=1e-5` for `beta` / `se` / `t_stat` / `n_obs` and `INFERENCE_RTOL=1e-4` for `p_value` / `conf_int` across 3 paths × (3 forward + 2 placebo) = 15 horizons + 1 global × 5 horizons. Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPredictHetPlacebo` (placebo het column population, survey-gate warn+skip behavior, forward+survey anti-regression, `out_idx<0` eligibility guard, single-path telescope `path_heterogeneity_effects[(only_path,)] == heterogeneity_effects` bit-exactly, summary rendering, direct-call `NotImplementedError` backstop). Closes TODO #422.
diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py
index a178b0d02..36f2656d1 100644
--- a/diff_diff/prep_dgp.py
+++ b/diff_diff/prep_dgp.py
@@ -1178,6 +1178,19 @@ def generate_ddd_panel_data(
     making it suitable for panel-aware power-analysis simulations or
     sanity-checking estimators that ignore the panel dimension.
 
+    .. warning::
+
+        ``TripleDifference`` is a repeated-cross-section ``panel=FALSE``
+        estimator: its analytical default treats each row as an
+        independent observation (df = n_obs - 8). When fitting against
+        ``generate_ddd_panel_data`` output, the within-unit serial
+        correlation makes unclustered SEs anti-conservative — they
+        understate sampling variability and overstate power. Always pass
+        ``cluster="unit"`` (Liang-Zeger CR1) when fitting on
+        panel-generated data; the point estimate ``att`` is invariant to
+        clustering but the inference contract is not. See the
+        ``TripleDifference`` REGISTRY entry for the clustering contract.
+
     Parameters
     ----------
     n_units : int, default=200
@@ -1253,10 +1266,12 @@ def generate_ddd_panel_data(
     >>> data.groupby('unit')['period'].count().eq(8).all()
     True
 
-    Fit with TripleDifference (note ``time="post"``):
+    Fit with TripleDifference. Note ``time="post"`` (the derived binary
+    indicator) and ``cluster="unit"`` (required for valid inference on
+    panel-generated data; see the warning above):
 
     >>> from diff_diff import TripleDifference
-    >>> result = TripleDifference().fit(
+    >>> result = TripleDifference(cluster="unit").fit(
     ...     data, outcome='outcome', group='group',
     ...     partition='partition', time='post',
     ... )
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 4e1495789..6d64c37d0 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -1747,6 +1747,16 @@ finite-sample adjustment.
 Note: IF-based SEs are inherently heteroskedasticity-robust; the `robust` parameter
 has no additional effect.
 
+**Note (panel-shaped input, `generate_ddd_panel_data`):** `TripleDifference` is the
+repeated-cross-section `panel=FALSE` implementation: the individual-level default SE
+treats each row as an independent observation (df = n_obs - 8). When fitting against
+panel-shaped data with repeated unit rows and within-unit serial correlation
+(e.g., `generate_ddd_panel_data` output), unclustered SE understates sampling
+variability and overstates power. Pass `cluster="unit"` to invoke the Liang-Zeger
+CR1 path so the influence functions are aggregated within unit before variance
+computation. The point estimate `att` is invariant to clustering, only the inference
+contract changes.
+
 *Edge cases:*
 - Propensity scores near 0/1: trimmed at `pscore_trim` (default 0.01)
 - Empty cells: raises ValueError with diagnostic message
diff --git a/tests/test_prep.py b/tests/test_prep.py
index 38607850e..408adc36a 100644
--- a/tests/test_prep.py
+++ b/tests/test_prep.py
@@ -1287,6 +1287,50 @@ def test_smallest_feasible_config_populates_all_cells(self):
             time="post",
         )
 
+    def test_recommended_clustered_panel_path(self):
+        """Documented clustered-by-unit path produces n_clusters == n_units inference.
+
+        Locks the docstring's recommended pattern (`cluster="unit"` for panel data)
+        against silent regression: TripleDifference is the repeated-cross-section
+        `panel=FALSE` estimator, so unclustered SE on panel-generated rows
+        understates variance. Clustering by `unit` aggregates per-row IFs within
+        unit before variance computation (Liang-Zeger CR1).
+        """
+        from diff_diff import TripleDifference
+        from diff_diff.prep import generate_ddd_panel_data
+
+        n_units = 200
+        data = generate_ddd_panel_data(
+            n_units=n_units,
+            n_periods=8,
+            treatment_period=4,
+            seed=42,
+        )
+        result = TripleDifference(cluster="unit").fit(
+            data,
+            outcome="outcome",
+            group="group",
+            partition="partition",
+            time="post",
+        )
+        assert np.isfinite(result.att)
+        assert np.isfinite(result.se)
+        # Cluster-robust path records the number of clusters used.
+        assert result.n_clusters == n_units
+        # Unclustered fit on the same data MUST produce a different SE (clustering
+        # is materially different when within-unit serial correlation exists).
+        unclustered = TripleDifference().fit(
+            data,
+            outcome="outcome",
+            group="group",
+            partition="partition",
+            time="post",
+        )
+        # Point estimate is invariant to clustering.
+        np.testing.assert_allclose(unclustered.att, result.att, atol=1e-10)
+        # SE differs because panel rows are not iid.
+        assert unclustered.se != result.se
+
     def test_reproducibility(self):
         """Same seed produces identical DataFrames."""
         from diff_diff.prep import generate_ddd_panel_data