Round-2 CI P2s: stale warning prepass + stale heterogeneity Note + varying-PSU equivalence test

igerber · claude · igerber · commit c930f74b2db1 · 2026-04-19T10:25:25.000-04:00
Addresses the three P2 findings from the CI re-review (all P0s cleared). 1. **Warning prepass assumed one PSU per group** (`chaisemartin_dhaultfoeuille.py:2111-2148`). The old code collected `labels[0]` per eligible group, so a within-group-varying PSU design was mis-counted as having one PSU per group and emitted a misleading "strictly-coarser PSU" UserWarning. Rewrite counts unique PSU labels across all positive-weight obs of eligible groups (not just the first label per group); under PSU=group unchanged, under varying-PSU no false warning. 2. **REGISTRY heterogeneity Note still claimed NotImplementedError** (`REGISTRY.md:618`, "Combining heterogeneity= with n_bootstrap > 0 and within-group-varying PSU still raises NotImplementedError"). That gate was removed in the current PR. Update to clarify that heterogeneity inference stays analytical when bootstrap runs on the main ATT surfaces — the two inference paths are independent. 3. **Zero-weight-equivalence test used `psu=group`** (`test_bootstrap_zero_weight_group_equivalent_to_removing_it`). Under PSU=group both the buggy and correct code paths collapse to the same identity-draw structure, so the test didn't actually exercise the P0 #1 regression. Switch the fixture to within-group-varying PSU (period parity per group) so the cell-level dispatcher is invoked and the before-fix silent-dropback bug would fail this test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -2109,16 +2109,17 @@ def fit(
                     )
 
                 # Warning fires only when PSU is strictly coarser than
-                # group. Under auto-inject psu=group (or explicit
-                # psu=<group_col>), groups and PSUs coincide so
-                # Hall-Mammen wild PSU bootstrap equals group-level
-                # multiplier bootstrap — no need to warn.
-                # Count groups/PSUs on the POST-FILTER eligible set
-                # (`_eligible_group_ids`) — the same set the bootstrap
-                # map is built from below. Using raw positive-weight
-                # groups here would emit a misleading warning on
-                # panels where upstream dCDH filtering drops groups
-                # that happen to share PSUs with kept groups.
+                # group (multiple eligible groups share a PSU label).
+                # Under auto-inject psu=group or PSU that varies within
+                # group (each group contributes to >= 1 PSU), the
+                # warning should NOT fire because the Hall-Mammen
+                # wild PSU bootstrap is either identical to a group-
+                # level multiplier bootstrap (PSU=group) or finer-than-
+                # group (varying PSU; the cell-level allocator honors
+                # the per-cell PSU structure). Count unique PSUs
+                # across ALL positive-weight obs of eligible groups,
+                # not just the first label per group — under varying
+                # PSU a group spans multiple PSUs.
                 psu_arr_warn = getattr(resolved_survey, "psu", None)
                 if psu_arr_warn is None or _obs_survey_info is None:
                     # No PSU info — can't compare to group count.
@@ -2130,24 +2131,22 @@ def fit(
                     )
                     pos_mask_warn = obs_ws_warn > 0
                     psu_codes_warn = np.asarray(psu_arr_warn)
-                    # Collect the PSU label for each variance-eligible
-                    # group. PSU that varies within group is rejected
-                    # for the bootstrap path upstream (NotImplementedError
-                    # gate), so the first positive-weight label
-                    # represents the whole group here.
-                    eligible_psu_labels: List[Any] = []
-                    for gid in _eligible_group_ids:
-                        mask_g = (obs_gids_warn == gid) & pos_mask_warn
-                        if mask_g.any():
-                            eligible_psu_labels.append(
-                                psu_codes_warn[mask_g][0]
-                            )
-                    n_groups_eff_warn = len(eligible_psu_labels)
-                    n_psu_eff_warn = (
-                        int(len(np.unique(np.asarray(eligible_psu_labels))))
-                        if eligible_psu_labels
-                        else -1
+                    # Restrict to positive-weight obs whose group is
+                    # variance-eligible, then count unique PSU labels
+                    # across that full set (not first-per-group).
+                    eligible_gid_set = set(_eligible_group_ids)
+                    elig_obs_mask_warn = pos_mask_warn & np.array(
+                        [g in eligible_gid_set for g in obs_gids_warn],
+                        dtype=bool,
                     )
+                    if elig_obs_mask_warn.any():
+                        elig_psu_labels_arr = psu_codes_warn[elig_obs_mask_warn]
+                        n_psu_eff_warn = int(
+                            len(np.unique(elig_psu_labels_arr))
+                        )
+                        n_groups_eff_warn = len(_eligible_group_ids)
+                    else:
+                        n_psu_eff_warn, n_groups_eff_warn = -1, -1
                 if 0 <= n_psu_eff_warn < n_groups_eff_warn:
                     warnings.warn(
                         f"Bootstrap with survey_design uses Hall-Mammen "
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param
 
 - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`.
 
-- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the group-level WLS coefficient influence function is `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`. The group-level IF is then attributed to observation level via **one of two allocators, chosen by variance helper** so each path preserves byte-identity for its aggregation rule: (1) **Binder TSL** (`compute_survey_if_variance`) uses the **cell-period single-cell allocator** — at each horizon `l_h`, `ψ_g` is assigned in full to the post-period cell `(g, out_idx)` with `out_idx = first_switch_idx[g] - 1 + l_h` and expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})` for obs in that cell, zero elsewhere (matches the DID_l post-period convention in the Survey IF expansion Note below). Under PSU=group per-observation distribution differs from the legacy `ψ_i = ψ_g * (w_i / W_g)`, but PSU-level aggregates telescope to the same `ψ_g` — so Binder TSL variance is byte-identical to the pre-cell-period release under PSU=group. Under within-group-varying PSU mass lands in the post-period PSU of the transition, which is what Binder TSL needs. An **empty post-period cell under zero-weight obs** (all obs at `(g, out_idx)` have `w_i = 0` despite `N > 0`) drops the group's contribution, matching the ATT cell allocator's convention; the pre-cell-period path diverged here by redistributing mass to other cells of the group. (2) **Rao-Wu replicate** (`compute_replicate_if_variance`) uses the **legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)`. Replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at the observation level, so moving `ψ_g` mass onto the post-period cell only would silently change the replicate SE whenever a replicate column's ratios vary within group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). Keeping the legacy allocator on this branch preserves byte-identity of replicate SE across every previously-supported fit; replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (bootstrap):** Combining `heterogeneity=` with `n_bootstrap > 0` and within-group-varying PSU still raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap uses a group-level PSU map until the follow-up PR extends it.
+- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the group-level WLS coefficient influence function is `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`. The group-level IF is then attributed to observation level via **one of two allocators, chosen by variance helper** so each path preserves byte-identity for its aggregation rule: (1) **Binder TSL** (`compute_survey_if_variance`) uses the **cell-period single-cell allocator** — at each horizon `l_h`, `ψ_g` is assigned in full to the post-period cell `(g, out_idx)` with `out_idx = first_switch_idx[g] - 1 + l_h` and expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})` for obs in that cell, zero elsewhere (matches the DID_l post-period convention in the Survey IF expansion Note below). Under PSU=group per-observation distribution differs from the legacy `ψ_i = ψ_g * (w_i / W_g)`, but PSU-level aggregates telescope to the same `ψ_g` — so Binder TSL variance is byte-identical to the pre-cell-period release under PSU=group. Under within-group-varying PSU mass lands in the post-period PSU of the transition, which is what Binder TSL needs. An **empty post-period cell under zero-weight obs** (all obs at `(g, out_idx)` have `w_i = 0` despite `N > 0`) drops the group's contribution, matching the ATT cell allocator's convention; the pre-cell-period path diverged here by redistributing mass to other cells of the group. (2) **Rao-Wu replicate** (`compute_replicate_if_variance`) uses the **legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)`. Replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at the observation level, so moving `ψ_g` mass onto the post-period cell only would silently change the replicate SE whenever a replicate column's ratios vary within group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). Keeping the legacy allocator on this branch preserves byte-identity of replicate SE across every previously-supported fit; replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (bootstrap):** Heterogeneity inference is analytical (no bootstrap path). When `n_bootstrap > 0` is combined with `heterogeneity=`, the main ATT surfaces receive bootstrap SE/CI (via the cell-level wild PSU bootstrap described in the survey + bootstrap contract Note below) while `heterogeneity_effects` continues to use the Binder TSL / Rao-Wu analytical SE described above. No gate; the two inference paths are independent.
 
 - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning.
 
diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py
@@ -2003,16 +2003,21 @@ def test_bootstrap_cell_level_with_all_zero_weight_group_does_not_crash(self):
         assert np.isfinite(res.bootstrap_results.overall_se)
 
     def test_bootstrap_zero_weight_group_equivalent_to_removing_it(self):
-        """Fixture A: 9 groups (1 all-zero-weighted + 8 positive).
-        Fixture B: 8 groups (same panel without the zero-weight
-        group). Under the fix, an eligible group that has no
-        positive-weight cells contributes nothing to the bootstrap
-        (its `psu_codes_per_cell` row is all sentinel). Both fits
-        therefore produce byte-identical bootstrap SE at the same
-        seed. Without the fix, the `valid_map` gate in fit() would
-        disable the entire PSU-aware path when any row is all
-        sentinel, silently dropping to unclustered group-level for
-        the other groups.
+        """Fixture A: 9 groups (1 all-zero-weighted + 8 positive)
+        with **within-group-varying PSU** so the dispatcher routes
+        through the cell-level path. Fixture B: 8 groups (same panel
+        without the zero-weight group), same varying PSU. Under the
+        fix, an eligible group with no positive-weight cells
+        contributes nothing to the bootstrap (its row of
+        `psu_codes_per_cell` is all sentinel), so both fits produce
+        byte-identical bootstrap SE at the same seed. Without the
+        fix, the `valid_map` gate disabled the entire PSU-aware
+        path — silently dropping fixture A to unclustered group-
+        level bootstrap while fixture B correctly ran the cell-
+        level path. Using `psu=group` (a within-group-constant PSU)
+        would not exercise this regression because the buggy and
+        correct paths collapse to the same identity-draw structure
+        under PSU=group — we deliberately use varying PSU here.
         """
         def _make(include_zero_group: bool) -> pd.DataFrame:
             rows = []
@@ -2023,13 +2028,16 @@ def _make(include_zero_group: bool) -> pd.DataFrame:
                     pw = 0.0 if (include_zero_group and g == 8) else 1.0
                     d = 1 if (f is not None and t >= f) else 0
                     y = float(g) + 0.1 * t + 1.0 * d
+                    # Within-group-varying PSU (period parity per
+                    # group) — exercises the cell-level dispatcher.
+                    psu = int(g) * 2 + (int(t) % 2)
                     rows.append({
                         "group": int(g),
                         "period": int(t),
                         "treatment": int(d),
                         "outcome": y,
                         "pw": pw,
-                        "psu": int(g),  # PSU=group, constant path
+                        "psu": psu,
                     })
             return pd.DataFrame(rows)
 
@@ -2053,8 +2061,9 @@ def _make(include_zero_group: bool) -> pd.DataFrame:
         assert np.isfinite(se_a) and np.isfinite(se_b)
         assert se_a == pytest.approx(se_b, rel=0.0, abs=1e-15), (
             f"Bootstrap SE must match when a zero-weight eligible "
-            f"group is added (fix P0 #1 — no silent dropback to "
-            f"unclustered group-level). Got SE_with_zero={se_a!r}, "
+            f"group is added under within-group-varying PSU (fix "
+            f"P0 #1 — no silent dropback to unclustered group-"
+            f"level). Got SE_with_zero={se_a!r}, "
             f"SE_without_zero={se_b!r}."
         )