Densify PSU codes over eligible subset + always populate per-cell tensor

igerber · claude · igerber · commit ee0cc544958e · 2026-04-19T10:13:53.000-04:00
Addresses two P0 correctness regressions in the PR-4 bootstrap PSU-map plumbing flagged by CI review. **P0 #1 - valid_map gate discarded the per-cell tensor too eagerly.** When any variance-eligible group had no positive-weight cells (all-sentinel row in psu_codes_per_cell), the old code set valid_map=False and left BOTH group_id_to_psu_code_bootstrap AND psu_codes_per_cell_bootstrap as None. The bootstrap then silently dropped to the unclustered group-level path instead of excluding only that group's empty row. Fix: always populate psu_codes_per_cell_bootstrap once the tensor is built; the cell-level path already masks out -1 cells at unroll time. Always populate group_id_to_psu_code_bootstrap with a per-group code (use placeholder 0 for all-sentinel rows since those groups have no IF mass and the multiplier they receive is irrelevant on either the legacy or the cell-level path). **P0 #2 - dense PSU codes factorized over non-eligible subset.** `np.unique(obs_psu_codes[pos_mask_boot])` previously included PSU labels from groups that were filtered out of _eligible_group_ids (e.g., singleton-baseline-excluded groups). The excluded groups' PSUs contributed dense codes that formed gaps in the eligible subset's map. Downstream `_generate_psu_or_group_weights` computes `n_psu = max(code) + 1` and triggers the identity fast path when `n_psu >= n_groups_target`. A gapped map like `[1, 1]` or `[0, 2, 2]` silently activated independent-draws clustering for eligible groups that should have shared a multiplier. Fix: restrict the np.unique factorization to the eligible-subset positive-weight obs only (`elig_obs_mask = pos_mask_boot & (g_idx_arr >= 0) & (t_idx_arr >= 0)`), so the dense code domain exactly matches the PSUs actually used by variance-eligible groups.
Tests: - `test_bootstrap_zero_weight_group_equivalent_to_removing_it`: fit with vs without an all-zero-weight eligible group must produce byte-identical bootstrap SE at the same seed (byte-identity would have failed before the P0 #1 fix because valid_map flipped the PSU-aware path off for the with-zero-group fit). - `test_bootstrap_dense_codes_under_singleton_baseline_excluded_group`: spies on the group_id_to_psu_code dict passed to `_compute_dcdh_bootstrap` under a fixture with an always-treated singleton-baseline group and strictly-coarser PSU among eligible groups. Asserts the dict's values form a contiguous `[0, n_unique-1]` range (no gaps from the excluded group's PSU), and that eligible groups sharing a PSU label receive the same dense code. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -2314,81 +2314,90 @@ def fit(
                     _obs_survey_info["weights"], dtype=np.float64
                 )
                 pos_mask_boot = obs_weights_boot > 0
+                gid_to_idx = {
+                    gid: i for i, gid in enumerate(_eligible_group_ids)
+                }
+                tid_to_idx = {t: i for i, t in enumerate(all_periods)}
+                n_elig_boot = len(_eligible_group_ids)
+                n_per_boot = len(all_periods)
+                g_idx_arr = np.array(
+                    [gid_to_idx.get(g, -1) for g in obs_gids_boot],
+                    dtype=np.int64,
+                )
+                t_idx_arr = np.array(
+                    [tid_to_idx.get(t, -1) for t in obs_tids_boot],
+                    dtype=np.int64,
+                )
                 # Factor PSU labels to dense int codes over the
-                # positive-weight subpopulation. Shared code domain
-                # for both the per-cell tensor and the group-level
-                # dict below.
-                pos_psu_labels = obs_psu_codes[pos_mask_boot]
+                # **eligible-subset** positive-weight observations only
+                # (not the full positive-weight population). Restricting
+                # to eligible obs ensures the resulting dense codes
+                # range ONLY over PSUs actually used by variance-
+                # eligible groups, so downstream n_psu = max(code) + 1
+                # is exact: no gaps from singleton-baseline-excluded
+                # groups that would silently trigger the identity
+                # fast path in `_generate_psu_or_group_weights`.
+                elig_obs_mask = (
+                    pos_mask_boot & (g_idx_arr >= 0) & (t_idx_arr >= 0)
+                )
+                elig_psu_labels = obs_psu_codes[elig_obs_mask]
                 dense_per_row: Optional[np.ndarray] = None
-                if pos_psu_labels.size > 0:
-                    _, pos_dense_codes = np.unique(
-                        pos_psu_labels, return_inverse=True,
+                if elig_psu_labels.size > 0:
+                    _, elig_dense_codes = np.unique(
+                        elig_psu_labels, return_inverse=True,
                     )
-                    pos_dense_codes = np.asarray(pos_dense_codes, dtype=np.int64)
+                    elig_dense_codes = np.asarray(elig_dense_codes, dtype=np.int64)
                     dense_per_row = np.full(
                         len(obs_psu_codes), -1, dtype=np.int64,
                     )
-                    dense_per_row[pos_mask_boot] = pos_dense_codes
+                    dense_per_row[elig_obs_mask] = elig_dense_codes
 
                 # Per-cell PSU tensor: (n_eligible, n_periods), -1 sentinel
-                # for ineligible / zero-weight cells.
+                # for ineligible / zero-weight cells. Populated
+                # unconditionally when `dense_per_row` exists — a row
+                # that ends up all-sentinel (eligible group with no
+                # positive-weight obs) is masked out at unroll time,
+                # not by discarding the entire tensor. See also the
+                # dispatcher's `_psu_varies_within_group` helper which
+                # ignores sentinel entries row-wise.
                 if dense_per_row is not None:
-                    gid_to_idx = {
-                        gid: i for i, gid in enumerate(_eligible_group_ids)
-                    }
-                    tid_to_idx = {t: i for i, t in enumerate(all_periods)}
-                    n_elig_boot = len(_eligible_group_ids)
-                    n_per_boot = len(all_periods)
                     psu_codes_per_cell = np.full(
                         (n_elig_boot, n_per_boot), -1, dtype=np.int64,
                     )
-                    g_idx_arr = np.array(
-                        [gid_to_idx.get(g, -1) for g in obs_gids_boot],
-                        dtype=np.int64,
-                    )
-                    t_idx_arr = np.array(
-                        [tid_to_idx.get(t, -1) for t in obs_tids_boot],
-                        dtype=np.int64,
-                    )
-                    valid_obs_boot = (
-                        pos_mask_boot
-                        & (g_idx_arr >= 0)
-                        & (t_idx_arr >= 0)
-                    )
                     psu_codes_per_cell[
-                        g_idx_arr[valid_obs_boot],
-                        t_idx_arr[valid_obs_boot],
-                    ] = dense_per_row[valid_obs_boot]
-
-                    # Group-level dict: first non-sentinel code per row.
-                    # Under within-group-constant PSU this matches the
-                    # pre-PR-4 "first label per group" convention
-                    # bit-for-bit; under varying PSU the dispatcher
-                    # routes to the cell-level path which uses the
-                    # full `psu_codes_per_cell` tensor.
+                        g_idx_arr[elig_obs_mask],
+                        t_idx_arr[elig_obs_mask],
+                    ] = dense_per_row[elig_obs_mask]
+                    psu_codes_per_cell_bootstrap = psu_codes_per_cell
+
+                    # Group-level dict: one PSU code per eligible
+                    # group. For rows that are all-sentinel (eligible
+                    # group has no positive-weight obs), assign code
+                    # `0` as a harmless placeholder — the group's IF
+                    # mass is zero, so the bootstrap multiplier it
+                    # receives is irrelevant on either the legacy or
+                    # the cell-level path. Always populate the dict
+                    # so the legacy group-level path keeps clustering
+                    # correctly when psu_varies=False even if some
+                    # eligible groups happen to have no positive-
+                    # weight obs.
                     group_psu_labels: List[int] = []
-                    valid_map = True
                     for i in range(n_elig_boot):
                         row = psu_codes_per_cell[i]
                         valid = row[row >= 0]
                         if valid.size == 0:
-                            valid_map = False
-                            break
-                        group_psu_labels.append(int(valid[0]))
-                    if (
-                        valid_map
-                        and len(group_psu_labels) == n_groups_for_overall_var
-                    ):
-                        group_id_to_psu_code_bootstrap = {
-                            gid: code
-                            for gid, code in zip(
-                                _eligible_group_ids, group_psu_labels
-                            )
-                        }
-                        eligible_group_ids_bootstrap = np.asarray(
-                            _eligible_group_ids
+                            group_psu_labels.append(0)
+                        else:
+                            group_psu_labels.append(int(valid[0]))
+                    group_id_to_psu_code_bootstrap = {
+                        gid: code
+                        for gid, code in zip(
+                            _eligible_group_ids, group_psu_labels
                         )
-                        psu_codes_per_cell_bootstrap = psu_codes_per_cell
+                    }
+                    eligible_group_ids_bootstrap = np.asarray(
+                        _eligible_group_ids
+                    )
 
             br = self._compute_dcdh_bootstrap(
                 n_groups_for_overall=n_groups_for_overall_var,
diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py
@@ -2001,3 +2001,151 @@ def test_bootstrap_cell_level_with_all_zero_weight_group_does_not_crash(self):
         # Bootstrap SE should be finite (zero-weight group does not
         # disturb the other groups' contributions).
         assert np.isfinite(res.bootstrap_results.overall_se)
+
+    def test_bootstrap_zero_weight_group_equivalent_to_removing_it(self):
+        """Fixture A: 9 groups (1 all-zero-weighted + 8 positive).
+        Fixture B: 8 groups (same panel without the zero-weight
+        group). Under the fix, an eligible group that has no
+        positive-weight cells contributes nothing to the bootstrap
+        (its `psu_codes_per_cell` row is all sentinel). Both fits
+        must therefore agree on the bootstrap SE at the same seed
+        (asserted with rel=0.0, abs=1e-15). Without the fix, the
+        `valid_map` gate in fit() would disable the entire PSU-aware
+        path when any row is all sentinel, silently dropping to
+        unclustered group-level for the other groups.
+        """
+        def _make(include_zero_group: bool) -> pd.DataFrame:
+            rows = []
+            n_groups = 9 if include_zero_group else 8
+            for g in range(n_groups):
+                f = 3 if g < 4 else None  # first treated period (None: never)
+                for t in range(5):
+                    pw = 0.0 if (include_zero_group and g == 8) else 1.0
+                    d = 1 if (f is not None and t >= f) else 0
+                    y = float(g) + 0.1 * t + 1.0 * d
+                    rows.append({
+                        "group": int(g),
+                        "period": int(t),
+                        "treatment": int(d),
+                        "outcome": y,
+                        "pw": pw,
+                        "psu": int(g),  # PSU == group id (constant within group)
+                    })
+            return pd.DataFrame(rows)
+
+        sd = SurveyDesign(weights="pw", psu="psu")
+        res_a = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=7).fit(
+            _make(include_zero_group=True),
+            outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            survey_design=sd,
+        )
+        res_b = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=7).fit(
+            _make(include_zero_group=False),
+            outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            survey_design=sd,
+        )
+        assert res_a.bootstrap_results is not None
+        assert res_b.bootstrap_results is not None
+        se_a = float(res_a.bootstrap_results.overall_se)
+        se_b = float(res_b.bootstrap_results.overall_se)
+        assert np.isfinite(se_a) and np.isfinite(se_b)
+        assert se_a == pytest.approx(se_b, rel=0.0, abs=1e-15), (
+            f"Bootstrap SE must match when a zero-weight eligible "
+            f"group is added (fix P0 #1 — no silent dropback to "
+            f"unclustered group-level). Got SE_with_zero={se_a!r}, "
+            f"SE_without_zero={se_b!r}."
+        )
+
+    def test_bootstrap_dense_codes_under_singleton_baseline_excluded_group(self):
+        """Regression for P0 #2: when a group is singleton-baseline-
+        excluded (e.g., an always-treated group whose baseline D=1
+        has no peer), its PSU label must NOT pollute the dense code
+        factorization used by `_compute_dcdh_bootstrap`. Otherwise
+        eligible groups that share a PSU receive gapped dense codes
+        (e.g., `[1, 1]`), `_generate_psu_or_group_weights` computes
+        `n_psu = max + 1 = 2 == n_groups_target = 2`, and the
+        identity fast path wrongly triggers — giving those eligible
+        groups independent multiplier draws instead of a shared
+        one. Assertion: instrument the call to capture the
+        `group_id_to_psu_code` dict actually passed and confirm its
+        values form a contiguous range `[0, n_unique - 1]`.
+        """
+        # Fixture: one always-treated group (D=1 in every period, so
+        # singleton-baseline-excluded), plus eligible groups that share
+        # a PSU label while the excluded group has a different PSU.
+        rows = []
+        for g in range(5):
+            for t in range(5):
+                if g == 0:
+                    d = 1  # always-treated; baseline D=1 singleton
+                    psu = 100  # distinct PSU for the excluded group
+                else:
+                    d = 1 if t >= 3 else 0  # joiners at period 3
+                    # Groups 1, 2 share PSU=200; groups 3, 4 share PSU=300.
+                    psu = 200 if g in (1, 2) else 300
+                rows.append({
+                    "group": int(g),
+                    "period": int(t),
+                    "treatment": int(d),
+                    "outcome": float(g) + 0.1 * t + 0.5 * d,
+                    "pw": 1.0,
+                    "psu": psu,
+                })
+        df_ = pd.DataFrame(rows)
+        sd = SurveyDesign(weights="pw", psu="psu")
+
+        captured: dict = {}
+
+        est = ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1)
+        original_bootstrap = est._compute_dcdh_bootstrap
+
+        def _spy(**kwargs):  # record the PSU dict, then call through
+            captured["group_id_to_psu_code"] = kwargs.get(
+                "group_id_to_psu_code"
+            )
+            return original_bootstrap(**kwargs)
+
+        est._compute_dcdh_bootstrap = _spy  # type: ignore[method-assign]
+
+        import warnings as _w
+        with _w.catch_warnings():
+            _w.simplefilter("ignore")  # mutes all warnings (singleton-baseline expected)
+            est.fit(
+                df_, outcome="outcome", group="group",
+                time="period", treatment="treatment",
+                survey_design=sd,
+            )
+
+        dict_passed = captured["group_id_to_psu_code"]
+        assert dict_passed is not None, (
+            "bootstrap received group_id_to_psu_code=None — the "
+            "PSU-aware path was disabled instead of routing to the "
+            "cell/legacy path via densified codes."
+        )
+        codes = sorted(set(dict_passed.values()))
+        # Eligible groups share only two PSUs (200 for g=1,2;
+        # 300 for g=3,4). Dense codes must be [0, 1], NOT [1, 2]
+        # (which would happen if the excluded g=0's PSU=100 were
+        # included in the factorization and took code 0).
+        assert codes == list(range(len(codes))), (
+            f"group_id_to_psu_code values must be contiguous "
+            f"dense codes starting at 0, got {codes}. A non-"
+            f"contiguous range signals the excluded group's PSU "
+            f"polluted the dense factorization (P0 #2 regression)."
+        )
+        # Sanity: eligible groups 1, 2 must share a code (PSU=200),
+        # and eligible groups 3, 4 must share a code (PSU=300).
+        assert dict_passed[1] == dict_passed[2], (
+            "Groups 1 and 2 share PSU=200 and must receive the same "
+            "dense code under correct densification."
+        )
+        assert dict_passed[3] == dict_passed[4], (
+            "Groups 3 and 4 share PSU=300 and must receive the same "
+            "dense code."
+        )
+        assert dict_passed[1] != dict_passed[3], (
+            "Groups in PSU=200 and PSU=300 must receive distinct "
+            "dense codes."
+        )