Address PR #363 R2 review (2 P1)

igerber · claude · igerber · commit 4ecc6f437fbd · 2026-04-24T17:06:23.000-04:00
R2 P1 (methodology): reject SurveyDesign(lonely_psu='adjust') with
singleton strata in _sup_t_multiplier_bootstrap. The bootstrap helper
pools singletons into a pseudo-stratum with NONZERO multipliers, but
compute_survey_if_variance centers singleton PSU scores around the
global mean — without the matching pseudo-stratum centering transform
in the bootstrap, the simultaneous band target diverges from the
analytical Binder-TSL variance. Clear NotImplementedError points
users to lonely_psu='remove' (matches the 'remove' analytical target)
or cband=False (skips bootstrap). 'remove' / 'certainty' continue to
work unchanged. Deferred transform tracked for follow-up.

R2 P1 (code quality): reject cluster= + weights/survey= on
design='mass_point' on both static and event-study paths. The
weighted path composes Binder-TSL variance via
compute_survey_if_variance which was silently overriding the CR1
sandwich while result metadata still advertised vcov_type='cr1' and
cluster_name=<col>. Clean NotImplementedError with a pointer to the
two supported combinations: cluster= alone (unweighted CR1) or
survey/weights alone (Binder-TSL). Combined cluster-robust survey
inference requires a derivation not yet in scope.

Regression tests (+4):
- test_mass_point_survey_plus_cluster_rejected_static
- test_mass_point_survey_plus_cluster_rejected_event_study
- test_lonely_psu_adjust_with_singletons_rejected_on_cband
- test_stratified_h1_sup_t_matches_analytical (H=1 quantile lock
  under n_strata=4, psu_per_unit: q=1.985, within MC noise of
  Phi^-1(0.975)=1.96)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/had.py b/diff_diff/had.py
@@ -2062,6 +2062,45 @@ def _sup_t_multiplier_bootstrap(
     )
 
     if use_survey_bootstrap:
+        # Review R2 P1: lonely_psu="adjust" pools singleton strata into a
+        # pseudo-stratum with NONZERO multipliers in the bootstrap helper,
+        # but the analytical compute_survey_if_variance target for
+        # singletons is centered at the global mean of PSU scores. Since
+        # this PR's stratum-demean loop only matches the within-stratum
+        # Binder-TSL target (and skips singletons assuming zero
+        # contribution), pooled singleton multipliers would diverge from
+        # the analytical variance without an additional pseudo-stratum
+        # centering step. Reject with a clear pointer until the matching
+        # transform is derived; "remove" / "certainty" (singleton
+        # multipliers forced to zero) are fine.
+        _lonely = getattr(resolved_survey, "lonely_psu", "remove")
+        if _lonely == "adjust":
+            strata_arr = resolved_survey.strata
+            psu_arr = resolved_survey.psu
+            _has_singleton = False
+            if strata_arr is not None:
+                for h in np.unique(strata_arr):
+                    mask_h = np.asarray(strata_arr) == h
+                    if psu_arr is not None:
+                        n_psu_h = int(np.unique(np.asarray(psu_arr)[mask_h]).shape[0])
+                    else:
+                        n_psu_h = int(mask_h.sum())
+                    if n_psu_h < 2:
+                        _has_singleton = True
+                        break
+            if _has_singleton:
+                raise NotImplementedError(
+                    "HeterogeneousAdoptionDiD event-study sup-t bootstrap "
+                    "does not yet support SurveyDesign(lonely_psu='adjust') "
+                    "with singleton strata: the bootstrap helper pools "
+                    "singletons with nonzero multipliers while the "
+                    "analytical Binder-TSL target centers singleton PSU "
+                    "scores at the global mean, and the matching "
+                    "pseudo-stratum centering transform has not been "
+                    "implemented. Use lonely_psu='remove' (drops singleton "
+                    "contributions; matches the 'remove' analytical target) "
+                    "or pass cband=False to skip the simultaneous band."
+                )
         psu_weights, psu_ids = generate_survey_multiplier_weights_batch(
             n_bootstrap, resolved_survey, bootstrap_weights, rng
         )
@@ -3260,6 +3299,25 @@ def fit(
             vcov_label: Optional[str] = None
             cluster_label: Optional[str] = None
         elif resolved_design == "mass_point":
+            # Review R2 P1: mass-point + weights + cluster is a silent
+            # inference mismatch because the weighted path overrides the
+            # CR1 SE with Binder-TSL while result metadata still reports
+            # CR1. Reject the combination front-door until a combined
+            # cluster + survey variance is derived. Unweighted CR1
+            # continues to work unchanged; weighted pweight sandwich
+            # without cluster continues to work unchanged.
+            if cluster_arg is not None and weights_unit_full is not None:
+                raise NotImplementedError(
+                    f"cluster={cluster_arg!r} + survey=/weights= on "
+                    f"design='mass_point' is not yet supported: the "
+                    f"weighted path composes Binder-TSL variance and "
+                    f"would silently override the CR1 cluster-robust "
+                    f"sandwich. Pass either cluster= alone (unweighted "
+                    f"CR1) or survey=/weights= alone (weighted 2SLS "
+                    f"pweight sandwich → Binder-TSL under survey=); "
+                    f"combined cluster-robust survey inference is "
+                    f"deferred to a follow-up PR."
+                )
             if vcov_type_arg is None:
                 # Backward-compat: robust=True -> hc1, robust=False -> classical.
                 vcov_requested = "hc1" if robust_arg else "classical"
@@ -3858,6 +3916,22 @@ def _fit_event_study(
         # ---- Extract cluster IDs on mass-point path only ----
         cluster_arr: Optional[np.ndarray] = None
         if resolved_design == "mass_point" and cluster_arg is not None:
+            # Review R2 P1: reject cluster= + weights/survey on
+            # mass-point (mirrors the static-path rejection) —
+            # the weighted path would compose Binder-TSL variance
+            # and silently override CR1 while result metadata still
+            # claims cluster-robust inference.
+            if weights_unit_full is not None:
+                raise NotImplementedError(
+                    f"cluster={cluster_arg!r} + survey=/weights= on "
+                    f"design='mass_point' (event-study) is not yet "
+                    f"supported: the weighted path composes Binder-TSL "
+                    f"variance and would silently override the CR1 "
+                    f"cluster-robust sandwich. Pass either cluster= "
+                    f"alone (unweighted CR1) or survey=/weights= alone; "
+                    f"combined cluster-robust survey inference is "
+                    f"deferred to a follow-up PR."
+                )
             _, _, cluster_arr, _, _ = _aggregate_multi_period_first_differences(
                 data_filtered,
                 outcome_col,
diff --git a/tests/test_had.py b/tests/test_had.py
@@ -4846,3 +4846,147 @@ def test_weights_nonrange_index_aligned_positionally(self):
             )
         np.testing.assert_allclose(r_range.att, r_shifted.att, atol=1e-12, rtol=1e-12)
         np.testing.assert_allclose(r_range.se, r_shifted.se, atol=1e-12, rtol=1e-12)
+
+    def test_mass_point_survey_plus_cluster_rejected_static(self):
+        """Review R2 P1: mass-point + (weights= or survey=) + cluster=
+        must raise NotImplementedError on the static path. Previously
+        the weighted path silently overrode the CR1 SE with Binder-TSL
+        while the result still reported vcov_type='cr1'."""
+        rng = np.random.default_rng(0)
+        G = 200
+        d = np.concatenate([np.full(40, 0.3), rng.uniform(0.3, 1.0, G - 40)])
+        rng.shuffle(d)
+        dy = 2.0 * d + 0.3 * rng.standard_normal(G)
+        panel = pd.DataFrame(
+            {
+                "unit": np.repeat(np.arange(G), 2),
+                "period": np.tile([1, 2], G),
+                "dose": np.column_stack([np.zeros(G), d]).ravel(),
+                "outcome": np.column_stack([np.zeros(G), dy]).ravel(),
+                "state": np.repeat(np.arange(G) // 20, 2),
+            }
+        )
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            est = HeterogeneousAdoptionDiD(design="mass_point", vcov_type="hc1", cluster="state")
+            with pytest.raises(NotImplementedError, match="cluster"):
+                est.fit(
+                    panel,
+                    "outcome",
+                    "dose",
+                    "period",
+                    "unit",
+                    weights=np.ones(panel.shape[0]),
+                )
+
+    def test_mass_point_survey_plus_cluster_rejected_event_study(self):
+        """Review R2 P1 (event-study arm): same rejection must fire on
+        the multi-period dispatch."""
+        rng = np.random.default_rng(1)
+        G, T = 150, 4
+        d_mp = np.concatenate([np.full(30, 0.3), rng.uniform(0.3, 1.0, G - 30)])
+        rng.shuffle(d_mp)
+        rows = []
+        for t in range(T):
+            for g in range(G):
+                dose = d_mp[g] if t == T - 1 else 0.0
+                y = 0.2 * t + (2.0 * dose if t == T - 1 else 0.0) + 0.5 * rng.standard_normal()
+                rows.append((g, t, dose, y, g // 25))
+        panel = pd.DataFrame(rows, columns=["unit", "period", "dose", "outcome", "state"])
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            est = HeterogeneousAdoptionDiD(design="mass_point", vcov_type="hc1", cluster="state")
+            with pytest.raises(NotImplementedError, match="cluster"):
+                est.fit(
+                    panel,
+                    "outcome",
+                    "dose",
+                    "period",
+                    "unit",
+                    aggregate="event_study",
+                    weights=np.ones(panel.shape[0]),
+                )
+
+    def test_lonely_psu_adjust_with_singletons_rejected_on_cband(self):
+        """Review R2 P1: sup-t bootstrap rejects lonely_psu='adjust'
+        when there are singleton strata, because the bootstrap helper
+        pools singletons with nonzero multipliers but the analytical
+        target centers them at the global mean — mismatch."""
+        from diff_diff.had import _sup_t_multiplier_bootstrap
+        from diff_diff.survey import ResolvedSurveyDesign
+
+        rng = np.random.default_rng(0)
+        G = 80
+        # 3 strata, two with multiple PSUs, one singleton.
+        strata = np.array([1] * 30 + [2] * 30 + [3] * 20)
+        # PSUs: 10 in stratum 1, 10 in stratum 2, 1 in stratum 3 (singleton).
+        psu = np.concatenate(
+            [np.arange(10).repeat(3), (10 + np.arange(10)).repeat(3), np.full(20, 20)]
+        )
+        adjust_resolved = ResolvedSurveyDesign(
+            weights=np.ones(G),
+            weight_type="pweight",
+            strata=strata,
+            psu=psu,
+            fpc=None,
+            n_strata=3,
+            n_psu=21,
+            lonely_psu="adjust",
+            combined_weights=True,
+            mse=False,
+        )
+        psi = rng.standard_normal((G, 2))
+        with pytest.raises(NotImplementedError, match="lonely_psu='adjust'"):
+            _sup_t_multiplier_bootstrap(
+                psi,
+                np.zeros(2),
+                np.array([1.0, 1.0]),
+                adjust_resolved,
+                n_bootstrap=200,
+                alpha=0.05,
+                seed=0,
+            )
+
+    def test_stratified_h1_sup_t_matches_analytical(self):
+        """Review R2 P1 coverage: stratum-centered H=1 bootstrap variance
+        matches the analytical Binder-TSL target (q ≈ 1.96 at H=1)."""
+        from diff_diff.had import _sup_t_multiplier_bootstrap
+        from diff_diff.survey import ResolvedSurveyDesign, compute_survey_if_variance
+
+        rng = np.random.default_rng(7)
+        G = 400
+        strata = np.repeat(np.arange(4), G // 4)
+        psu = np.arange(G)
+        resolved = ResolvedSurveyDesign(
+            weights=np.ones(G),
+            weight_type="pweight",
+            strata=strata,
+            psu=psu,
+            fpc=None,
+            n_strata=4,
+            n_psu=G,
+            lonely_psu="remove",
+            combined_weights=True,
+            mse=False,
+        )
+        psi = rng.standard_normal((G, 1))
+        V_analytical = compute_survey_if_variance(psi[:, 0], resolved)
+        se_analytical = np.sqrt(V_analytical)
+        q, _, _, _ = _sup_t_multiplier_bootstrap(
+            psi,
+            np.zeros(1),
+            np.array([se_analytical]),
+            resolved,
+            n_bootstrap=5000,
+            alpha=0.05,
+            seed=42,
+        )
+        # At H=1 the sup collapses to the marginal; with stratum-
+        # centered + small-sample-corrected perturbations the bootstrap
+        # distribution is ~ N(0, 1), so q → Phi^-1(0.975) = 1.96.
+        # B=5000 MC noise on the tail quantile is ~0.03-0.05.
+        assert abs(q - 1.96) < 0.15, (
+            f"Stratified H=1 sup-t should match Normal quantile 1.96 up to "
+            f"MC noise; got q={q:.4f}. Likely a stratum-centering bug in "
+            f"_sup_t_multiplier_bootstrap."
+        )