Address PR #355 R4 P0 + P3: resolve-normalize pweight-only weights + tighten boot_idx slice test

igerber · claude · igerber · commit 676d8317de73 · 2026-04-24T06:35:29.000-04:00
R4 P0 (scale-invariance): the pweight-only bootstrap branch was sourcing
w_control / w_treated from raw panel-column weights via
_extract_unit_survey_weights. The weighted-FW bootstrap objective is not
scale-invariant in rw (loss scales as rw^2 via A·diag(rw), reg scales as
rw), so two equivalent designs w and c*w could produce different
bootstrap SE / p-value / CI with no warning. Fix: source w_control /
w_treated from resolved_survey_unit.weights, which SurveyDesign.resolve()
normalizes to mean=1 (survey.py L189-L203). Placebo / jackknife paths
also consume the same w_control / w_treated but are scale-invariant, so
their numerics are unchanged.

R4 P3 (test tightening): the boot_idx × Rao-Wu regression test only asserted
that captured rw values stayed within [1, 15], the value range of
known_rw[:n_control] — too weak to catch permutation / deduplication
regressions in the slice order. Tighten it by reproducing the bootstrap RNG
stream externally (possible because fake_rao_wu doesn't consume rng) and
asserting exact equality between each captured rw_control vector and
known_rw[:n_control][boot_idx[boot_is_control]].

New regression test: test_bootstrap_scale_invariance_under_pweight_rescaling
fits the same panel under SurveyDesign("wt") vs SurveyDesign("wt_scaled")
(10x rescale) and asserts SE, p-value, CI match to machine-epsilon tolerance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py
@@ -292,7 +292,6 @@ def fit(  # type: ignore[override]
 
         # Resolve survey design
         from diff_diff.survey import (
-            _extract_unit_survey_weights,
             _resolve_survey_for_fit,
             _validate_unit_constant_survey,
         )
@@ -426,22 +425,37 @@ def fit(  # type: ignore[override]
         # consumes per draw.
         if resolved_survey is not None:
             _validate_unit_constant_survey(data, unit, survey_design)
-            w_treated = _extract_unit_survey_weights(data, unit, survey_design, treated_units)
-            w_control = _extract_unit_survey_weights(data, unit, survey_design, control_units)
             # Collapse to unit level for the bootstrap survey path. The
             # row order is [control_units..., treated_units...] so
             # boot_rw[:n_control] / boot_rw[n_control:] line up with the
             # bootstrap loop's column ordering. See
             # `collapse_survey_to_unit_level` in diff_diff/survey.py.
-            from diff_diff.survey import collapse_survey_to_unit_level
-            all_units_for_bootstrap = list(control_units) + list(treated_units)
             # Use `data` (not `working_data`) for the groupby — survey
             # design columns are unit-constant (validated above) and
             # covariate residualization doesn't shuffle row order, so the
             # collapse is invariant to which view we group on.
+            from diff_diff.survey import collapse_survey_to_unit_level
+            all_units_for_bootstrap = list(control_units) + list(treated_units)
             resolved_survey_unit = collapse_survey_to_unit_level(
                 resolved_survey, data, unit, all_units_for_bootstrap,
             )
+            # Source w_control / w_treated from resolved_survey_unit.weights
+            # rather than re-extracting raw panel columns. resolved_survey.weights
+            # is normalized to mean=1 by SurveyDesign.resolve() (survey.py L189-
+            # L203), so the weighted-FW bootstrap objective — which is NOT
+            # invariant to a global rescaling of rw — produces identical SE /
+            # p-value / CI under SurveyDesign(weights="w") vs "c*w" (PR #355
+            # R4 P0). Placebo / jackknife paths also consume w_control /
+            # w_treated but are scale-invariant (np.average divides by sum;
+            # ω_eff normalization likewise), so switching to resolved weights
+            # doesn't change their numerics.
+            n_control_for_split = len(control_units)
+            w_control = resolved_survey_unit.weights[:n_control_for_split].astype(
+                np.float64
+            )
+            w_treated = resolved_survey_unit.weights[n_control_for_split:].astype(
+                np.float64
+            )
         else:
             w_treated = None
             w_control = None
diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py
@@ -790,36 +790,120 @@ def capturing_helper(Y_pre_c, Y_pre_t_mean, rw, *args, **kwargs):
             sdid_mod, "compute_sdid_unit_weights_survey", capturing_helper
         )
 
-        SyntheticDiD(variance_method="bootstrap", n_bootstrap=10, seed=1).fit(
+        bootstrap_seed = 1
+        SyntheticDiD(
+            variance_method="bootstrap", n_bootstrap=10, seed=bootstrap_seed,
+        ).fit(
             df, outcome="outcome", treatment="treated",
             unit="unit", time="period",
             post_periods=[5, 6, 7],
             survey_design=SurveyDesign(weights="wt", strata="stratum", psu="psu"),
         )
 
-        # For each captured rw vector: its values must all come from the
-        # first n_control=15 positions of known_rw (never from the
-        # treated slice [15:18]). Values may repeat across the vector
-        # (bootstrap picks with replacement) but every element must be
-        # ≤ n_control (positions 1..15, since we built known_rw as
-        # arange(1, 19)). Catches either a slice-order bug (would mix in
-        # treated-slice values 16..18) or a rw-drift bug (would produce
-        # values outside [1, 15]).
-        assert len(captured) >= 1, "no FW calls captured — survey dispatch broken"
+        # Exact-equality check against a reproduced RNG stream (PR #355 R4
+        # P3). The captured rw vectors must match known_rw[:n_control]
+        # sliced by boot_idx[boot_is_control] value-for-value. Reproducing
+        # the bootstrap's rng externally works because:
+        #  - fake_rao_wu does NOT consume rng (just returns known_rw),
+        #    so the only per-draw rng advance is ``rng.choice(n_total, ...)``
+        #    which yields boot_idx;
+        #  - known_rw is strictly positive, so the zero-mass retry branch
+        #    (synthetic_did.py ``_bootstrap_se``) never fires;
+        #  - a 15/3 split makes the no-control and all-control retries
+        #    vanishingly rare.
+        # An exact-equality regression catches the sibling bugs the old
+        # range check missed: permuted indices, deduplicated boot_idx, or
+        # substituted ``resolved_survey_unit.weights`` lookup in place of
+        # the known_rw slice — any of which would silently change
+        # bootstrap SE.
         n_control = 15
-        control_slice_max = float(known_rw[:n_control].max())  # = 15.0
-        for i, rw_captured in enumerate(captured):
-            assert rw_captured.shape[0] > 0, f"draw {i}: empty rw"
-            assert rw_captured.max() <= control_slice_max, (
-                f"draw {i}: captured rw max = {rw_captured.max()} exceeds "
-                f"control-slice max ({control_slice_max}); slice order "
-                "regressed — Rao-Wu weights mixed with treated slice."
-            )
-            assert rw_captured.min() >= 1.0, (
-                f"draw {i}: captured rw min = {rw_captured.min()} below "
-                "known_rw[0]=1; weights drifted outside the Rao-Wu output."
+        rng_sim = np.random.default_rng(bootstrap_seed)
+        expected_slices = []
+        while len(expected_slices) < len(captured):
+            boot_idx = rng_sim.choice(n_total, size=n_total, replace=True)
+            boot_is_control = boot_idx < n_control
+            n_co_b = int(boot_is_control.sum())
+            if n_co_b == 0 or n_co_b == n_total:
+                continue
+            expected_slices.append(known_rw[:n_control][boot_idx[boot_is_control]])
+
+        assert len(captured) >= 1, "no FW calls captured — survey dispatch broken"
+        for i, (rw_captured, rw_expected) in enumerate(
+            zip(captured, expected_slices)
+        ):
+            np.testing.assert_array_equal(
+                rw_captured,
+                rw_expected,
+                err_msg=(
+                    f"draw {i}: captured rw_control differs from expected "
+                    f"known_rw[:n_control][boot_idx[boot_is_control]]. "
+                    "Regression in hybrid pairs-bootstrap + Rao-Wu "
+                    "slice ordering."
+                ),
             )
 
+    def test_bootstrap_scale_invariance_under_pweight_rescaling(self):
+        """Survey-bootstrap SE / p / CI are invariant to a global pweight rescaling.
+
+        ``SurveyDesign.resolve()`` normalizes pweights/aweights to mean=1
+        (survey.py L189-L203), which is the library's scale-invariance
+        contract for survey-weighted fits. This test fits the same SDID
+        panel under two SurveyDesigns — weights column ``"wt"`` vs a
+        10x-rescaled copy ``"wt_scaled"`` — and asserts bootstrap SE,
+        p-value, and CI agree to machine-epsilon tolerance.
+
+        Regression against PR #355 R4 P0: the initial PR #352 pweight-only
+        bootstrap branch bypassed the resolved (normalized) unit-level
+        weights and fed raw panel-column weights into the weighted-FW
+        objective. That objective is NOT invariant to a global rescale
+        of rw — the loss term scales as rw^2 (``A-tilde = A * diag(rw)``)
+        while the reg term scales as rw (``zeta^2 * sum rw * omega^2``) —
+        so any user who rescaled their pweight column (e.g. switched
+        units) would see silently different SEs. The fix
+        (synthetic_did.py ``fit()`` around the ``resolved_survey`` block)
+        sources ``w_control`` and ``w_treated`` from
+        ``resolved_survey_unit.weights`` (post-normalization) rather
+        than re-extracting raw weights via ``_extract_unit_survey_weights``.
+        Tolerance is near machine epsilon rather than exact because the
+        floating-point evaluation of ``(c*raw) * (n / (c*raw_sum))`` vs
+        ``raw * (n / raw_sum)`` can drift by ~1 ULP; a raw-weight fallback
+        would produce differences on the order of 1 or larger.
+        """
+        from diff_diff.survey import SurveyDesign
+
+        df = _make_panel(n_control=12, n_treated=3, seed=42)
+        unique_units = np.sort(df["unit"].unique())
+        unit_weights = np.linspace(0.5, 2.5, len(unique_units))
+        wt_map = dict(zip(unique_units, unit_weights))
+        df["wt"] = df["unit"].map(wt_map)
+        df["wt_scaled"] = df["wt"] * 10.0
+
+        kwargs = dict(
+            outcome="outcome", treatment="treated",
+            unit="unit", time="period",
+            post_periods=[5, 6, 7],
+        )
+        result_base = SyntheticDiD(
+            variance_method="bootstrap", n_bootstrap=50, seed=1,
+        ).fit(df, survey_design=SurveyDesign(weights="wt"), **kwargs)
+        result_scaled = SyntheticDiD(
+            variance_method="bootstrap", n_bootstrap=50, seed=1,
+        ).fit(df, survey_design=SurveyDesign(weights="wt_scaled"), **kwargs)
+
+        assert np.isfinite(result_base.se) and result_base.se > 0
+        np.testing.assert_allclose(
+            result_scaled.se, result_base.se, rtol=1e-13, atol=0,
+            err_msg="bootstrap SE is not invariant to pweight global rescaling",
+        )
+        np.testing.assert_allclose(
+            result_scaled.p_value, result_base.p_value, rtol=1e-12, atol=1e-14,
+            err_msg="bootstrap p-value is not invariant to pweight global rescaling",
+        )
+        np.testing.assert_allclose(
+            result_scaled.conf_int, result_base.conf_int, rtol=1e-13, atol=0,
+            err_msg="bootstrap CI is not invariant to pweight global rescaling",
+        )
+
     def test_bootstrap_single_psu_returns_nan(self):
         """Unstratified single-PSU survey design returns NaN SE (PR #352).