Address PR #370 R4 review (1 P0 + 1 P1)

igerber · claude · igerber · commit f8e959cb5cd2 · 2026-04-25T10:00:53.000-04:00
R4 P0 (Methodology) -- Yatchew test statistic was not invariant to
uniform pweight rescaling. The formula `T_hr = sqrt(sum(w)) * (...)`
makes T_hr scale as sqrt(c) under weights -> w * c, so weights=w and
weights=100*w produced different p-values for the same design. Worse,
SurveyDesign.resolve() normalizes pweights to mean=1 internally, so
the survey= entry path and the weights= shortcut disagreed numerically.

Fix: normalize per-unit pweights to mean=1 at every helper entry
(stute_test, yatchew_hr_test, stute_joint_pretest) and at the workflow
resolution helper. Matches SurveyDesign.resolve() convention; makes
the Yatchew statistic scale-invariant; ensures weights=w and
survey=SurveyDesign(weights="w") produce identical results for the
same design. Stute is internally scale-invariant in functional form
but normalization is required so the bootstrap helper sees the same
weight vector under both entry paths (cross-path numerical agreement).

R4 P1 (Code Quality) -- column-vector weights (e.g. `df[["w"]].to_numpy()`
producing (G, 1)) silently broadcast through weighted moments / CvM
sums instead of raising. Fix: validate via `_validate_1d_numeric` on
all `weights=` arrays in stute_test, yatchew_hr_test, stute_joint_pretest;
add explicit ndim check in `_resolve_pretest_unit_weights` with a
hint about the common df[["w"]].to_numpy() mistake.

6 new regression tests in TestPhase45CR1Regressions:
- test_yatchew_weights_scale_invariant (weights=w vs weights=100*w)
- test_stute_weights_scale_invariant (mirror for Stute)
- test_workflow_weights_eq_survey_at_overall_path (weights= shortcut
  and survey=SurveyDesign(...) produce identical Yatchew + Stute
  results, atol=1e-10)
- test_stute_test_rejects_2d_weights / test_yatchew_hr_test_rejects_2d_weights
  / test_workflow_rejects_2d_weights (column-vector rejection at all
  three direct-helper / workflow entry points)

177 pretest tests pass (was 171 after R3).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py
@@ -1548,19 +1548,15 @@ def stute_test(
     # Phase 4.5 C: resolve effective per-unit weights (None on the
     # unweighted path, preserves bit-exact regression). When survey= is
     # supplied, w is taken from the resolved design.
+    # R4 P1: validate 1D explicitly so column-vector inputs (e.g.
+    # df[["w"]].to_numpy()) raise instead of silently broadcasting.
     if survey is not None:
-        w_arr = np.asarray(survey.weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(np.asarray(survey.weights), "stute_test: survey.weights")
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"stute_test: survey.weights length {w_arr.shape[0]} does not "
                 f"match d/dy length {G}."
             )
-        # R1 P0: strictly-positive weights at the per-unit level (mirrors
-        # workflow guard in _resolve_pretest_unit_weights). Zero-weight
-        # units would leak into the dose-variation check + CvM cusum +
-        # bootstrap refit, producing silent wrong pretest decisions on
-        # subpopulation-restricted designs (e.g. only zero-weight units
-        # carry dose variation -> spurious finite test statistic).
         if (w_arr <= 0).any():
             raise ValueError(
                 "stute_test: survey weights must be strictly positive. "
@@ -1570,7 +1566,7 @@ def stute_test(
                 "weight subpopulation before calling stute_test."
             )
     elif weights is not None:
-        w_arr = np.asarray(weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(np.asarray(weights), "stute_test: weights")
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"stute_test: weights length {w_arr.shape[0]} does not match " f"d/dy length {G}."
@@ -1584,6 +1580,17 @@ def stute_test(
     else:
         w_arr = None
 
+    # R4 P0: normalize pweights to mean=1 (matches SurveyDesign.resolve()
+    # convention). Makes the test statistic scale-invariant under uniform
+    # rescaling of weights AND ensures weights= shortcut and
+    # survey=SurveyDesign(weights=...) produce identical results for the
+    # same design. Stute is internally scale-invariant in functional form,
+    # but the survey-aware bootstrap helper consumes weight values
+    # directly under non-trivial PSU/strata, so normalization is required
+    # for cross-path agreement.
+    if w_arr is not None:
+        w_arr = w_arr * (float(w_arr.shape[0]) / float(np.sum(w_arr)))
+
     if w_arr is None:
         a_hat, b_hat, eps = _fit_ols_intercept_slope(d_arr, dy_arr)
     else:
@@ -1895,8 +1902,9 @@ def yatchew_hr_test(
     # Phase 4.5 C: resolve effective per-unit weights. Strictly positive
     # required (the adjacent-difference formula divides by sum(w_avg) which
     # collapses to zero in any contiguous-zero block).
+    # R4 P1: validate 1D explicitly so column-vector inputs raise.
     if survey is not None:
-        w_arr = np.asarray(survey.weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(np.asarray(survey.weights), "yatchew_hr_test: survey.weights")
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"yatchew_hr_test: survey.weights length {w_arr.shape[0]} "
@@ -1909,7 +1917,7 @@ def yatchew_hr_test(
                 "zero-weight blocks)."
             )
     elif weights is not None:
-        w_arr = np.asarray(weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(np.asarray(weights), "yatchew_hr_test: weights")
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"yatchew_hr_test: weights length {w_arr.shape[0]} does not "
@@ -1924,6 +1932,17 @@ def yatchew_hr_test(
     else:
         w_arr = None
 
+    # R4 P0: normalize pweights to mean=1 (matches SurveyDesign.resolve()
+    # convention). Yatchew uses sqrt(sum(w)) as the effective sample size,
+    # which without normalization would scale as sqrt(c) under uniform
+    # rescaling weights -> w * c, producing different p-values for
+    # weights=w vs weights=100*w. Normalization makes the statistic
+    # scale-invariant AND ensures weights= and survey=SurveyDesign(...)
+    # produce identical results (the latter resolve()s to mean=1
+    # internally, the former previously did not).
+    if w_arr is not None:
+        w_arr = w_arr * (float(w_arr.shape[0]) / float(np.sum(w_arr)))
+
     if G < _MIN_G_YATCHEW:
         warnings.warn(
             f"yatchew_hr_test: G = {G} is below the minimum {_MIN_G_YATCHEW} "
@@ -2682,8 +2701,11 @@ def stute_joint_pretest(
 
     # Phase 4.5 C: resolve effective per-unit weights (None → bit-exact
     # unweighted path).
+    # R4 P1: validate 1D explicitly so column-vector inputs raise.
     if survey is not None:
-        w_arr = np.asarray(survey.weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(
+            np.asarray(survey.weights), "stute_joint_pretest: survey.weights"
+        )
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"stute_joint_pretest: survey.weights length {w_arr.shape[0]} "
@@ -2698,7 +2720,7 @@ def stute_joint_pretest(
                 "population mass."
             )
     elif weights is not None:
-        w_arr = np.asarray(weights, dtype=np.float64)
+        w_arr = _validate_1d_numeric(np.asarray(weights), "stute_joint_pretest: weights")
         if w_arr.shape[0] != G:
             raise ValueError(
                 f"stute_joint_pretest: weights length {w_arr.shape[0]} does "
@@ -2712,6 +2734,11 @@ def stute_joint_pretest(
     else:
         w_arr = None
 
+    # R4 P0: normalize pweights to mean=1 (matches SurveyDesign.resolve()
+    # convention; same fix as stute_test / yatchew_hr_test).
+    if w_arr is not None:
+        w_arr = w_arr * (float(w_arr.shape[0]) / float(np.sum(w_arr)))
+
     idx = np.argsort(doses_arr, kind="stable")
     d_sorted = doses_arr[idx]
 
@@ -2915,6 +2942,16 @@ def _resolve_pretest_unit_weights(
         )
     if weights is not None:
         weights_arr = np.asarray(weights, dtype=np.float64)
+        # R4 P1: validate 1D explicitly (column-vector inputs would otherwise
+        # broadcast through downstream computations and silently corrupt
+        # results).
+        if weights_arr.ndim != 1:
+            raise ValueError(
+                f"{caller_name}: weights must be 1-dimensional, got shape "
+                f"{weights_arr.shape}. (A common mistake is passing "
+                "df[['w']].to_numpy() which produces (N, 1); use "
+                "df['w'].to_numpy() for (N,).)"
+            )
         weights_unit = _aggregate_unit_weights(data, weights_arr, unit_col)
         # R1 P0: strictly-positive weights required on the pweight shortcut
         # (matches stute_test/yatchew_hr_test direct entry behavior; the CvM
@@ -2927,6 +2964,11 @@ def _resolve_pretest_unit_weights(
                 "mass; use survey= with explicit lonely-PSU handling for "
                 "principled subpopulation analysis."
             )
+        # R4 P0: normalize per-unit weights to mean=1 (matches
+        # SurveyDesign.resolve() convention so weights= and survey= entry
+        # paths produce identical statistic values; ensures Yatchew is
+        # scale-invariant under uniform rescaling).
+        weights_unit = weights_unit * (float(weights_unit.shape[0]) / float(np.sum(weights_unit)))
         return weights_unit, None
     # survey is not None
     if not hasattr(survey, "resolve"):
diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py
@@ -3724,3 +3724,103 @@ def test_workflow_single_psu_propagates_nan_through_stute(self):
         assert report.yatchew is not None and np.isfinite(report.yatchew.p_value)
         # Verdict carries the linearity-conditional suffix.
         assert "linearity-conditional verdict" in report.verdict
+
+    # --- R4 P0: weight-scale invariance + cross-path agreement ------------
+
+    def test_yatchew_weights_scale_invariant(self):
+        """R4 P0: Yatchew test statistic must be invariant under uniform
+        rescaling of weights. Pre-fix `T_hr = sqrt(sum(w)) * (...)` made
+        the stat scale as sqrt(c), so weights=w and weights=100*w gave
+        different p-values. Fix: helper normalizes pweights to mean=1
+        before any computation."""
+        d, dy = _linear_dgp(G=30, beta=2.0, sigma=0.3)
+        w = np.random.default_rng(7).uniform(0.5, 2.0, size=30)
+        r1 = yatchew_hr_test(d, dy, weights=w)
+        r2 = yatchew_hr_test(d, dy, weights=100.0 * w)
+        np.testing.assert_allclose(r1.t_stat_hr, r2.t_stat_hr, atol=1e-12, rtol=1e-12)
+        np.testing.assert_allclose(r1.p_value, r2.p_value, atol=1e-12, rtol=1e-12)
+
+    def test_stute_weights_scale_invariant(self):
+        """R4 P0 mirror: Stute is internally scale-invariant in functional
+        form, but normalization is required so weights= and survey=
+        entry paths agree numerically."""
+        d, dy = _linear_dgp(G=30, beta=2.0, sigma=0.3)
+        w = np.random.default_rng(7).uniform(0.5, 2.0, size=30)
+        r1 = stute_test(d, dy, weights=w, n_bootstrap=199, seed=0)
+        r2 = stute_test(d, dy, weights=100.0 * w, n_bootstrap=199, seed=0)
+        np.testing.assert_allclose(r1.cvm_stat, r2.cvm_stat, atol=1e-12, rtol=1e-12)
+        np.testing.assert_allclose(r1.p_value, r2.p_value, atol=1e-12, rtol=1e-12)
+
+    def test_workflow_weights_eq_survey_at_overall_path(self):
+        """R4 P0: workflow's weights= shortcut and survey=SurveyDesign(
+        weights="w") must produce identical Yatchew/Stute results for
+        the same design. SurveyDesign.resolve() normalizes pweights to
+        mean=1; the helper now applies the same normalization on the
+        weights= path so both paths agree numerically."""
+        from diff_diff import SurveyDesign
+
+        df = self._make_overall_panel(with_w_col=True)
+        # Build a per-row weights array matching df["w"] for the shortcut.
+        weights_per_row = df["w"].to_numpy()
+        with pytest.warns(UserWarning):
+            r_weights = did_had_pretest_workflow(
+                df,
+                "y",
+                "d",
+                "time",
+                "unit",
+                weights=weights_per_row,
+                n_bootstrap=199,
+                seed=0,
+            )
+        with pytest.warns(UserWarning):
+            r_survey = did_had_pretest_workflow(
+                df,
+                "y",
+                "d",
+                "time",
+                "unit",
+                survey=SurveyDesign(weights="w"),
+                n_bootstrap=199,
+                seed=0,
+            )
+        # Yatchew: closed-form, must match exactly under mean=1 normalization.
+        assert r_weights.yatchew is not None and r_survey.yatchew is not None
+        np.testing.assert_allclose(
+            r_weights.yatchew.t_stat_hr,
+            r_survey.yatchew.t_stat_hr,
+            atol=1e-10,
+            rtol=1e-10,
+        )
+        # Stute: bootstrap is seeded; same multiplier matrix shape under
+        # both paths means same RNG draws -> identical p-values.
+        assert r_weights.stute is not None and r_survey.stute is not None
+        np.testing.assert_allclose(
+            r_weights.stute.cvm_stat, r_survey.stute.cvm_stat, atol=1e-10, rtol=1e-10
+        )
+        np.testing.assert_allclose(
+            r_weights.stute.p_value, r_survey.stute.p_value, atol=1e-10, rtol=1e-10
+        )
+
+    # --- R4 P1: 1D weights validation ------------------------------------
+
+    def test_stute_test_rejects_2d_weights(self):
+        """R4 P1: column-vector weights must raise, not silently broadcast."""
+        d, dy = _linear_dgp(G=30)
+        w_2d = np.ones((30, 1))  # common df[["w"]].to_numpy() pattern
+        with pytest.raises(ValueError, match="1-dimensional"):
+            stute_test(d, dy, weights=w_2d, n_bootstrap=199, seed=0)
+
+    def test_yatchew_hr_test_rejects_2d_weights(self):
+        d, dy = _linear_dgp(G=30)
+        w_2d = np.ones((30, 1))
+        with pytest.raises(ValueError, match="1-dimensional"):
+            yatchew_hr_test(d, dy, weights=w_2d)
+
+    def test_workflow_rejects_2d_weights(self):
+        df = self._make_overall_panel()
+        w_2d = np.ones((40, 1))
+        with pytest.raises(ValueError, match="1-dimensional"):
+            did_had_pretest_workflow(
+                df, "y", "d", "time", "unit", weights=w_2d, n_bootstrap=199, seed=0
+            )