Address PR #392 R4 review (1 P0 + 1 P3)

igerber · claude · igerber · commit b7b7eb3f249f · 2026-04-26T10:10:26.000-04:00
P0 (observed-period detrending delta):
The R3 P1 fix made the slope LOOKUP use observed periods, but the
detrending DELTA (`t_rank - base_rank`) still pulled ranks from the
full categorical dtype via _build_period_rank. On panels with
unused intermediate categorical levels, the same observed data
produced different (t - base) multipliers and corrupted the
joint test statistic — silent wrong statistical output.

Fix: under trends_lin=True, build a fresh observed_rank dict from
set(data_filtered[time_col].unique()), sorted by the existing
period_rank order, and use it for both the base and horizon ranks
in the delta computation. Mirrors
HAD.fit's `_aggregate_multi_period_first_differences` convention
(`sorted(t_pre_list + t_post_list, ...)` for the event-time rank).
Both joint wrappers fixed; workflow inherits the fix automatically.

Regression tests (2):
  - joint_pretrends_test on (categorical with 2 unused levels)
    produces identical cvm_stat_joint and p_value to (categorical
    without unused levels) on the same observed data
  - joint_homogeneity_test twin invariant (unused level between
    base and post)

P3 (exact upstream-version pin):
The parity contract cited DIDHAD v2.0.0 / SHA edc09197 in
CHANGELOG, REGISTRY, and the parity test docstrings, but the
generator and test only enforced `>= 2.0.0`. Future regeneration
could silently re-anchor goldens to a newer release while docs
still cited the old version.

Fix: pin exactly DIDHAD == 2.0.0 and YatchewTest == 1.1.1 in both
the generator's stopifnot guards and the parity test's metadata
assertion. Document the bump procedure in the comments.

Stats: 540 tests pass (538 prior + 2 new R4 P0 regressions), 0
regressions. All 24 R-parity cells still green at atol=1e-8 /
1e-10. Note: existing categorical-level invariance tests added in
R3 still pass — they exercised correctness on simple unused-level
shifts; the R4 invariants are stricter, asserting exact equality
of the joint statistics (cvm_stat_joint and p_value) between the
same observed panel encoded with and without unused intermediate
categorical levels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/benchmarks/R/generate_did_had_golden.R b/benchmarks/R/generate_did_had_golden.R
@@ -25,8 +25,13 @@ library(jsonlite)
 library(DIDHAD)
 library(YatchewTest)
 
-stopifnot(packageVersion("DIDHAD") >= "2.0.0")
-stopifnot(packageVersion("YatchewTest") >= "1.1.0")
+# PR #392 R4 P3: pin exact upstream versions so future regeneration
+# does not silently re-anchor the goldens to a newer CRAN release
+# while CHANGELOG / REGISTRY / parity test still cite v2.0.0 / SHA
+# `edc09197`. Bump these pins (here AND in the parity test's
+# `test_metadata_versions_match`) when intentionally re-anchoring.
+stopifnot(packageVersion("DIDHAD") == "2.0.0")
+stopifnot(packageVersion("YatchewTest") == "1.1.1")
 
 # -------------------------------------------------------------------------
 # Panel builder: 5-period panel with F=4 (treatment onset at t=4).
diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py
@@ -3693,11 +3693,28 @@ def joint_pretrends_test(
         slope = wide_y[base_period].to_numpy(dtype=np.float64) - wide_y[
             base_minus_1_period
         ].to_numpy(dtype=np.float64)
+        # PR #392 R4 P0: build the detrending rank from OBSERVED
+        # periods (on data_filtered), not from the full categorical
+        # dtype. Otherwise unused intermediate categorical levels
+        # silently change the (t - base) multiplier and corrupt the
+        # joint statistic. Mirrors HAD.fit's
+        # `_aggregate_multi_period_first_differences` convention which
+        # uses `sorted(t_pre_list + t_post_list, ...)` for the
+        # event-time rank.
+        observed_rank = {
+            p: i
+            for i, p in enumerate(
+                sorted(
+                    set(data_filtered[time_col].unique()),
+                    key=lambda p: period_rank[p],
+                )
+            )
+        }
+        base_rank_observed = observed_rank[base_period]
         # Apply detrending in place to remaining dy_by_horizon entries.
         for t in pre_periods_effective:
             label = str(t)
-            t_rank = period_rank[t]
-            delta = t_rank - base_rank  # < 0 for pre-periods
+            delta = observed_rank[t] - base_rank_observed  # < 0 for pre-periods
             dy_by_horizon[label] = dy_by_horizon[label] - delta * slope
 
     # Phase 4.5 C: aggregate per-row weights/survey to per-unit (G,)
@@ -4071,10 +4088,22 @@ def joint_homogeneity_test(
         slope_h = wide_y_h[base_period].to_numpy(dtype=np.float64) - wide_y_h[
             base_minus_1_period_h
         ].to_numpy(dtype=np.float64)
+        # PR #392 R4 P0: build the detrending rank from OBSERVED
+        # periods on data_filtered (matching HAD.fit). Twin of
+        # joint_pretrends_test fix.
+        observed_rank_h = {
+            p: i
+            for i, p in enumerate(
+                sorted(
+                    set(data_filtered[time_col].unique()),
+                    key=lambda p: period_rank[p],
+                )
+            )
+        }
+        base_rank_observed_h = observed_rank_h[base_period]
         for t in post_periods:
             label = str(t)
-            t_rank = period_rank[t]
-            delta = t_rank - base_rank  # > 0 for post-periods
+            delta = observed_rank_h[t] - base_rank_observed_h  # > 0 for post-periods
             dy_by_horizon[label] = dy_by_horizon[label] - delta * slope_h
 
     # Phase 4.5 C: aggregate weights/survey to per-unit; thread through.
diff --git a/tests/test_did_had_parity.py b/tests/test_did_had_parity.py
@@ -428,11 +428,23 @@ class TestFixtureMetadata:
     """Sanity checks on the fixture itself."""
 
     def test_metadata_versions_match(self, fixture):
-        """Ensure the JSON metadata lists the expected DIDHAD version pin."""
+        """Ensure the JSON metadata lists the EXACT pinned upstream
+        versions. PR #392 R4 P3: exact pin (not >=) so future
+        regeneration does not silently re-anchor the goldens to a
+        newer CRAN release while changelog / registry still cite the
+        old version. Bump these pins (here AND in
+        ``benchmarks/R/generate_did_had_golden.R``) when intentionally
+        re-anchoring."""
         meta = fixture["metadata"]
-        assert meta["didhad_version"] >= "2.0.0", (
+        assert meta["didhad_version"] == "2.0.0", (
             f"Fixture was generated against DIDHAD={meta['didhad_version']!r}; "
-            f"the parity test expects >= 2.0.0. Regenerate the fixture."
+            f"the parity test pins exactly 2.0.0. Regenerate after bumping "
+            f"the pin in both the generator and this test."
+        )
+        assert meta["yatchewtest_version"] == "1.1.1", (
+            f"Fixture was generated against YatchewTest="
+            f"{meta['yatchewtest_version']!r}; the parity test pins exactly "
+            f"1.1.1. Regenerate after bumping the pin."
         )
 
     def test_metadata_n_dgps(self, fixture):
diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py
@@ -4812,6 +4812,108 @@ def test_pretrends_trends_lin_unused_categorical_observed_only(self):
         assert np.isfinite(r.cvm_stat_joint)
         assert np.isfinite(r.p_value)
 
+    def test_pretrends_trends_lin_unused_categorical_invariant(self):
+        """Same observed panel with vs without unused intermediate
+        categorical levels must produce IDENTICAL joint statistics on
+        the pretrends path. Reviewer-requested invariant for PR #392
+        R4 P0 — detrending delta must use observed-period rank, not
+        full-categorical rank."""
+        df_int = self._panel(rng_seed=50)
+        time_map = {1: "t1", 2: "t2", 3: "t3", 4: "t4", 5: "t5"}
+        df_a = df_int.copy()
+        df_a["time"] = pd.Categorical(
+            df_a["time"].map(time_map),
+            categories=["t1", "t2", "t3", "t4", "t5"],
+            ordered=True,
+        )
+        df_b = df_int.copy()
+        df_b["time"] = pd.Categorical(
+            df_b["time"].map(time_map),
+            categories=["t1", "t_unused1", "t2", "t3", "t_unused2", "t4", "t5"],
+            ordered=True,
+        )
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r_a = joint_pretrends_test(
+                df_a,
+                "y",
+                "d",
+                "time",
+                "unit",
+                pre_periods=["t1"],
+                base_period="t3",
+                n_bootstrap=99,
+                seed=42,
+                trends_lin=True,
+            )
+            r_b = joint_pretrends_test(
+                df_b,
+                "y",
+                "d",
+                "time",
+                "unit",
+                pre_periods=["t1"],
+                base_period="t3",
+                n_bootstrap=99,
+                seed=42,
+                trends_lin=True,
+            )
+        assert r_a.cvm_stat_joint == r_b.cvm_stat_joint, (
+            f"unused-categorical invariance broken on pretrends: "
+            f"a={r_a.cvm_stat_joint}, b={r_b.cvm_stat_joint}"
+        )
+        assert r_a.p_value == r_b.p_value
+
+    def test_homogeneity_trends_lin_unused_categorical_invariant(self):
+        """Twin invariant for joint_homogeneity_test."""
+        df_int = self._panel(rng_seed=51)
+        time_map = {1: "t1", 2: "t2", 3: "t3", 4: "t4", 5: "t5"}
+        df_a = df_int.copy()
+        df_a["time"] = pd.Categorical(
+            df_a["time"].map(time_map),
+            categories=["t1", "t2", "t3", "t4", "t5"],
+            ordered=True,
+        )
+        df_b = df_int.copy()
+        df_b["time"] = pd.Categorical(
+            df_b["time"].map(time_map),
+            # Insert unused level between base (t3) and post (t4) — would
+            # change the post-period delta under the buggy full-cat rank.
+            categories=["t1", "t2", "t3", "t_unused", "t4", "t5"],
+            ordered=True,
+        )
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r_a = joint_homogeneity_test(
+                df_a,
+                "y",
+                "d",
+                "time",
+                "unit",
+                post_periods=["t4", "t5"],
+                base_period="t3",
+                n_bootstrap=99,
+                seed=42,
+                trends_lin=True,
+            )
+            r_b = joint_homogeneity_test(
+                df_b,
+                "y",
+                "d",
+                "time",
+                "unit",
+                post_periods=["t4", "t5"],
+                base_period="t3",
+                n_bootstrap=99,
+                seed=42,
+                trends_lin=True,
+            )
+        assert r_a.cvm_stat_joint == r_b.cvm_stat_joint, (
+            f"unused-categorical invariance broken on homogeneity: "
+            f"a={r_a.cvm_stat_joint}, b={r_b.cvm_stat_joint}"
+        )
+        assert r_a.p_value == r_b.p_value
+
     def test_workflow_trends_lin_with_overall_aggregate_raises(self):
         """trends_lin=True only valid on event_study aggregate."""
         df = self._panel(rng_seed=34)