Skip to content

Commit b7b7eb3

Browse files
igerberclaude
andcommitted
Address PR #392 R4 review (1 P0 + 1 P3)
P0 (observed-period detrending delta): The R3 P1 fix made the slope LOOKUP use observed periods, but the detrending DELTA (`t_rank - base_rank`) still pulled ranks from the full categorical dtype via _build_period_rank. On panels with unused intermediate categorical levels, the same observed data produced different (t - base) multipliers and corrupted the joint test statistic — silent wrong statistical output. Fix: under trends_lin=True, build a fresh observed_rank dict from sorted(set(data_filtered[time_col].unique())) and use it for both the base and horizon ranks in the delta computation. Mirrors HAD.fit's `_aggregate_multi_period_first_differences` convention (`sorted(t_pre_list + t_post_list, ...)` for the event-time rank). Both joint wrappers fixed; workflow inherits the fix automatically. Regression tests (2): - joint_pretrends_test on (categorical with 2 unused levels) produces identical cvm_stat_joint and p_value to (categorical without unused levels) on the same observed data - joint_homogeneity_test twin invariant (unused level between base and post) P3 (exact upstream-version pin): The parity contract cited DIDHAD v2.0.0 / SHA edc09197 in CHANGELOG, REGISTRY, and the parity test docstrings, but the generator and test only enforced `>= 2.0.0`. Future regeneration could silently re-anchor goldens to a newer release while docs still cited the old version. Fix: pin exactly DIDHAD == 2.0.0 and YatchewTest == 1.1.1 in both the generator's stopifnot guards and the parity test's metadata assertion. Document the bump procedure in the comments. Stats: 540 tests pass (538 prior + 2 new R4 P0 regressions), 0 regressions. All 24 R-parity cells still green at atol=1e-8 / 1e-10. Note: existing categorical-level invariance tests added in R3 still pass — they exercised correctness on simple unused-level shifts; the R4 invariants are stricter, asserting bit-exact identity of the joint statistics across categorical re-ordering of the same observed panel. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent acbf699 commit b7b7eb3

4 files changed

Lines changed: 157 additions & 9 deletions

File tree

benchmarks/R/generate_did_had_golden.R

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,13 @@ library(jsonlite)
2525
library(DIDHAD)
2626
library(YatchewTest)
2727

28-
stopifnot(packageVersion("DIDHAD") >= "2.0.0")
29-
stopifnot(packageVersion("YatchewTest") >= "1.1.0")
28+
# PR #392 R4 P3: pin exact upstream versions so future regeneration
29+
# does not silently re-anchor the goldens to a newer CRAN release
30+
# while CHANGELOG / REGISTRY / parity test still cite v2.0.0 / SHA
31+
# `edc09197`. Bump these pins (here AND in the parity test's
32+
# `test_metadata_versions_match`) when intentionally re-anchoring.
33+
stopifnot(packageVersion("DIDHAD") == "2.0.0")
34+
stopifnot(packageVersion("YatchewTest") == "1.1.1")
3035

3136
# -------------------------------------------------------------------------
3237
# Panel builder: 5-period panel with F=4 (treatment onset at t=4).

diff_diff/had_pretests.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3693,11 +3693,28 @@ def joint_pretrends_test(
36933693
slope = wide_y[base_period].to_numpy(dtype=np.float64) - wide_y[
36943694
base_minus_1_period
36953695
].to_numpy(dtype=np.float64)
3696+
# PR #392 R4 P0: build the detrending rank from OBSERVED
3697+
# periods (on data_filtered), not from the full categorical
3698+
# dtype. Otherwise unused intermediate categorical levels
3699+
# silently change the (t - base) multiplier and corrupt the
3700+
# joint statistic. Mirrors HAD.fit's
3701+
# `_aggregate_multi_period_first_differences` convention which
3702+
# uses `sorted(t_pre_list + t_post_list, ...)` for the
3703+
# event-time rank.
3704+
observed_rank = {
3705+
p: i
3706+
for i, p in enumerate(
3707+
sorted(
3708+
set(data_filtered[time_col].unique()),
3709+
key=lambda p: period_rank[p],
3710+
)
3711+
)
3712+
}
3713+
base_rank_observed = observed_rank[base_period]
36963714
# Apply detrending in place to remaining dy_by_horizon entries.
36973715
for t in pre_periods_effective:
36983716
label = str(t)
3699-
t_rank = period_rank[t]
3700-
delta = t_rank - base_rank # < 0 for pre-periods
3717+
delta = observed_rank[t] - base_rank_observed # < 0 for pre-periods
37013718
dy_by_horizon[label] = dy_by_horizon[label] - delta * slope
37023719

37033720
# Phase 4.5 C: aggregate per-row weights/survey to per-unit (G,)
@@ -4071,10 +4088,22 @@ def joint_homogeneity_test(
40714088
slope_h = wide_y_h[base_period].to_numpy(dtype=np.float64) - wide_y_h[
40724089
base_minus_1_period_h
40734090
].to_numpy(dtype=np.float64)
4091+
# PR #392 R4 P0: build the detrending rank from OBSERVED
4092+
# periods on data_filtered (matching HAD.fit). Twin of
4093+
# joint_pretrends_test fix.
4094+
observed_rank_h = {
4095+
p: i
4096+
for i, p in enumerate(
4097+
sorted(
4098+
set(data_filtered[time_col].unique()),
4099+
key=lambda p: period_rank[p],
4100+
)
4101+
)
4102+
}
4103+
base_rank_observed_h = observed_rank_h[base_period]
40744104
for t in post_periods:
40754105
label = str(t)
4076-
t_rank = period_rank[t]
4077-
delta = t_rank - base_rank # > 0 for post-periods
4106+
delta = observed_rank_h[t] - base_rank_observed_h # > 0 for post-periods
40784107
dy_by_horizon[label] = dy_by_horizon[label] - delta * slope_h
40794108

40804109
# Phase 4.5 C: aggregate weights/survey to per-unit; thread through.

tests/test_did_had_parity.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,11 +428,23 @@ class TestFixtureMetadata:
428428
"""Sanity checks on the fixture itself."""
429429

430430
def test_metadata_versions_match(self, fixture):
431-
"""Ensure the JSON metadata lists the expected DIDHAD version pin."""
431+
"""Ensure the JSON metadata lists the EXACT pinned upstream
432+
versions. PR #392 R4 P3: exact pin (not >=) so future
433+
regeneration does not silently re-anchor the goldens to a
434+
newer CRAN release while changelog / registry still cite the
435+
old version. Bump these pins (here AND in
436+
``benchmarks/R/generate_did_had_golden.R``) when intentionally
437+
re-anchoring."""
432438
meta = fixture["metadata"]
433-
assert meta["didhad_version"] >= "2.0.0", (
439+
assert meta["didhad_version"] == "2.0.0", (
434440
f"Fixture was generated against DIDHAD={meta['didhad_version']!r}; "
435-
f"the parity test expects >= 2.0.0. Regenerate the fixture."
441+
f"the parity test pins exactly 2.0.0. Regenerate after bumping "
442+
f"the pin in both the generator and this test."
443+
)
444+
assert meta["yatchewtest_version"] == "1.1.1", (
445+
f"Fixture was generated against YatchewTest="
446+
f"{meta['yatchewtest_version']!r}; the parity test pins exactly "
447+
f"1.1.1. Regenerate after bumping the pin."
436448
)
437449

438450
def test_metadata_n_dgps(self, fixture):

tests/test_had_pretests.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4812,6 +4812,108 @@ def test_pretrends_trends_lin_unused_categorical_observed_only(self):
48124812
assert np.isfinite(r.cvm_stat_joint)
48134813
assert np.isfinite(r.p_value)
48144814

4815+
def test_pretrends_trends_lin_unused_categorical_invariant(self):
4816+
"""Same observed panel with vs without unused intermediate
4817+
categorical levels must produce IDENTICAL joint statistics on
4818+
the pretrends path. Reviewer-requested invariant for PR #392
4819+
R4 P0 — detrending delta must use observed-period rank, not
4820+
full-categorical rank."""
4821+
df_int = self._panel(rng_seed=50)
4822+
time_map = {1: "t1", 2: "t2", 3: "t3", 4: "t4", 5: "t5"}
4823+
df_a = df_int.copy()
4824+
df_a["time"] = pd.Categorical(
4825+
df_a["time"].map(time_map),
4826+
categories=["t1", "t2", "t3", "t4", "t5"],
4827+
ordered=True,
4828+
)
4829+
df_b = df_int.copy()
4830+
df_b["time"] = pd.Categorical(
4831+
df_b["time"].map(time_map),
4832+
categories=["t1", "t_unused1", "t2", "t3", "t_unused2", "t4", "t5"],
4833+
ordered=True,
4834+
)
4835+
with warnings.catch_warnings():
4836+
warnings.simplefilter("ignore", UserWarning)
4837+
r_a = joint_pretrends_test(
4838+
df_a,
4839+
"y",
4840+
"d",
4841+
"time",
4842+
"unit",
4843+
pre_periods=["t1"],
4844+
base_period="t3",
4845+
n_bootstrap=99,
4846+
seed=42,
4847+
trends_lin=True,
4848+
)
4849+
r_b = joint_pretrends_test(
4850+
df_b,
4851+
"y",
4852+
"d",
4853+
"time",
4854+
"unit",
4855+
pre_periods=["t1"],
4856+
base_period="t3",
4857+
n_bootstrap=99,
4858+
seed=42,
4859+
trends_lin=True,
4860+
)
4861+
assert r_a.cvm_stat_joint == r_b.cvm_stat_joint, (
4862+
f"unused-categorical invariance broken on pretrends: "
4863+
f"a={r_a.cvm_stat_joint}, b={r_b.cvm_stat_joint}"
4864+
)
4865+
assert r_a.p_value == r_b.p_value
4866+
4867+
def test_homogeneity_trends_lin_unused_categorical_invariant(self):
4868+
"""Twin invariant for joint_homogeneity_test."""
4869+
df_int = self._panel(rng_seed=51)
4870+
time_map = {1: "t1", 2: "t2", 3: "t3", 4: "t4", 5: "t5"}
4871+
df_a = df_int.copy()
4872+
df_a["time"] = pd.Categorical(
4873+
df_a["time"].map(time_map),
4874+
categories=["t1", "t2", "t3", "t4", "t5"],
4875+
ordered=True,
4876+
)
4877+
df_b = df_int.copy()
4878+
df_b["time"] = pd.Categorical(
4879+
df_b["time"].map(time_map),
4880+
# Insert unused level between base (t3) and post (t4) — would
4881+
# change the post-period delta under the buggy full-cat rank.
4882+
categories=["t1", "t2", "t3", "t_unused", "t4", "t5"],
4883+
ordered=True,
4884+
)
4885+
with warnings.catch_warnings():
4886+
warnings.simplefilter("ignore", UserWarning)
4887+
r_a = joint_homogeneity_test(
4888+
df_a,
4889+
"y",
4890+
"d",
4891+
"time",
4892+
"unit",
4893+
post_periods=["t4", "t5"],
4894+
base_period="t3",
4895+
n_bootstrap=99,
4896+
seed=42,
4897+
trends_lin=True,
4898+
)
4899+
r_b = joint_homogeneity_test(
4900+
df_b,
4901+
"y",
4902+
"d",
4903+
"time",
4904+
"unit",
4905+
post_periods=["t4", "t5"],
4906+
base_period="t3",
4907+
n_bootstrap=99,
4908+
seed=42,
4909+
trends_lin=True,
4910+
)
4911+
assert r_a.cvm_stat_joint == r_b.cvm_stat_joint, (
4912+
f"unused-categorical invariance broken on homogeneity: "
4913+
f"a={r_a.cvm_stat_joint}, b={r_b.cvm_stat_joint}"
4914+
)
4915+
assert r_a.p_value == r_b.p_value
4916+
48154917
def test_workflow_trends_lin_with_overall_aggregate_raises(self):
48164918
"""trends_lin=True only valid on event_study aggregate."""
48174919
df = self._panel(rng_seed=34)

0 commit comments

Comments
 (0)