Skip to content

Commit 18219c5

Browse files
igerberclaude
andcommitted
Move stratified_survey DGP test to benchmarks/ (CI isolated-install fix)
The R13 P1 regression test ``test_stratified_survey_dgp_post_periods_cover_full_post_tail`` imports ``benchmarks.python.coverage_sdid`` directly to exercise the private ``_stratified_survey_dgp`` helper. CI's isolated-install job deliberately copies only ``tests/``, not ``benchmarks/``, so the module import failed with ``ModuleNotFoundError`` on CI runners that install the package into a fresh site-packages and then run the test suite against that install. The target is a benchmarking harness helper, not shipped package code, so the natural home is ``benchmarks/python/``. Moving it there keeps the test runnable locally (developer invokes explicitly before regenerating the coverage MC artifact) and out of CI's collection (``pyproject.toml testpaths = ["tests"]`` scopes default discovery to ``tests/`` only, so the new file never interferes). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 08056e4 commit 18219c5

2 files changed

Lines changed: 75 additions & 43 deletions

File tree

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Harness-level sanity tests for ``coverage_sdid.py``.
2+
3+
Lives under ``benchmarks/`` rather than ``tests/`` because the targets
4+
here (the private DGP helpers inside ``coverage_sdid.py``) are part of
5+
the MC harness, not the shipped ``diff_diff`` package. CI's isolated-
6+
install job deliberately copies only ``tests/``; running the pytest
7+
collection here would just fail with ``ModuleNotFoundError`` on the
8+
``benchmarks`` import.
9+
10+
Invoke manually when about to regenerate the coverage MC artifact::
11+
12+
pytest benchmarks/python/test_coverage_sdid.py -v
13+
14+
Or from the repo root::
15+
16+
python -m pytest benchmarks/python/test_coverage_sdid.py -v
17+
"""
18+
19+
from __future__ import annotations
20+
21+
from pathlib import Path
22+
import sys
23+
24+
# Make the top-level ``benchmarks.python.coverage_sdid`` import resolvable
25+
# when pytest is invoked from the repo root. The harness itself does a
26+
# similar sys.path insert at module load; this mirror is only needed for
27+
# test collection from this file.
28+
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
29+
if str(_REPO_ROOT) not in sys.path:
30+
sys.path.insert(0, str(_REPO_ROOT))
31+
32+
from benchmarks.python import coverage_sdid # noqa: E402
33+
34+
35+
def test_stratified_survey_dgp_post_periods_cover_full_post_tail():
36+
"""The ``stratified_survey`` coverage DGP must not drop any post period.
37+
38+
Regression against PR #355 R13 P1: the harness previously hard-
39+
coded ``range(7, 12)`` as ``post_periods`` even though
40+
``generate_survey_did_data`` is 1-indexed and emits periods
41+
1..n_periods, so period 12 was silently included in the pre window.
42+
That contaminated the ``stratified_survey × bootstrap`` calibration
43+
row and every downstream REGISTRY claim that transcribes from the
44+
artifact. The fix derives ``post_periods`` from
45+
``df["period"].max()``; this test fails fast if a future refactor
46+
reintroduces the off-by-one.
47+
"""
48+
df, post_periods = coverage_sdid._stratified_survey_dgp(seed=0)
49+
50+
# Contract: post_periods covers the full tail from cohort onset
51+
# through df["period"].max(), with no gaps (unique + sorted +
52+
# contiguous + maxed at df["period"].max()).
53+
assert len(post_periods) == len(set(post_periods)), (
54+
f"post_periods has duplicates: {post_periods}"
55+
)
56+
assert post_periods == sorted(post_periods), (
57+
f"post_periods not sorted: {post_periods}"
58+
)
59+
gaps = [
60+
(a, b) for a, b in zip(post_periods, post_periods[1:]) if b - a != 1
61+
]
62+
assert not gaps, (
63+
f"post_periods has gaps: {gaps} in {post_periods}"
64+
)
65+
assert post_periods[-1] == int(df["period"].max()), (
66+
f"post_periods max {post_periods[-1]} != df[period].max() "
67+
f"{int(df['period'].max())} — DGP drops the last post period "
68+
"(off-by-one on 1-indexed generate_survey_did_data)."
69+
)
70+
# Strong form: cohort onset is 7, n_periods=12 → [7,8,9,10,11,12].
71+
assert post_periods == [7, 8, 9, 10, 11, 12], (
72+
f"post_periods={post_periods} must equal [7,8,9,10,11,12] for "
73+
"the documented 6-pre/6-post survey DGP; any other slice "
74+
"changes the calibration interpretation in REGISTRY."
75+
)

tests/test_methodology_sdid.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3519,46 +3519,3 @@ def test_coverage_artifacts_present(self):
35193519
"(strata/PSU/FPC raises NotImplementedError at fit-time)"
35203520
)
35213521

3522-
def test_stratified_survey_dgp_post_periods_cover_full_post_tail(self):
3523-
"""The ``stratified_survey`` coverage DGP must not drop any post period.
3524-
3525-
Regression against PR #355 R13 P1: the harness previously hard-
3526-
coded ``range(7, 12)`` as ``post_periods`` even though
3527-
``generate_survey_did_data`` is 1-indexed and emits periods
3528-
1..n_periods, so period 12 was silently included in the pre
3529-
window. That contaminated the ``stratified_survey × bootstrap``
3530-
calibration row and every downstream REGISTRY claim that
3531-
transcribes from the artifact. The fix derives
3532-
``post_periods`` from ``df["period"].max()``; this test fails
3533-
fast if a future refactor reintroduces the off-by-one.
3534-
"""
3535-
import importlib
3536-
coverage_sdid = importlib.import_module("benchmarks.python.coverage_sdid")
3537-
df, post_periods = coverage_sdid._stratified_survey_dgp(seed=0)
3538-
3539-
# Contract: post_periods covers the full tail from cohort onset
3540-
# through df["period"].max(), with no gaps (unique + sorted +
3541-
# contiguous + maxed at df["period"].max()).
3542-
assert len(post_periods) == len(set(post_periods)), (
3543-
f"post_periods has duplicates: {post_periods}"
3544-
)
3545-
assert post_periods == sorted(post_periods), (
3546-
f"post_periods not sorted: {post_periods}"
3547-
)
3548-
gaps = [
3549-
(a, b) for a, b in zip(post_periods, post_periods[1:]) if b - a != 1
3550-
]
3551-
assert not gaps, (
3552-
f"post_periods has gaps: {gaps} in {post_periods}"
3553-
)
3554-
assert post_periods[-1] == int(df["period"].max()), (
3555-
f"post_periods max {post_periods[-1]} != df[period].max() "
3556-
f"{int(df['period'].max())} — DGP drops the last post period "
3557-
"(off-by-one on 1-indexed generate_survey_did_data)."
3558-
)
3559-
# Strong form: cohort onset is 7, n_periods=12 → [7,8,9,10,11,12].
3560-
assert post_periods == [7, 8, 9, 10, 11, 12], (
3561-
f"post_periods={post_periods} must equal [7,8,9,10,11,12] for "
3562-
"the documented 6-pre/6-post survey DGP; any other slice "
3563-
"changes the calibration interpretation in REGISTRY."
3564-
)

0 commit comments

Comments
 (0)