Skip to content

Commit faa97bc

Browse files
igerberclaude
andcommitted
Address PR #408 R3 review (1 P2 deterministic stale-vs-final df forcing)
R3 P2: the previous mock-based regression checks the helper is called with the final df, but if no later IF site reduces n_valid relative to per-path snapshots, snapshot df coincides with final df and the assertion is vacuous. Add `test_per_path_inference_refreshes_to_lower_final_df`: deter- ministic forcing function via `mock.patch.object` on `_compute_se`, gated by a flag that flips after `_compute_path_effects` returns. After the flag is set, every subsequent `_compute_se` call returns a hardcoded low `n_valid=5` — so global placebo / overall / joiners / leavers all append 5, while per-path effects already snapshotted a high df from the unmodified pre-flag calls. Final `survey_metadata.df_survey = 5 - 1 = 4` is strictly less than the per-path snapshot df, forcing the refresh to demonstrably move per-path inference from the high snapshot df to the low final df. Sanity-checked: temporarily replacing `_refresh_path_inference` with a no-op causes the new test to fail with a stale-p_value assertion, confirming bug-detection. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e304c10 commit faa97bc

1 file changed

Lines changed: 125 additions & 0 deletions

File tree

tests/test_chaisemartin_dhaultfoeuille.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9467,6 +9467,131 @@ def test_per_path_replicate_se_finite(self):
94679467
any_finite = True
94689468
assert any_finite
94699469

9470+
@pytest.mark.slow
9471+
def test_per_path_inference_refreshes_to_lower_final_df(self):
9472+
"""Deterministic stale-vs-final df regression.
9473+
9474+
Forces a later IF site to return a smaller ``n_valid`` than the
9475+
per-path snapshot via monkeypatch on ``_compute_se``: a flag is
9476+
set after ``_compute_path_effects`` returns, and any subsequent
9477+
``_compute_se`` call (global placebo / overall / joiners /
9478+
leavers) returns a hardcoded low ``n_valid``. Per-path effects
9479+
therefore snapshot a HIGH df at their call site, while the
9480+
final ``_replicate_n_valid_list`` is bounded by the lowered
9481+
post-per-path appends, producing a strictly smaller final df.
9482+
9483+
Without ``_refresh_path_inference()`` running from the final
9484+
block, per-path effect inference would retain the stale high
9485+
df. This test asserts every populated per-path entry's
9486+
``t_stat`` / ``p_value`` / ``conf_int`` matches
9487+
``safe_inference(effect, se, df=results.survey_metadata.df_survey)``
9488+
(the LOW final df), proving the refresh moved the values to
9489+
the post-append df.
9490+
9491+
Regression for PR #408 R1 P1 / R3 P2 (deterministic version).
9492+
"""
9493+
import importlib
9494+
import unittest.mock as _mock
9495+
9496+
from diff_diff.survey import SurveyDesign
9497+
from diff_diff.utils import safe_inference
9498+
9499+
_cd_mod = importlib.import_module("diff_diff.chaisemartin_dhaultfoeuille")
9500+
9501+
df = _by_path_survey_data()
9502+
n_obs = len(df)
9503+
rng = np.random.default_rng(7)
9504+
# Use enough replicate columns so the natural n_valid is large
9505+
# and our forced low n_valid is detectably smaller.
9506+
rep_cols = [f"rep_{i}" for i in range(20)]
9507+
for col in rep_cols:
9508+
df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs))
9509+
sd = SurveyDesign(
9510+
weights="survey_weights",
9511+
replicate_weights=rep_cols,
9512+
replicate_method="JK1",
9513+
replicate_scale=1.0,
9514+
)
9515+
est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False)
9516+
9517+
real_compute_se = _cd_mod._compute_se
9518+
real_path_effects = _cd_mod._compute_path_effects
9519+
post_path_flag = [False]
9520+
forced_low_n_valid = 5
9521+
9522+
def wrapped_path_effects(*args, **kwargs):
9523+
result = real_path_effects(*args, **kwargs)
9524+
post_path_flag[0] = True
9525+
return result
9526+
9527+
def wrapped_compute_se(*args, **kwargs):
9528+
se, n_valid = real_compute_se(*args, **kwargs)
9529+
if post_path_flag[0] and n_valid is not None and n_valid > forced_low_n_valid:
9530+
return se, forced_low_n_valid
9531+
return se, n_valid
9532+
9533+
with _mock.patch.object(
9534+
_cd_mod, "_compute_path_effects",
9535+
side_effect=wrapped_path_effects,
9536+
), _mock.patch.object(
9537+
_cd_mod, "_compute_se",
9538+
side_effect=wrapped_compute_se,
9539+
):
9540+
with warnings.catch_warnings():
9541+
warnings.simplefilter("ignore", UserWarning)
9542+
res = est.fit(
9543+
df, outcome="outcome", group="group", time="period",
9544+
treatment="treatment", L_max=3, survey_design=sd,
9545+
)
9546+
9547+
# The forced low n_valid (5) at later IF sites bounds the final
9548+
# effective df at 5 - 1 = 4. JK1 / replicate convention:
9549+
# df_survey = min(n_valid) - 1.
9550+
expected_low_df = forced_low_n_valid - 1
9551+
assert res.survey_metadata is not None
9552+
assert res.survey_metadata.df_survey == expected_low_df, (
9553+
f"Expected forced final df={expected_low_df}, got "
9554+
f"{res.survey_metadata.df_survey}. The monkeypatch did not "
9555+
f"force a divergence — adjust forced_low_n_valid or fixture."
9556+
)
9557+
9558+
# Per-path effects entries snapshot df at fit-time BEFORE the
9559+
# forced lowering kicked in (so their snapshot df > final df).
9560+
# If `_refresh_path_inference` runs from the final block, every
9561+
# entry's t_stat / p_value / conf_int is recomputed at the low
9562+
# final df. If the helper is called from an earlier block (the
9563+
# bug), per-path effects keep the stale high-df inference.
9564+
assert res.path_effects is not None
9565+
any_compared = False
9566+
for path, entry in res.path_effects.items():
9567+
for l_h, vals in entry["horizons"].items():
9568+
if vals["n_obs"] == 0 or not np.isfinite(vals["se"]):
9569+
continue
9570+
t_final, p_final, ci_final = safe_inference(
9571+
vals["effect"], vals["se"],
9572+
alpha=est.alpha, df=expected_low_df,
9573+
)
9574+
np.testing.assert_allclose(
9575+
vals["t_stat"], t_final, atol=1e-12,
9576+
err_msg=(
9577+
f"path={path} l={l_h}: t_stat reflects stale "
9578+
f"snapshot df, not final df={expected_low_df}"
9579+
),
9580+
)
9581+
np.testing.assert_allclose(
9582+
vals["p_value"], p_final, atol=1e-12,
9583+
err_msg=f"path={path} l={l_h}: p_value stale",
9584+
)
9585+
np.testing.assert_allclose(
9586+
vals["conf_int"], ci_final, atol=1e-12,
9587+
err_msg=f"path={path} l={l_h}: conf_int stale",
9588+
)
9589+
any_compared = True
9590+
assert any_compared, (
9591+
"No per-path effects entry had finite SE — forcing function "
9592+
"did not exercise the refresh path."
9593+
)
9594+
94709595
@pytest.mark.slow
94719596
def test_refresh_path_inference_called_from_final_block(self):
94729597
"""Pin the helper's call site to the final R2 P1b block.

0 commit comments

Comments
 (0)