Skip to content

Commit 5d6b11c

Browse files
igerber and claude committed
Eliminate shared staggered placebo template entirely
Replace all remaining _placebo_step(staggered=True) calls with estimator-specific Step 6 guidance: - SA: control_group comparison (never_treated vs not_yet_treated) - ImputationDiD: no control_group param — cohort subsetting + cross-estimator comparison - TwoStageDiD: same as Imputation (no control_group) - StackedDiD: clean_control variation (not control_group) - EfficientDiD: never_treated vs last_cohort (not not_yet_treated) Tests: negative content assertions on code snippets for Imputation, TwoStage (no control_group in code), Stacked (clean_control), Efficient (no not_yet_treated in code). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bea06a5 commit 5d6b11c

3 files changed

Lines changed: 92 additions & 8 deletions

File tree

diff_diff/practitioner.py

Lines changed: 77 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -348,7 +348,21 @@ def _handle_cs(results: Any):
348348
def _handle_sa(results: Any):
349349
steps = [
350350
_parallel_trends_step(staggered=True),
351-
_placebo_step(staggered=True),
351+
_step(
352+
baker_step=6,
353+
label="Specification-based falsification",
354+
why=(
355+
"Compare results across control group definitions "
356+
"(never_treated vs not_yet_treated) and anticipation "
357+
"settings to assess robustness."
358+
),
359+
code=(
360+
"# Re-estimate with different control group / anticipation:\n"
361+
"# sa_alt = SunAbraham(control_group='not_yet_treated')"
362+
),
363+
priority="medium",
364+
step_name="sensitivity",
365+
),
352366
_robustness_compare_step("CS, BJS, or Gardner"),
353367
_covariates_step(),
354368
]
@@ -359,7 +373,22 @@ def _handle_sa(results: Any):
359373
def _handle_imputation(results: Any):
360374
steps = [
361375
_parallel_trends_step(staggered=True),
362-
_placebo_step(staggered=True),
376+
_step(
377+
baker_step=6,
378+
label="Specification-based falsification",
379+
why=(
380+
"ImputationDiD does not have a control_group parameter. "
381+
"Compare results with and without covariates, vary the "
382+
"sample (drop cohorts), and compare with CS/SA as "
383+
"falsification checks."
384+
),
385+
code=(
386+
"# Compare with alternative estimators as robustness:\n"
387+
"# Leave-one-cohort-out sensitivity analysis"
388+
),
389+
priority="medium",
390+
step_name="sensitivity",
391+
),
363392
_robustness_compare_step("CS, SA, or Gardner"),
364393
_covariates_step(),
365394
]
@@ -370,7 +399,22 @@ def _handle_imputation(results: Any):
370399
def _handle_two_stage(results: Any):
371400
steps = [
372401
_parallel_trends_step(staggered=True),
373-
_placebo_step(staggered=True),
402+
_step(
403+
baker_step=6,
404+
label="Specification-based falsification",
405+
why=(
406+
"TwoStageDiD does not have a control_group parameter. "
407+
"Compare results with and without covariates, vary the "
408+
"sample (drop cohorts), and compare with CS/SA as "
409+
"falsification checks."
410+
),
411+
code=(
412+
"# Compare with alternative estimators as robustness:\n"
413+
"# Leave-one-cohort-out sensitivity analysis"
414+
),
415+
priority="medium",
416+
step_name="sensitivity",
417+
),
374418
_robustness_compare_step("CS, BJS, or SA"),
375419
_covariates_step(),
376420
]
@@ -381,7 +425,21 @@ def _handle_two_stage(results: Any):
381425
def _handle_stacked(results: Any):
382426
steps = [
383427
_parallel_trends_step(staggered=True),
384-
_placebo_step(staggered=True),
428+
_step(
429+
baker_step=6,
430+
label="Vary clean control definition",
431+
why=(
432+
"StackedDiD uses clean_control parameter (not control_group). "
433+
"Compare results with different clean control definitions "
434+
"and event window widths as falsification."
435+
),
436+
code=(
437+
"# Re-estimate with different clean_control settings:\n"
438+
"# stacked_alt = StackedDiD(clean_control='not_yet_treated')"
439+
),
440+
priority="medium",
441+
step_name="sensitivity",
442+
),
385443
_step(
386444
baker_step=7,
387445
label="Check sub-experiment balance",
@@ -493,7 +551,21 @@ def _handle_trop(results: Any):
493551
def _handle_efficient(results: Any):
494552
steps = [
495553
_parallel_trends_step(staggered=True),
496-
_placebo_step(staggered=True),
554+
_step(
555+
baker_step=6,
556+
label="Compare control group definitions",
557+
why=(
558+
"EfficientDiD supports never_treated and last_cohort "
559+
"control groups (not not_yet_treated). Compare results "
560+
"across both to assess robustness."
561+
),
562+
code=(
563+
"# Re-estimate with alternative control group:\n"
564+
"# edid_alt = EfficientDiD(control_group='last_cohort')"
565+
),
566+
priority="medium",
567+
step_name="sensitivity",
568+
),
497569
_step(
498570
baker_step=7,
499571
label="Run Hausman pretest (PT-All vs PT-Post)",

docs/practitioner-guide-evaluation.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -56,15 +56,15 @@ of 9.6), so the model mix does not inflate the reported improvement.
5656
| S3: Test parallel trends | 0.1 | 2.0 | **+1.9** | From near-zero to universal formal PT testing |
5757
| S4: Choose estimator | 2.0 | 2.0 | 0.0 | Already perfect before |
5858
| S5: Estimate (cluster check) | 1.0 | 1.5 | +0.5 | Now discuss wild bootstrap alternative |
59-
| S6: Sensitivity | **0.1** | **2.0** | **+1.9** | From near-zero to universal HonestDiD + placebo |
59+
| S6: Sensitivity | **0.1** | **2.0** | **+1.9** | From near-zero to universal HonestDiD + falsification checks |
6060
| S7: Heterogeneity | 1.4 | 2.0 | +0.6 | Now consistently do group + event study |
6161
| S8: Robustness | 0.9 | 1.75 | +0.85 | Now compare 3 estimators; ~50% add with/without covariates |
6262

6363
### Key Findings
6464

6565
1. **Sensitivity analysis (Step 6) showed the largest improvement**: 0.1 to 2.0
66-
(+1.9 points). Before, 0/10 agents ran HonestDiD or placebo tests. After,
67-
10/10 agents ran both. This was the primary gap the guide was designed to fix.
66+
(+1.9 points). Before, 0/10 agents ran HonestDiD or sensitivity checks.
67+
After, 10/10 ran HonestDiD and/or specification-based falsification.
6868

6969
2. **Target parameter and assumptions (Steps 1-2) went from partial to full**:
7070
Before, agents mentioned "ATT" generically. After, they explicitly name the

tests/test_practitioner.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -239,14 +239,23 @@ def test_sa_results(self, mock_sa_results):
239239
def test_imputation_results(self, mock_imputation_results):
240240
output = practitioner_next_steps(mock_imputation_results, verbose=False)
241241
assert len(output["next_steps"]) > 0
242+
# ImputationDiD has no control_group parameter — code snippets must not use it
243+
all_code = " ".join(s.get("code", "") for s in output["next_steps"])
244+
assert "control_group" not in all_code
242245

243246
def test_two_stage_results(self, mock_two_stage_results):
244247
output = practitioner_next_steps(mock_two_stage_results, verbose=False)
245248
assert len(output["next_steps"]) > 0
249+
# TwoStageDiD has no control_group parameter — code snippets must not use it
250+
all_code = " ".join(s.get("code", "") for s in output["next_steps"])
251+
assert "control_group" not in all_code
246252

247253
def test_stacked_results(self, mock_stacked_results):
248254
output = practitioner_next_steps(mock_stacked_results, verbose=False)
249255
assert len(output["next_steps"]) > 0
256+
# StackedDiD uses clean_control, not control_group
257+
all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
258+
assert "not_yet_treated" not in all_text or "clean_control" in all_text
250259

251260
def test_synth_results(self, mock_synth_results):
252261
output = practitioner_next_steps(mock_synth_results, verbose=False)
@@ -268,6 +277,9 @@ def test_trop_results(self, mock_trop_results):
268277
def test_efficient_results(self, mock_efficient_results):
269278
output = practitioner_next_steps(mock_efficient_results, verbose=False)
270279
assert len(output["next_steps"]) > 0
280+
# EfficientDiD uses never_treated/last_cohort — code must not suggest not_yet_treated
281+
all_code = " ".join(s.get("code", "") for s in output["next_steps"])
282+
assert "not_yet_treated" not in all_code
271283

272284
def test_continuous_results(self, mock_continuous_results):
273285
output = practitioner_next_steps(mock_continuous_results, verbose=False)

0 commit comments

Comments
 (0)