Skip to content

Commit bea06a5

Browse files
igerber and claude
committed
Make all handlers fully estimator-specific, add negative assertions
Eliminate reuse of staggered templates for non-staggered estimators: - SyntheticDiD/TROP: replace staggered _placebo_step with estimator-specific in-time/in-space placebo guidance (no control_group, anticipation, or cohort-dropping language) - TripleDifference: rewrite to state the actual DDD identifying assumption (weaker than separate PT for two DiDs), remove the "requires PT along two dimensions" misstatement - ContinuousDiD: already fixed (dose-specific guidance) Tests: add negative content assertions ensuring SDiD/TROP don't mention control_group/anticipation, CDiD doesn't emit check_parallel_trends, and DDD doesn't claim "two dimensions". Evaluation doc: tighten qualitative summary to match corrected staggered-safe diagnostics. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 51bcf36 commit bea06a5

3 files changed

Lines changed: 66 additions & 18 deletions

File tree

diff_diff/practitioner.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,22 @@ def _handle_synthetic(results: Any):
416416
),
417417
step_name="sensitivity",
418418
),
419-
_placebo_step(staggered=True),
419+
_step(
420+
baker_step=6,
421+
label="In-time or in-space placebo",
422+
why=(
423+
"Test robustness by re-estimating on a placebo treatment "
424+
"period (in-time) or excluding treated units one at a time "
425+
"(leave-one-out). These are the natural falsification "
426+
"checks for synthetic control methods."
427+
),
428+
code=(
429+
"# In-time placebo: re-estimate with a fake treatment date\n"
430+
"# Leave-one-out: drop each treated unit and re-estimate"
431+
),
432+
priority="medium",
433+
step_name="sensitivity",
434+
),
420435
_step(
421436
baker_step=8,
422437
label="Compare with TROP or staggered estimators",
@@ -453,7 +468,22 @@ def _handle_trop(results: Any):
453468
),
454469
step_name="sensitivity",
455470
),
456-
_placebo_step(staggered=True),
471+
_step(
472+
baker_step=6,
473+
label="In-time or in-space placebo",
474+
why=(
475+
"Test robustness by re-estimating on a placebo treatment "
476+
"period or dropping treated units one at a time. These "
477+
"are the natural falsification checks for factor-model "
478+
"panel estimators."
479+
),
480+
code=(
481+
"# In-time placebo: re-estimate with a fake treatment date\n"
482+
"# Leave-one-out: drop each treated unit and re-estimate"
483+
),
484+
priority="medium",
485+
step_name="sensitivity",
486+
),
457487
_robustness_compare_step("SyntheticDiD or CS"),
458488
]
459489
warnings = _check_nan_att(results)
@@ -537,30 +567,31 @@ def _handle_triple(results: Any):
537567
steps = [
538568
_step(
539569
baker_step=3,
540-
label="Assess parallel trends for triple difference",
570+
label="Assess DDD identifying assumption",
541571
why=(
542-
"DDD requires parallel trends along two dimensions "
543-
"(treatment eligibility and treatment exposure). The "
544-
"generic check_parallel_trends() only tests a single "
545-
"binary comparison. Inspect pre-treatment trends "
546-
"separately for each dimension."
572+
"DDD identification is weaker than requiring separate "
573+
"parallel trends for two DiDs — it allows group-specific "
574+
"and partition-specific PT violations as long as they "
575+
"cancel in the triple difference. No built-in formal "
576+
"test exists; inspect pre-treatment outcome patterns "
577+
"across the treatment/eligibility/time cells."
547578
),
548579
code=(
549-
"# No built-in formal DDD PT test.\n"
550-
"# Inspect pre-treatment trends in the treatment and\n"
551-
"# eligibility dimensions separately."
580+
"# No built-in formal DDD assumption test.\n"
581+
"# Inspect pre-treatment means across treatment x eligibility\n"
582+
"# cells to assess whether the DDD structure is plausible."
552583
),
553584
step_name="parallel_trends",
554585
),
555586
_step(
556587
baker_step=7,
557-
label="Test within-group placebo",
588+
label="Test placebo group",
558589
why=(
559-
"DDD requires parallel trends along both dimensions. "
560-
"Run placebo tests on the within-group (third difference) "
561-
"dimension to verify."
590+
"Re-estimate using a placebo eligibility group to check "
591+
"whether the DDD result could be an artifact of the "
592+
"group structure rather than the treatment."
562593
),
563-
code="# Re-estimate with a placebo group to test the third difference",
594+
code="# Re-estimate with a placebo eligibility group",
564595
step_name="heterogeneity",
565596
),
566597
_covariates_step(),

docs/practitioner-guide-evaluation.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@ of 9.6), so the model mix does not inflate the reported improvement.
9393

9494
**After agents** consistently:
9595
- Structured their code around all 8 Baker steps explicitly
96-
- Ran formal pre-trends diagnostics (check_parallel_trends or CS event-study pre-periods)
96+
- Ran pre-trends diagnostics appropriate to design (CS event-study pre-periods for staggered)
9797
- Ran compute_honest_did() with specific M values
98-
- Ran run_all_placebo_tests()
98+
- Ran sensitivity/falsification checks (HonestDiD, specification comparisons)
9999
- Compared CS vs SA vs BJS
100100
- Called practitioner_next_steps(results)
101101
- Named specific PT variants (PT-GT-NYT, PT-GT-Nev)

tests/test_practitioner.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,10 +252,18 @@ def test_synth_results(self, mock_synth_results):
252252
output = practitioner_next_steps(mock_synth_results, verbose=False)
253253
assert len(output["next_steps"]) > 0
254254
assert output["estimator"] == "SyntheticDiD"
255+
# SDiD should NOT get staggered-specific guidance
256+
all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
257+
assert "control_group" not in all_text
258+
assert "anticipation" not in all_text
255259

256260
def test_trop_results(self, mock_trop_results):
257261
output = practitioner_next_steps(mock_trop_results, verbose=False)
258262
assert len(output["next_steps"]) > 0
263+
# TROP should NOT get staggered-specific guidance
264+
all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
265+
assert "control_group" not in all_text
266+
assert "anticipation" not in all_text
259267

260268
def test_efficient_results(self, mock_efficient_results):
261269
output = practitioner_next_steps(mock_efficient_results, verbose=False)
@@ -264,10 +272,19 @@ def test_efficient_results(self, mock_efficient_results):
264272
def test_continuous_results(self, mock_continuous_results):
265273
output = practitioner_next_steps(mock_continuous_results, verbose=False)
266274
assert len(output["next_steps"]) > 0
275+
# ContinuousDiD should NOT emit check_parallel_trends
276+
all_text = " ".join(s.get("code", "") for s in output["next_steps"])
277+
assert "check_parallel_trends" not in all_text
267278

268279
def test_triple_results(self, mock_triple_results):
269280
output = practitioner_next_steps(mock_triple_results, verbose=False)
270281
assert len(output["next_steps"]) > 0
282+
# DDD should NOT claim "requires PT along two dimensions"
283+
all_text = " ".join(s.get("why", "") for s in output["next_steps"])
284+
assert "two dimensions" not in all_text
285+
assert "check_parallel_trends" not in " ".join(
286+
s.get("code", "") for s in output["next_steps"]
287+
)
271288

272289

273290
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)