Make all handlers fully estimator-specific, add negative assertions

igerber · claude · igerber · commit bea06a59c6d4 · 2026-03-28T17:17:39.000-04:00
Eliminate reuse of staggered templates for non-staggered estimators:

- SyntheticDiD/TROP: replace staggered _placebo_step with estimator-
  specific in-time/in-space placebo guidance (no control_group,
  anticipation, or cohort-dropping language)
- TripleDifference: rewrite to state the actual DDD identifying
  assumption (weaker than separate PT for two DiDs), remove the
  "requires PT along two dimensions" misstatement
- ContinuousDiD: already fixed (dose-specific guidance)

Tests: add negative content assertions ensuring SDiD/TROP don't
mention control_group/anticipation, CDiD and DDD don't emit
check_parallel_trends, and DDD doesn't claim "two dimensions".

Evaluation doc: tighten qualitative summary to match corrected
staggered-safe diagnostics.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py
@@ -416,7 +416,22 @@ def _handle_synthetic(results: Any):
             ),
             step_name="sensitivity",
         ),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="In-time or in-space placebo",
+            why=(
+                "Test robustness by re-estimating on a placebo treatment "
+                "period (in-time) or excluding treated units one at a time "
+                "(leave-one-out). These are the natural falsification "
+                "checks for synthetic control methods."
+            ),
+            code=(
+                "# In-time placebo: re-estimate with a fake treatment date\n"
+                "# Leave-one-out: drop each treated unit and re-estimate"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _step(
             baker_step=8,
             label="Compare with TROP or staggered estimators",
@@ -453,7 +468,22 @@ def _handle_trop(results: Any):
             ),
             step_name="sensitivity",
         ),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="In-time or in-space placebo",
+            why=(
+                "Test robustness by re-estimating on a placebo treatment "
+                "period or dropping treated units one at a time. These "
+                "are the natural falsification checks for factor-model "
+                "panel estimators."
+            ),
+            code=(
+                "# In-time placebo: re-estimate with a fake treatment date\n"
+                "# Leave-one-out: drop each treated unit and re-estimate"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _robustness_compare_step("SyntheticDiD or CS"),
     ]
     warnings = _check_nan_att(results)
@@ -537,30 +567,31 @@ def _handle_triple(results: Any):
     steps = [
         _step(
             baker_step=3,
-            label="Assess parallel trends for triple difference",
+            label="Assess DDD identifying assumption",
             why=(
-                "DDD requires parallel trends along two dimensions "
-                "(treatment eligibility and treatment exposure). The "
-                "generic check_parallel_trends() only tests a single "
-                "binary comparison. Inspect pre-treatment trends "
-                "separately for each dimension."
+                "DDD identification is weaker than requiring separate "
+                "parallel trends for two DiDs — it allows group-specific "
+                "and partition-specific PT violations as long as they "
+                "cancel in the triple difference. No built-in formal "
+                "test exists; inspect pre-treatment outcome patterns "
+                "across the treatment/eligibility/time cells."
             ),
             code=(
-                "# No built-in formal DDD PT test.\n"
-                "# Inspect pre-treatment trends in the treatment and\n"
-                "# eligibility dimensions separately."
+                "# No built-in formal DDD assumption test.\n"
+                "# Inspect pre-treatment means across treatment x eligibility\n"
+                "# cells to assess whether the DDD structure is plausible."
             ),
             step_name="parallel_trends",
         ),
         _step(
             baker_step=7,
-            label="Test within-group placebo",
+            label="Test placebo group",
             why=(
-                "DDD requires parallel trends along both dimensions. "
-                "Run placebo tests on the within-group (third difference) "
-                "dimension to verify."
+                "Re-estimate using a placebo eligibility group to check "
+                "whether the DDD result could be an artifact of the "
+                "group structure rather than the treatment."
             ),
-            code="# Re-estimate with a placebo group to test the third difference",
+            code="# Re-estimate with a placebo eligibility group",
             step_name="heterogeneity",
         ),
         _covariates_step(),
diff --git a/docs/practitioner-guide-evaluation.md b/docs/practitioner-guide-evaluation.md
@@ -93,9 +93,9 @@ of 9.6), so the model mix does not inflate the reported improvement.
 
 **After agents** consistently:
 - Structured their code around all 8 Baker steps explicitly
-- Ran formal pre-trends diagnostics (check_parallel_trends or CS event-study pre-periods)
+- Ran pre-trends diagnostics appropriate to design (CS event-study pre-periods for staggered)
 - Ran compute_honest_did() with specific M values
-- Ran run_all_placebo_tests()
+- Ran sensitivity/falsification checks (HonestDiD, specification comparisons)
 - Compared CS vs SA vs BJS
 - Called practitioner_next_steps(results)
 - Named specific PT variants (PT-GT-NYT, PT-GT-Nev)
diff --git a/tests/test_practitioner.py b/tests/test_practitioner.py
@@ -252,10 +252,18 @@ def test_synth_results(self, mock_synth_results):
         output = practitioner_next_steps(mock_synth_results, verbose=False)
         assert len(output["next_steps"]) > 0
         assert output["estimator"] == "SyntheticDiD"
+        # SDiD should NOT get staggered-specific guidance
+        all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
+        assert "control_group" not in all_text
+        assert "anticipation" not in all_text
 
     def test_trop_results(self, mock_trop_results):
         output = practitioner_next_steps(mock_trop_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # TROP should NOT get staggered-specific guidance
+        all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
+        assert "control_group" not in all_text
+        assert "anticipation" not in all_text
 
     def test_efficient_results(self, mock_efficient_results):
         output = practitioner_next_steps(mock_efficient_results, verbose=False)
@@ -264,10 +272,19 @@ def test_efficient_results(self, mock_efficient_results):
     def test_continuous_results(self, mock_continuous_results):
         output = practitioner_next_steps(mock_continuous_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # ContinuousDiD should NOT emit check_parallel_trends
+        all_text = " ".join(s.get("code", "") for s in output["next_steps"])
+        assert "check_parallel_trends" not in all_text
 
     def test_triple_results(self, mock_triple_results):
         output = practitioner_next_steps(mock_triple_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # DDD should NOT claim "requires PT along two dimensions"
+        all_text = " ".join(s.get("why", "") for s in output["next_steps"])
+        assert "two dimensions" not in all_text
+        assert "check_parallel_trends" not in " ".join(
+            s.get("code", "") for s in output["next_steps"]
+        )
 
 
 # ---------------------------------------------------------------------------