Eliminate shared staggered placebo template entirely

igerber · claude · igerber · commit 5d6b11c09753 · 2026-03-28T17:29:38.000-04:00
Replace all remaining _placebo_step(staggered=True) calls with
estimator-specific Step 6 guidance:

- SA: control_group comparison (never_treated vs not_yet_treated)
- ImputationDiD: no control_group param — cohort subsetting +
  cross-estimator comparison
- TwoStageDiD: same as Imputation (no control_group)
- StackedDiD: clean_control variation (not control_group)
- EfficientDiD: never_treated vs last_cohort (not not_yet_treated)

Tests: negative content assertions on code snippets for Imputation,
TwoStage (no control_group in code), Stacked (clean_control),
Efficient (no not_yet_treated in code).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py
@@ -348,7 +348,21 @@ def _handle_cs(results: Any):
 def _handle_sa(results: Any):
     steps = [
         _parallel_trends_step(staggered=True),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="Specification-based falsification",
+            why=(
+                "Compare results across control group definitions "
+                "(never_treated vs not_yet_treated) and anticipation "
+                "settings to assess robustness."
+            ),
+            code=(
+                "# Re-estimate with different control group / anticipation:\n"
+                "# sa_alt = SunAbraham(control_group='not_yet_treated')"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _robustness_compare_step("CS, BJS, or Gardner"),
         _covariates_step(),
     ]
@@ -359,7 +373,22 @@ def _handle_sa(results: Any):
 def _handle_imputation(results: Any):
     steps = [
         _parallel_trends_step(staggered=True),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="Specification-based falsification",
+            why=(
+                "ImputationDiD does not have a control_group parameter. "
+                "Compare results with and without covariates, vary the "
+                "sample (drop cohorts), and compare with CS/SA as "
+                "falsification checks."
+            ),
+            code=(
+                "# Compare with alternative estimators as robustness:\n"
+                "# Leave-one-cohort-out sensitivity analysis"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _robustness_compare_step("CS, SA, or Gardner"),
         _covariates_step(),
     ]
@@ -370,7 +399,22 @@ def _handle_imputation(results: Any):
 def _handle_two_stage(results: Any):
     steps = [
         _parallel_trends_step(staggered=True),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="Specification-based falsification",
+            why=(
+                "TwoStageDiD does not have a control_group parameter. "
+                "Compare results with and without covariates, vary the "
+                "sample (drop cohorts), and compare with CS/SA as "
+                "falsification checks."
+            ),
+            code=(
+                "# Compare with alternative estimators as robustness:\n"
+                "# Leave-one-cohort-out sensitivity analysis"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _robustness_compare_step("CS, BJS, or SA"),
         _covariates_step(),
     ]
@@ -381,7 +425,21 @@ def _handle_two_stage(results: Any):
 def _handle_stacked(results: Any):
     steps = [
         _parallel_trends_step(staggered=True),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="Vary clean control definition",
+            why=(
+                "StackedDiD uses clean_control parameter (not control_group). "
+                "Compare results with different clean control definitions "
+                "and event window widths as falsification."
+            ),
+            code=(
+                "# Re-estimate with different clean_control settings:\n"
+                "# stacked_alt = StackedDiD(clean_control='not_yet_treated')"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _step(
             baker_step=7,
             label="Check sub-experiment balance",
@@ -493,7 +551,21 @@ def _handle_trop(results: Any):
 def _handle_efficient(results: Any):
     steps = [
         _parallel_trends_step(staggered=True),
-        _placebo_step(staggered=True),
+        _step(
+            baker_step=6,
+            label="Compare control group definitions",
+            why=(
+                "EfficientDiD supports never_treated and last_cohort "
+                "control groups (not not_yet_treated). Compare results "
+                "across both to assess robustness."
+            ),
+            code=(
+                "# Re-estimate with alternative control group:\n"
+                "# edid_alt = EfficientDiD(control_group='last_cohort')"
+            ),
+            priority="medium",
+            step_name="sensitivity",
+        ),
         _step(
             baker_step=7,
             label="Run Hausman pretest (PT-All vs PT-Post)",
diff --git a/docs/practitioner-guide-evaluation.md b/docs/practitioner-guide-evaluation.md
@@ -56,15 +56,15 @@ of 9.6), so the model mix does not inflate the reported improvement.
 | S3: Test parallel trends | 0.1 | 2.0 | **+1.9** | From near-zero to universal formal PT testing |
 | S4: Choose estimator | 2.0 | 2.0 | 0.0 | Already perfect before |
 | S5: Estimate (cluster check) | 1.0 | 1.5 | +0.5 | Now discuss wild bootstrap alternative |
-| S6: Sensitivity | **0.1** | **2.0** | **+1.9** | From near-zero to universal HonestDiD + placebo |
+| S6: Sensitivity | **0.1** | **2.0** | **+1.9** | From near-zero to universal HonestDiD + falsification checks |
 | S7: Heterogeneity | 1.4 | 2.0 | +0.6 | Now consistently do group + event study |
 | S8: Robustness | 0.9 | 1.75 | +0.85 | Now compare 3 estimators; ~50% add with/without covariates |
 
 ### Key Findings
 
 1. **Sensitivity analysis (Step 6) showed the largest improvement**: 0.1 to 2.0
-   (+1.9 points). Before, 0/10 agents ran HonestDiD or placebo tests. After,
-   10/10 agents ran both. This was the primary gap the guide was designed to fix.
+   (+1.9 points). Before, 0/10 agents ran HonestDiD or sensitivity checks.
+   After, 10/10 ran HonestDiD and/or specification-based falsification.
 
 2. **Target parameter and assumptions (Steps 1-2) went from partial to full**:
    Before, agents mentioned "ATT" generically. After, they explicitly name the
diff --git a/tests/test_practitioner.py b/tests/test_practitioner.py
@@ -239,14 +239,23 @@ def test_sa_results(self, mock_sa_results):
     def test_imputation_results(self, mock_imputation_results):
         output = practitioner_next_steps(mock_imputation_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # ImputationDiD has no control_group parameter — code snippets must not use it
+        all_code = " ".join(s.get("code", "") for s in output["next_steps"])
+        assert "control_group" not in all_code
 
     def test_two_stage_results(self, mock_two_stage_results):
         output = practitioner_next_steps(mock_two_stage_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # TwoStageDiD has no control_group parameter — code snippets must not use it
+        all_code = " ".join(s.get("code", "") for s in output["next_steps"])
+        assert "control_group" not in all_code
 
     def test_stacked_results(self, mock_stacked_results):
         output = practitioner_next_steps(mock_stacked_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # StackedDiD uses clean_control, not control_group
+        all_text = " ".join(s.get("code", "") + s.get("why", "") for s in output["next_steps"])
+        assert "not_yet_treated" not in all_text or "clean_control" in all_text
 
     def test_synth_results(self, mock_synth_results):
         output = practitioner_next_steps(mock_synth_results, verbose=False)
@@ -268,6 +277,9 @@ def test_trop_results(self, mock_trop_results):
     def test_efficient_results(self, mock_efficient_results):
         output = practitioner_next_steps(mock_efficient_results, verbose=False)
         assert len(output["next_steps"]) > 0
+        # EfficientDiD uses never_treated/last_cohort — code must not suggest not_yet_treated
+        all_code = " ".join(s.get("code", "") for s in output["next_steps"])
+        assert "not_yet_treated" not in all_code
 
     def test_continuous_results(self, mock_continuous_results):
         output = practitioner_next_steps(mock_continuous_results, verbose=False)