Address AI review findings on practitioner guardrails

igerber · claude · igerber · commit f34254254dfb · 2026-03-28T14:54:56.000-04:00
P1 fixes:
- Relabel workflow as "based on Baker et al. 2025" (not a 1:1 mapping)
- Add REGISTRY.md deviation notes for reorganized steps and diff-diff conventions
- Gate HonestDiD guidance to MultiPeriodDiD and CallawaySantAnna only
  (the only types supported by compute_honest_did)
- Add aggregate='event_study' requirement for CS + HonestDiD

P2 fixes:
- Fix attribute names: pre_treatment_rmse -> pre_treatment_fit,
  sub_experiments -> n_sub_experiments/stacked_data
- Fix EfficientDiD Hausman pretest (estimator method, not results attr)
- Fix equivalence_test_trends param: threshold -> equivalence_margin
- Fix _covariates_step snippet to note .att vs .overall_att difference
- Fix test mocks to use correct attributes (overall_att/overall_se for
  staggered result types)

P3 fix:
- Replace Unicode box-drawing chars with ASCII in decision tree

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py
@@ -217,7 +217,8 @@ def _covariates_step() -> Dict[str, Any]:
         code=(
             "# Re-estimate without covariates and compare:\n"
             "result_no_cov = estimator.fit(data, ..., covariates=None)\n"
-            "print(f'Without covariates: ATT={result_no_cov.att:.4f}')"
+            "# Compare ATT with and without covariates.\n"
+            "# Use .att (basic DiD) or .overall_att (staggered estimators)."
         ),
         priority="medium",
         step_name="robustness",
@@ -265,7 +266,23 @@ def _handle_multi_period(results: Any):
 def _handle_cs(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _step(
+            baker_step=6,
+            label="Run HonestDiD sensitivity analysis",
+            why=(
+                "Bounds the treatment effect under plausible violations of "
+                "parallel trends. Requires event study effects — refit with "
+                "aggregate='event_study' or 'all' if not already done."
+            ),
+            code=(
+                "from diff_diff import compute_honest_did\n"
+                "# CS results must have event_study_effects:\n"
+                "results = cs.fit(data, ..., aggregate='event_study')\n"
+                "honest = compute_honest_did(results, method='relative_magnitude', M=1.0)\n"
+                "print(honest.summary())"
+            ),
+            step_name="sensitivity",
+        ),
         _step(
             baker_step=7,
             label="Examine group and event study effects",
@@ -292,7 +309,7 @@ def _handle_cs(results: Any):
 def _handle_sa(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _placebo_step(),
         _robustness_compare_step("CS, BJS, or Gardner"),
         _covariates_step(),
     ]
@@ -303,7 +320,7 @@ def _handle_sa(results: Any):
 def _handle_imputation(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _placebo_step(),
         _robustness_compare_step("CS, SA, or Gardner"),
         _covariates_step(),
     ]
@@ -314,7 +331,7 @@ def _handle_imputation(results: Any):
 def _handle_two_stage(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _placebo_step(),
         _robustness_compare_step("CS, BJS, or SA"),
         _covariates_step(),
     ]
@@ -325,15 +342,15 @@ def _handle_two_stage(results: Any):
 def _handle_stacked(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _placebo_step(),
         _step(
             baker_step=7,
             label="Check sub-experiment balance",
             why=(
                 "Stacked DiD constructs sub-experiments for each cohort. "
                 "Verify that each sub-experiment has sufficient controls."
             ),
-            code="# Inspect results.sub_experiments for balance",
+            code="# Check results.n_sub_experiments and inspect results.stacked_data",
             priority="medium",
             step_name="heterogeneity",
         ),
@@ -354,8 +371,8 @@ def _handle_synthetic(results: Any):
                 "approximate the counterfactual well."
             ),
             code=(
-                "# Check pre-treatment RMSE and unit weight concentration:\n"
-                "print(f'Pre-treatment RMSE: {results.pre_treatment_rmse:.4f}')\n"
+                "# Check pre-treatment fit and unit weight concentration:\n"
+                "print(f'Pre-treatment fit (RMSE): {results.pre_treatment_fit:.4f}')\n"
                 "# Highly concentrated weights suggest fragile estimates"
             ),
             step_name="sensitivity",
@@ -407,28 +424,24 @@ def _handle_trop(results: Any):
 def _handle_efficient(results: Any):
     steps = [
         _parallel_trends_step(),
-        _honest_did_step(),
+        _placebo_step(),
+        _step(
+            baker_step=7,
+            label="Run Hausman pretest (PT-All vs PT-Post)",
+            why=(
+                "EfficientDiD supports both PT-All and PT-Post assumptions. "
+                "The Hausman pretest compares them — report which was selected."
+            ),
+            code=(
+                "# Hausman pretest is an estimator method, not a results attribute:\n"
+                "# edid = EfficientDiD()\n"
+                "# results = edid.fit(data, ..., run_pretest=True)"
+            ),
+            step_name="heterogeneity",
+        ),
+        _robustness_compare_step("CS, SA, or BJS"),
+        _covariates_step(),
     ]
-    # Check for Hausman pretest
-    hausman = getattr(results, "hausman_pretest", None)
-    if hausman is not None:
-        steps.append(
-            _step(
-                baker_step=7,
-                label="Report Hausman pretest result",
-                why=(
-                    "The Hausman pretest compares PT-All vs PT-Post "
-                    "assumptions. Report which was selected and why."
-                ),
-                code=(
-                    f"# Hausman test p-value: {getattr(hausman, 'p_value', 'N/A')}\n"
-                    f"# Recommendation: {getattr(hausman, 'recommendation', 'N/A')}"
-                ),
-                step_name="heterogeneity",
-            )
-        )
-    steps.append(_robustness_compare_step("CS, SA, or BJS"))
-    steps.append(_covariates_step())
     warnings = _check_nan_att(results)
     return steps, warnings
 
diff --git a/docs/llms-full.txt b/docs/llms-full.txt
@@ -31,7 +31,7 @@ print(f"ATT: {results.att:.3f} (SE: {results.se:.3f})")
 - **Results objects**: Rich dataclass containers with `summary()`, `to_dict()`, `to_dataframe()`.
 - **Estimator aliases**: Short names available (e.g., `DiD`, `CS`, `SA`, `BJS`, `Gardner`, `SDiD`, `TWFE`, `DDD`, `CDiD`, `EDiD`, `Stacked`, `Bacon`).
 
-## Practitioner Workflow (Baker et al. 2025)
+## Practitioner Workflow (based on Baker et al. 2025)
 
 For rigorous DiD analysis, follow the 8-step framework in docs/llms-practitioner.txt.
 After estimation, call:
diff --git a/docs/llms-practitioner.txt b/docs/llms-practitioner.txt
@@ -1,8 +1,9 @@
-# diff-diff Practitioner Guide (Baker et al. 2025)
+# diff-diff Practitioner Guide
 
-> This guide maps Baker et al. (2025) "Difference-in-Differences Designs: A
-> Practitioner's Guide" to concrete diff-diff API calls. It ensures rigorous
-> causal inference by following an 8-step empiricist workflow.
+> An 8-step workflow for rigorous Difference-in-Differences analysis, based on
+> Baker et al. (2025) "Difference-in-Differences Designs: A Practitioner's
+> Guide" and adapted for the diff-diff library. Some steps are reorganized or
+> extended relative to the paper (see docs/methodology/REGISTRY.md for details).
 
 ## Instructions for AI Agents
 
@@ -90,7 +91,7 @@ print(f"Difference p-value: {pt_result['p_value']:.4f}")
 # Equivalence test (TOST) — tests that trends are meaningfully similar
 equiv = equivalence_test_trends(
     data, outcome='y', time='period', treatment_group='treated',
-    pre_periods=[1, 2, 3], threshold=0.5
+    pre_periods=[1, 2, 3], equivalence_margin=0.5
 )
 ```
 
@@ -106,26 +107,26 @@ Use this decision tree to select the appropriate estimator:
 
 ```
 Is treatment adoption staggered (multiple cohorts, different timing)?
-├── YES: Do NOT use plain TWFE. Use one of:
-│   ├── CallawaySantAnna (CS)  — most general, doubly robust, recommended default
-│   ├── SunAbraham (SA)        — interaction-weighted, good for event studies
-│   ├── ImputationDiD (BJS)    — most efficient under homogeneous effects
-│   ├── TwoStageDiD (Gardner)  — two-stage with GMM variance
-│   ├── StackedDiD (Stacked)   — sub-experiment approach
-│   └── EfficientDiD (EDiD)    — optimal weighting for tighter SEs
-│
-├── NO, simple 2x2 design:
-│   └── DifferenceInDifferences (DiD)
-│
-├── Few treated units (< 20)?
-│   ├── SyntheticDiD (SDiD)    — synthetic control + DiD hybrid
-│   └── TROP                   — triply robust with factor adjustment
-│
-├── Continuous treatment (doses)?
-│   └── ContinuousDiD (CDiD)
-│
-└── Two eligibility criteria?
-    └── TripleDifference (DDD)
+|-- YES: Do NOT use plain TWFE. Use one of:
+|   |-- CallawaySantAnna (CS)  -- most general, doubly robust, recommended default
+|   |-- SunAbraham (SA)        -- interaction-weighted, good for event studies
+|   |-- ImputationDiD (BJS)    -- most efficient under homogeneous effects
+|   |-- TwoStageDiD (Gardner)  -- two-stage with GMM variance
+|   |-- StackedDiD (Stacked)   -- sub-experiment approach
+|   \-- EfficientDiD (EDiD)    -- optimal weighting for tighter SEs
+|
+|-- NO, simple 2x2 design:
+|   \-- DifferenceInDifferences (DiD)
+|
+|-- Few treated units (< 20)?
+|   |-- SyntheticDiD (SDiD)    -- synthetic control + DiD hybrid
+|   \-- TROP                   -- triply robust with factor adjustment
+|
+|-- Continuous treatment (doses)?
+|   \-- ContinuousDiD (CDiD)
+|
+\-- Two eligibility criteria?
+    \-- TripleDifference (DDD)
 ```
 
 Always run BaconDecomposition first if using TWFE, to check for negative
@@ -243,8 +244,9 @@ This step is CRITICAL and most often skipped. Run at least one of:
 
 ### HonestDiD (Rambachan & Roth 2023) — recommended
 Bounds on the treatment effect under violations of parallel trends.
-Works with MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD,
-TwoStageDiD, StackedDiD, and EfficientDiD results.
+Works with MultiPeriodDiD and CallawaySantAnna results only. For CS,
+requires `aggregate='event_study'` or `aggregate='all'` so that event
+study effects are available.
 
 ```python
 from diff_diff import compute_honest_did
diff --git a/docs/llms.txt b/docs/llms.txt
@@ -10,7 +10,7 @@ diff-diff offers 14 estimators covering basic 2x2 DiD, modern staggered adoption
 - Source: https://github.com/igerber/diff-diff
 - Docs: https://diff-diff.readthedocs.io/en/stable/
 
-## Practitioner Workflow (Baker et al. 2025)
+## Practitioner Workflow (based on Baker et al. 2025)
 
 IMPORTANT: For rigorous DiD analysis, follow these 8 steps. Skipping
 diagnostic steps produces unreliable results.
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -1995,6 +1995,24 @@ ContinuousDiD, EfficientDiD):
 
 ---
 
+# Practitioner Guide
+
+The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al. (2025)
+"Difference-in-Differences Designs: A Practitioner's Guide" (arXiv:2503.13323), not a
+1:1 mapping of the paper's forward-engineering framework.
+
+- **Note:** Parallel trends testing is a separate step (Step 3) rather than embedded in
+  the identification assumptions step (paper's Step 2), to ensure AI agents execute it.
+- **Note:** Sources of uncertainty (paper's Step 4) is folded into the estimation step
+  (Step 5) with an explicit cluster-count check directive (>= 50 clusters for asymptotic
+  SEs, otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention, not
+  from the paper.
+- **Note:** Step 8 is "Robustness & Reporting" (compare estimators, report with/without
+  covariates). The paper's Step 8 is "Keep learning" (explore alternative designs). The
+  mandatory with/without covariate comparison is a diff-diff convention.
+
+---
+
 # Version History
 
 - **v1.2** (2026-03-24): Added Survey-Aware Bootstrap section (Phase 6)
diff --git a/tests/test_practitioner.py b/tests/test_practitioner.py
@@ -118,17 +118,16 @@ def mock_trop_results():
 @pytest.fixture
 def mock_efficient_results():
     r = EfficientDiDResults.__new__(EfficientDiDResults)
-    r.att = 0.6
-    r.se = 0.15
-    r.hausman_pretest = None
+    r.overall_att = 0.6
+    r.overall_se = 0.15
     return r
 
 
 @pytest.fixture
 def mock_continuous_results():
     r = ContinuousDiDResults.__new__(ContinuousDiDResults)
-    r.att = 0.4
-    r.se = 0.1
+    r.overall_att = 0.4
+    r.overall_se = 0.1
     return r
 
 
@@ -143,32 +142,32 @@ def mock_triple_results():
 @pytest.fixture
 def mock_sa_results():
     r = SunAbrahamResults.__new__(SunAbrahamResults)
-    r.att = 0.5
-    r.se = 0.1
+    r.overall_att = 0.5
+    r.overall_se = 0.1
     return r
 
 
 @pytest.fixture
 def mock_imputation_results():
     r = ImputationDiDResults.__new__(ImputationDiDResults)
-    r.att = 0.5
-    r.se = 0.1
+    r.overall_att = 0.5
+    r.overall_se = 0.1
     return r
 
 
 @pytest.fixture
 def mock_two_stage_results():
     r = TwoStageDiDResults.__new__(TwoStageDiDResults)
-    r.att = 0.5
-    r.se = 0.1
+    r.overall_att = 0.5
+    r.overall_se = 0.1
     return r
 
 
 @pytest.fixture
 def mock_stacked_results():
     r = StackedDiDResults.__new__(StackedDiDResults)
-    r.att = 0.5
-    r.se = 0.1
+    r.overall_att = 0.5
+    r.overall_se = 0.1
     return r