Align canonical step numbering and fix Bacon/EfficientDiD handlers

igerber · claude · igerber · commit 50bc3fcd913b · 2026-03-28T15:11:28.000-04:00
P1: Align all files to canonical 8-step numbering from llms.txt:
  1-Define, 2-Assumptions, 3-Test PT, 4-Choose estimator,
  5-Estimate (with cluster check), 6-Sensitivity, 7-Heterogeneity,
  8-Robustness. Moved PT testing from Step 2 into separate Step 3
  in llms-practitioner.txt, folded uncertainty into Step 5, updated
  baker_step values in practitioner.py (PT: 2->3, estimator: 3->4).

P2: Bacon handler now checks total_weight_later_vs_earlier > 0.01
  instead of negative weights (matches actual BaconDecompositionResults
  API). EfficientDiD snippet uses actual hausman_pretest() classmethod
  instead of nonexistent run_pretest=True parameter.

Tests: Updated step number assertions, added Bacon warning tests (2)
  and EfficientDiD handler tests (2). Suite now 30 tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py
@@ -136,7 +136,7 @@ def _step(
 # ---------------------------------------------------------------------------
 def _parallel_trends_step() -> Dict[str, Any]:
     return _step(
-        baker_step=2,
+        baker_step=3,
         label="Test parallel trends assumption",
         why=(
             "Parallel trends is the core identifying assumption. "
@@ -233,7 +233,7 @@ def _handle_did(results: Any):
         _parallel_trends_step(),
         _placebo_step(),
         _step(
-            baker_step=3,
+            baker_step=4,
             label="Check if data is actually staggered",
             why=(
                 "If treatment timing varies across units, basic DiD produces "
@@ -433,9 +433,10 @@ def _handle_efficient(results: Any):
                 "The Hausman pretest compares them — report which was selected."
             ),
             code=(
-                "# Hausman pretest is an estimator method, not a results attribute:\n"
-                "# edid = EfficientDiD()\n"
-                "# results = edid.fit(data, ..., run_pretest=True)"
+                "# Hausman pretest is a classmethod on the estimator:\n"
+                "from diff_diff import EfficientDiD\n"
+                "pretest = EfficientDiD.hausman_pretest(\n"
+                "    data, outcome='y', unit='id', time='t', first_treat='g')"
             ),
             step_name="heterogeneity",
         ),
@@ -503,7 +504,7 @@ def _handle_triple(results: Any):
 def _handle_bacon(results: Any):
     steps = [
         _step(
-            baker_step=3,
+            baker_step=4,
             label="Switch to heterogeneity-robust estimator",
             why=(
                 "Bacon decomposition is diagnostic, not an estimator. "
@@ -521,21 +522,14 @@ def _handle_bacon(results: Any):
         ),
     ]
     warnings = []
-    # Check for negative weights if the attribute exists
-    comparisons = getattr(results, "comparisons", None)
-    if comparisons is not None:
-        try:
-            has_negative = any(
-                getattr(c, "weight", 0) < 0 for c in comparisons
-            )
-            if has_negative:
-                warnings.append(
-                    "Negative weights detected in Bacon decomposition — "
-                    "TWFE estimate is contaminated by forbidden comparisons. "
-                    "Switch to a heterogeneity-robust estimator."
-                )
-        except (TypeError, AttributeError):
-            pass
+    # Check for forbidden comparisons (later vs earlier treated)
+    weight = getattr(results, "total_weight_later_vs_earlier", 0)
+    if isinstance(weight, (int, float)) and weight > 0.01:
+        warnings.append(
+            f"Forbidden comparisons (later vs earlier treated) carry "
+            f"{weight:.0%} of TWFE weight — TWFE estimate is contaminated. "
+            f"Switch to a heterogeneity-robust estimator."
+        )
     return steps, warnings
 
 
diff --git a/docs/llms-practitioner.txt b/docs/llms-practitioner.txt
@@ -75,7 +75,14 @@ Treatment does not affect outcomes before it is implemented. Violated when
 units adjust behavior in anticipation of future treatment. Set `anticipation=k`
 to allow k periods of anticipation.
 
-### Test parallel trends empirically:
+---
+
+## Step 3: Test Parallel Trends
+
+Test the parallel trends assumption empirically BEFORE estimation. This step
+is separated from Step 2 because it requires code execution, not just stating
+assumptions.
+
 ```python
 from diff_diff import check_parallel_trends, equivalence_test_trends
 
@@ -101,7 +108,7 @@ large a violation would need to be to overturn your results.
 
 ---
 
-## Step 3: Choose Estimation Method
+## Step 4: Choose Estimation Method
 
 Use this decision tree to select the appropriate estimator:
 
@@ -146,9 +153,9 @@ print(bacon_result.summary())
 
 ---
 
-## Step 4: Discuss Sources of Uncertainty
+## Step 5: Estimate
 
-Before estimating, you MUST check the cluster count and choose inference
+Before fitting, you MUST check the cluster count and choose inference
 accordingly. Do not assume — always print and decide based on the data.
 
 ```python
@@ -163,37 +170,7 @@ else:
     print(f"-> Only {n_clusters} clusters — use wild cluster bootstrap")
 ```
 
-- **Cluster standard errors** at the level of treatment assignment (e.g.,
-  state, county). Use the `cluster=` parameter.
-- **Few clusters (<50)?** Use wild cluster bootstrap:
-  `inference="wild_bootstrap", n_bootstrap=999`
-- **Survey data?** Wrap in `SurveyDesign()` for design-based inference with
-  stratification, PSU clustering, and finite population corrections.
-
-```python
-from diff_diff import CallawaySantAnna
-
-# Standard cluster-robust SEs (when n_clusters >= 50)
-cs = CallawaySantAnna(cluster='county_id')
-
-# Wild bootstrap for few clusters (when n_clusters < 50)
-from diff_diff import DifferenceInDifferences
-did = DifferenceInDifferences(
-    inference='wild_bootstrap', n_bootstrap=999, cluster='state_id'
-)
-
-# Survey design
-from diff_diff import SurveyDesign
-survey = SurveyDesign(weights='pw', strata='stratum', psu='psu_id')
-cs = CallawaySantAnna()
-results = cs.fit(data, ..., survey_design=survey)
-```
-
----
-
-## Step 5: Estimate
-
-Now run the estimator chosen in Step 3. Examples for common designs:
+Now run the estimator chosen in Step 4. Examples for common designs:
 
 ### Staggered adoption (recommended: Callaway-Sant'Anna)
 ```python
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -2001,15 +2001,18 @@ The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al.
 "Difference-in-Differences Designs: A Practitioner's Guide" (arXiv:2503.13323), not a
 1:1 mapping of the paper's forward-engineering framework.
 
-- **Note:** Parallel trends testing is a separate step (Step 3) rather than embedded in
-  the identification assumptions step (paper's Step 2), to ensure AI agents execute it.
-- **Note:** Sources of uncertainty (paper's Step 4) is folded into the estimation step
-  (Step 5) with an explicit cluster-count check directive (>= 50 clusters for asymptotic
-  SEs, otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention, not
-  from the paper.
+- **Note:** The diff-diff canonical numbering is: 1-Define, 2-Assumptions, 3-Test PT,
+  4-Choose estimator, 5-Estimate, 6-Sensitivity, 7-Heterogeneity, 8-Robustness.
+  Paper's numbering: 1-Define, 2-Assumptions, 3-Estimation method, 4-Uncertainty,
+  5-Estimate, 6-Sensitivity, 7-Heterogeneity, 8-Keep learning.
+- **Note:** Parallel trends testing is a separate Step 3 (paper embeds it in Step 2),
+  to ensure AI agents execute it as a distinct action.
+- **Note:** Sources of uncertainty (paper's Step 4) is folded into Step 5 (Estimate)
+  with an explicit cluster-count check directive (>= 50 clusters for asymptotic SEs,
+  otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention.
 - **Note:** Step 8 is "Robustness & Reporting" (compare estimators, report with/without
-  covariates). The paper's Step 8 is "Keep learning" (explore alternative designs). The
-  mandatory with/without covariate comparison is a diff-diff convention.
+  covariates). Paper's Step 8 is "Keep learning." The mandatory with/without covariate
+  comparison is a diff-diff convention.
 
 ---
 
diff --git a/tests/test_practitioner.py b/tests/test_practitioner.py
@@ -280,9 +280,9 @@ def test_filter_parallel_trends(self, cs_results):
             cs_results, completed_steps=["parallel_trends"], verbose=False
         )
         assert len(filtered["next_steps"]) < len(full["next_steps"])
-        # No step should have baker_step 2 about parallel trends
+        # No step should have baker_step 3 about parallel trends
         for s in filtered["next_steps"]:
-            if s["baker_step"] == 2:
+            if s["baker_step"] == 3:
                 assert "parallel trends" not in s["label"].lower()
 
     def test_filter_sensitivity(self, cs_results):
@@ -342,6 +342,49 @@ def test_nan_att_produces_warning(self):
         assert any("NaN" in w for w in output["warnings"])
 
 
+# ---------------------------------------------------------------------------
+# Tests: Bacon handler warnings
+# ---------------------------------------------------------------------------
+class TestBaconWarnings:
+    def test_forbidden_comparison_warning(self, bacon_results):
+        output = practitioner_next_steps(bacon_results, verbose=False)
+        # Real Bacon results from staggered data should have forbidden comparisons
+        weight = getattr(bacon_results, "total_weight_later_vs_earlier", 0)
+        if weight > 0.01:
+            assert any("contaminated" in w for w in output["warnings"])
+
+    def test_bacon_with_high_forbidden_weight(self):
+        """Mock Bacon results with high forbidden comparison weight."""
+        from diff_diff.bacon import BaconDecompositionResults
+
+        r = BaconDecompositionResults.__new__(BaconDecompositionResults)
+        r.overall_att = 0.5
+        r.total_weight_later_vs_earlier = 0.4
+        r.comparisons = []
+        output = practitioner_next_steps(r, verbose=False)
+        assert any("contaminated" in w for w in output["warnings"])
+        assert any("40%" in w for w in output["warnings"])
+
+
+# ---------------------------------------------------------------------------
+# Tests: EfficientDiD handler path
+# ---------------------------------------------------------------------------
+class TestEfficientDiDHandler:
+    def test_hausman_pretest_in_guidance(self, mock_efficient_results):
+        output = practitioner_next_steps(mock_efficient_results, verbose=False)
+        labels = [s["label"] for s in output["next_steps"]]
+        assert any("hausman" in lbl.lower() or "Hausman" in lbl for lbl in labels)
+
+    def test_hausman_snippet_uses_classmethod(self, mock_efficient_results):
+        output = practitioner_next_steps(mock_efficient_results, verbose=False)
+        hausman_steps = [
+            s for s in output["next_steps"]
+            if "hausman" in s["label"].lower() or "Hausman" in s["label"]
+        ]
+        assert len(hausman_steps) > 0
+        assert "hausman_pretest" in hausman_steps[0]["code"]
+
+
 # ---------------------------------------------------------------------------
 # Tests: unknown result type fallback
 # ---------------------------------------------------------------------------