Skip to content

Commit 50bc3fc

Browse files
igerber and claude committed
Align canonical step numbering and fix Bacon/EfficientDiD handlers
P1: Align all files to canonical 8-step numbering from llms.txt: 1-Define, 2-Assumptions, 3-Test PT, 4-Choose estimator, 5-Estimate (with cluster check), 6-Sensitivity, 7-Heterogeneity, 8-Robustness. Moved parallel trends (PT) testing from Step 2 into separate Step 3 in llms-practitioner.txt, folded uncertainty into Step 5, updated baker_step values in practitioner.py (PT: 2->3, estimator: 3->4).

P2: Bacon handler now checks total_weight_later_vs_earlier > 0.01 instead of negative weights (matches actual BaconDecompositionResults API). EfficientDiD snippet uses actual hausman_pretest() classmethod instead of nonexistent run_pretest=True parameter.

Tests: Updated step number assertions, added Bacon warning tests (2) and EfficientDiD handler tests (2). Suite now 30 tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f342542 commit 50bc3fc

4 files changed

Lines changed: 83 additions & 66 deletions

File tree

diff_diff/practitioner.py

Lines changed: 15 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ def _step(
136136
# ---------------------------------------------------------------------------
137137
def _parallel_trends_step() -> Dict[str, Any]:
138138
return _step(
139-
baker_step=2,
139+
baker_step=3,
140140
label="Test parallel trends assumption",
141141
why=(
142142
"Parallel trends is the core identifying assumption. "
@@ -233,7 +233,7 @@ def _handle_did(results: Any):
233233
_parallel_trends_step(),
234234
_placebo_step(),
235235
_step(
236-
baker_step=3,
236+
baker_step=4,
237237
label="Check if data is actually staggered",
238238
why=(
239239
"If treatment timing varies across units, basic DiD produces "
@@ -433,9 +433,10 @@ def _handle_efficient(results: Any):
433433
"The Hausman pretest compares them — report which was selected."
434434
),
435435
code=(
436-
"# Hausman pretest is an estimator method, not a results attribute:\n"
437-
"# edid = EfficientDiD()\n"
438-
"# results = edid.fit(data, ..., run_pretest=True)"
436+
"# Hausman pretest is a classmethod on the estimator:\n"
437+
"from diff_diff import EfficientDiD\n"
438+
"pretest = EfficientDiD.hausman_pretest(\n"
439+
" data, outcome='y', unit='id', time='t', first_treat='g')"
439440
),
440441
step_name="heterogeneity",
441442
),
@@ -503,7 +504,7 @@ def _handle_triple(results: Any):
503504
def _handle_bacon(results: Any):
504505
steps = [
505506
_step(
506-
baker_step=3,
507+
baker_step=4,
507508
label="Switch to heterogeneity-robust estimator",
508509
why=(
509510
"Bacon decomposition is diagnostic, not an estimator. "
@@ -521,21 +522,14 @@ def _handle_bacon(results: Any):
521522
),
522523
]
523524
warnings = []
524-
# Check for negative weights if the attribute exists
525-
comparisons = getattr(results, "comparisons", None)
526-
if comparisons is not None:
527-
try:
528-
has_negative = any(
529-
getattr(c, "weight", 0) < 0 for c in comparisons
530-
)
531-
if has_negative:
532-
warnings.append(
533-
"Negative weights detected in Bacon decomposition — "
534-
"TWFE estimate is contaminated by forbidden comparisons. "
535-
"Switch to a heterogeneity-robust estimator."
536-
)
537-
except (TypeError, AttributeError):
538-
pass
525+
# Check for forbidden comparisons (later vs earlier treated)
526+
weight = getattr(results, "total_weight_later_vs_earlier", 0)
527+
if isinstance(weight, (int, float)) and weight > 0.01:
528+
warnings.append(
529+
f"Forbidden comparisons (later vs earlier treated) carry "
530+
f"{weight:.0%} of TWFE weight — TWFE estimate is contaminated. "
531+
f"Switch to a heterogeneity-robust estimator."
532+
)
539533
return steps, warnings
540534

541535

docs/llms-practitioner.txt

Lines changed: 12 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,14 @@ Treatment does not affect outcomes before it is implemented. Violated when
7575
units adjust behavior in anticipation of future treatment. Set `anticipation=k`
7676
to allow k periods of anticipation.
7777

78-
### Test parallel trends empirically:
78+
---
79+
80+
## Step 3: Test Parallel Trends
81+
82+
Test the parallel trends assumption empirically BEFORE estimation. This step
83+
is separated from Step 2 because it requires code execution, not just stating
84+
assumptions.
85+
7986
```python
8087
from diff_diff import check_parallel_trends, equivalence_test_trends
8188

@@ -101,7 +108,7 @@ large a violation would need to be to overturn your results.
101108

102109
---
103110

104-
## Step 3: Choose Estimation Method
111+
## Step 4: Choose Estimation Method
105112

106113
Use this decision tree to select the appropriate estimator:
107114

@@ -146,9 +153,9 @@ print(bacon_result.summary())
146153

147154
---
148155

149-
## Step 4: Discuss Sources of Uncertainty
156+
## Step 5: Estimate
150157

151-
Before estimating, you MUST check the cluster count and choose inference
158+
Before fitting, you MUST check the cluster count and choose inference
152159
accordingly. Do not assume — always print and decide based on the data.
153160

154161
```python
@@ -163,37 +170,7 @@ else:
163170
print(f"-> Only {n_clusters} clusters — use wild cluster bootstrap")
164171
```
165172

166-
- **Cluster standard errors** at the level of treatment assignment (e.g.,
167-
state, county). Use the `cluster=` parameter.
168-
- **Few clusters (<50)?** Use wild cluster bootstrap:
169-
`inference="wild_bootstrap", n_bootstrap=999`
170-
- **Survey data?** Wrap in `SurveyDesign()` for design-based inference with
171-
stratification, PSU clustering, and finite population corrections.
172-
173-
```python
174-
from diff_diff import CallawaySantAnna
175-
176-
# Standard cluster-robust SEs (when n_clusters >= 50)
177-
cs = CallawaySantAnna(cluster='county_id')
178-
179-
# Wild bootstrap for few clusters (when n_clusters < 50)
180-
from diff_diff import DifferenceInDifferences
181-
did = DifferenceInDifferences(
182-
inference='wild_bootstrap', n_bootstrap=999, cluster='state_id'
183-
)
184-
185-
# Survey design
186-
from diff_diff import SurveyDesign
187-
survey = SurveyDesign(weights='pw', strata='stratum', psu='psu_id')
188-
cs = CallawaySantAnna()
189-
results = cs.fit(data, ..., survey_design=survey)
190-
```
191-
192-
---
193-
194-
## Step 5: Estimate
195-
196-
Now run the estimator chosen in Step 3. Examples for common designs:
173+
Now run the estimator chosen in Step 4. Examples for common designs:
197174

198175
### Staggered adoption (recommended: Callaway-Sant'Anna)
199176
```python

docs/methodology/REGISTRY.md

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2001,15 +2001,18 @@ The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al.
20012001
"Difference-in-Differences Designs: A Practitioner's Guide" (arXiv:2503.13323), not a
20022002
1:1 mapping of the paper's forward-engineering framework.
20032003

2004-
- **Note:** Parallel trends testing is a separate step (Step 3) rather than embedded in
2005-
the identification assumptions step (paper's Step 2), to ensure AI agents execute it.
2006-
- **Note:** Sources of uncertainty (paper's Step 4) is folded into the estimation step
2007-
(Step 5) with an explicit cluster-count check directive (>= 50 clusters for asymptotic
2008-
SEs, otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention, not
2009-
from the paper.
2004+
- **Note:** The diff-diff canonical numbering is: 1-Define, 2-Assumptions, 3-Test PT,
2005+
4-Choose estimator, 5-Estimate, 6-Sensitivity, 7-Heterogeneity, 8-Robustness.
2006+
Paper's numbering: 1-Define, 2-Assumptions, 3-Estimation method, 4-Uncertainty,
2007+
5-Estimate, 6-Sensitivity, 7-Heterogeneity, 8-Keep learning.
2008+
- **Note:** Parallel trends testing is a separate Step 3 (paper embeds it in Step 2),
2009+
to ensure AI agents execute it as a distinct action.
2010+
- **Note:** Sources of uncertainty (paper's Step 4) is folded into Step 5 (Estimate)
2011+
with an explicit cluster-count check directive (>= 50 clusters for asymptotic SEs,
2012+
otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention.
20102013
- **Note:** Step 8 is "Robustness & Reporting" (compare estimators, report with/without
2011-
covariates). The paper's Step 8 is "Keep learning" (explore alternative designs). The
2012-
mandatory with/without covariate comparison is a diff-diff convention.
2014+
covariates). Paper's Step 8 is "Keep learning." The mandatory with/without covariate
2015+
comparison is a diff-diff convention.
20132016

20142017
---
20152018

tests/test_practitioner.py

Lines changed: 45 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -280,9 +280,9 @@ def test_filter_parallel_trends(self, cs_results):
280280
cs_results, completed_steps=["parallel_trends"], verbose=False
281281
)
282282
assert len(filtered["next_steps"]) < len(full["next_steps"])
283-
# No step should have baker_step 2 about parallel trends
283+
# No step should have baker_step 3 about parallel trends
284284
for s in filtered["next_steps"]:
285-
if s["baker_step"] == 2:
285+
if s["baker_step"] == 3:
286286
assert "parallel trends" not in s["label"].lower()
287287

288288
def test_filter_sensitivity(self, cs_results):
@@ -342,6 +342,49 @@ def test_nan_att_produces_warning(self):
342342
assert any("NaN" in w for w in output["warnings"])
343343

344344

345+
# ---------------------------------------------------------------------------
346+
# Tests: Bacon handler warnings
347+
# ---------------------------------------------------------------------------
348+
class TestBaconWarnings:
349+
def test_forbidden_comparison_warning(self, bacon_results):
350+
output = practitioner_next_steps(bacon_results, verbose=False)
351+
# Real Bacon results from staggered data should have forbidden comparisons
352+
weight = getattr(bacon_results, "total_weight_later_vs_earlier", 0)
353+
if weight > 0.01:
354+
assert any("contaminated" in w for w in output["warnings"])
355+
356+
def test_bacon_with_high_forbidden_weight(self):
357+
"""Mock Bacon results with high forbidden comparison weight."""
358+
from diff_diff.bacon import BaconDecompositionResults
359+
360+
r = BaconDecompositionResults.__new__(BaconDecompositionResults)
361+
r.overall_att = 0.5
362+
r.total_weight_later_vs_earlier = 0.4
363+
r.comparisons = []
364+
output = practitioner_next_steps(r, verbose=False)
365+
assert any("contaminated" in w for w in output["warnings"])
366+
assert any("40%" in w for w in output["warnings"])
367+
368+
369+
# ---------------------------------------------------------------------------
370+
# Tests: EfficientDiD handler path
371+
# ---------------------------------------------------------------------------
372+
class TestEfficientDiDHandler:
373+
def test_hausman_pretest_in_guidance(self, mock_efficient_results):
374+
output = practitioner_next_steps(mock_efficient_results, verbose=False)
375+
labels = [s["label"] for s in output["next_steps"]]
376+
assert any("hausman" in lbl.lower() or "Hausman" in lbl for lbl in labels)
377+
378+
def test_hausman_snippet_uses_classmethod(self, mock_efficient_results):
379+
output = practitioner_next_steps(mock_efficient_results, verbose=False)
380+
hausman_steps = [
381+
s for s in output["next_steps"]
382+
if "hausman" in s["label"].lower() or "Hausman" in s["label"]
383+
]
384+
assert len(hausman_steps) > 0
385+
assert "hausman_pretest" in hausman_steps[0]["code"]
386+
387+
345388
# ---------------------------------------------------------------------------
346389
# Tests: unknown result type fallback
347390
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)