Skip to content

Commit f342542

Browse files
igerber and claude committed
Address AI review findings on practitioner guardrails
P1 fixes:
- Relabel workflow as "based on Baker et al. 2025" (not a 1:1 mapping)
- Add REGISTRY.md deviation notes for reorganized steps and diff-diff conventions
- Gate HonestDiD guidance to MultiPeriodDiD and CallawaySantAnna only (the only types supported by compute_honest_did)
- Add aggregate='event_study' requirement for CS + HonestDiD

P2 fixes:
- Fix attribute names: pre_treatment_rmse -> pre_treatment_fit, sub_experiments -> n_sub_experiments/stacked_data
- Fix EfficientDiD Hausman pretest (estimator method, not results attr)
- Fix equivalence_test_trends param: threshold -> equivalence_margin
- Fix _covariates_step snippet to note .att vs .overall_att difference
- Fix test mocks to use correct attributes (overall_att/overall_se for staggered result types)

P3 fix:
- Replace Unicode box-drawing chars with ASCII in decision tree

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a6b3d4a commit f342542

6 files changed

Lines changed: 104 additions & 72 deletions

File tree

diff_diff/practitioner.py

Lines changed: 43 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,8 @@ def _covariates_step() -> Dict[str, Any]:
217217
code=(
218218
"# Re-estimate without covariates and compare:\n"
219219
"result_no_cov = estimator.fit(data, ..., covariates=None)\n"
220-
"print(f'Without covariates: ATT={result_no_cov.att:.4f}')"
220+
"# Compare ATT with and without covariates.\n"
221+
"# Use .att (basic DiD) or .overall_att (staggered estimators)."
221222
),
222223
priority="medium",
223224
step_name="robustness",
@@ -265,7 +266,23 @@ def _handle_multi_period(results: Any):
265266
def _handle_cs(results: Any):
266267
steps = [
267268
_parallel_trends_step(),
268-
_honest_did_step(),
269+
_step(
270+
baker_step=6,
271+
label="Run HonestDiD sensitivity analysis",
272+
why=(
273+
"Bounds the treatment effect under plausible violations of "
274+
"parallel trends. Requires event study effects — refit with "
275+
"aggregate='event_study' or 'all' if not already done."
276+
),
277+
code=(
278+
"from diff_diff import compute_honest_did\n"
279+
"# CS results must have event_study_effects:\n"
280+
"results = cs.fit(data, ..., aggregate='event_study')\n"
281+
"honest = compute_honest_did(results, method='relative_magnitude', M=1.0)\n"
282+
"print(honest.summary())"
283+
),
284+
step_name="sensitivity",
285+
),
269286
_step(
270287
baker_step=7,
271288
label="Examine group and event study effects",
@@ -292,7 +309,7 @@ def _handle_cs(results: Any):
292309
def _handle_sa(results: Any):
293310
steps = [
294311
_parallel_trends_step(),
295-
_honest_did_step(),
312+
_placebo_step(),
296313
_robustness_compare_step("CS, BJS, or Gardner"),
297314
_covariates_step(),
298315
]
@@ -303,7 +320,7 @@ def _handle_sa(results: Any):
303320
def _handle_imputation(results: Any):
304321
steps = [
305322
_parallel_trends_step(),
306-
_honest_did_step(),
323+
_placebo_step(),
307324
_robustness_compare_step("CS, SA, or Gardner"),
308325
_covariates_step(),
309326
]
@@ -314,7 +331,7 @@ def _handle_imputation(results: Any):
314331
def _handle_two_stage(results: Any):
315332
steps = [
316333
_parallel_trends_step(),
317-
_honest_did_step(),
334+
_placebo_step(),
318335
_robustness_compare_step("CS, BJS, or SA"),
319336
_covariates_step(),
320337
]
@@ -325,15 +342,15 @@ def _handle_two_stage(results: Any):
325342
def _handle_stacked(results: Any):
326343
steps = [
327344
_parallel_trends_step(),
328-
_honest_did_step(),
345+
_placebo_step(),
329346
_step(
330347
baker_step=7,
331348
label="Check sub-experiment balance",
332349
why=(
333350
"Stacked DiD constructs sub-experiments for each cohort. "
334351
"Verify that each sub-experiment has sufficient controls."
335352
),
336-
code="# Inspect results.sub_experiments for balance",
353+
code="# Check results.n_sub_experiments and inspect results.stacked_data",
337354
priority="medium",
338355
step_name="heterogeneity",
339356
),
@@ -354,8 +371,8 @@ def _handle_synthetic(results: Any):
354371
"approximate the counterfactual well."
355372
),
356373
code=(
357-
"# Check pre-treatment RMSE and unit weight concentration:\n"
358-
"print(f'Pre-treatment RMSE: {results.pre_treatment_rmse:.4f}')\n"
374+
"# Check pre-treatment fit and unit weight concentration:\n"
375+
"print(f'Pre-treatment fit (RMSE): {results.pre_treatment_fit:.4f}')\n"
359376
"# Highly concentrated weights suggest fragile estimates"
360377
),
361378
step_name="sensitivity",
@@ -407,28 +424,24 @@ def _handle_trop(results: Any):
407424
def _handle_efficient(results: Any):
408425
steps = [
409426
_parallel_trends_step(),
410-
_honest_did_step(),
427+
_placebo_step(),
428+
_step(
429+
baker_step=7,
430+
label="Run Hausman pretest (PT-All vs PT-Post)",
431+
why=(
432+
"EfficientDiD supports both PT-All and PT-Post assumptions. "
433+
"The Hausman pretest compares them — report which was selected."
434+
),
435+
code=(
436+
"# Hausman pretest is an estimator method, not a results attribute:\n"
437+
"# edid = EfficientDiD()\n"
438+
"# results = edid.fit(data, ..., run_pretest=True)"
439+
),
440+
step_name="heterogeneity",
441+
),
442+
_robustness_compare_step("CS, SA, or BJS"),
443+
_covariates_step(),
411444
]
412-
# Check for Hausman pretest
413-
hausman = getattr(results, "hausman_pretest", None)
414-
if hausman is not None:
415-
steps.append(
416-
_step(
417-
baker_step=7,
418-
label="Report Hausman pretest result",
419-
why=(
420-
"The Hausman pretest compares PT-All vs PT-Post "
421-
"assumptions. Report which was selected and why."
422-
),
423-
code=(
424-
f"# Hausman test p-value: {getattr(hausman, 'p_value', 'N/A')}\n"
425-
f"# Recommendation: {getattr(hausman, 'recommendation', 'N/A')}"
426-
),
427-
step_name="heterogeneity",
428-
)
429-
)
430-
steps.append(_robustness_compare_step("CS, SA, or BJS"))
431-
steps.append(_covariates_step())
432445
warnings = _check_nan_att(results)
433446
return steps, warnings
434447

docs/llms-full.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ print(f"ATT: {results.att:.3f} (SE: {results.se:.3f})")
3131
- **Results objects**: Rich dataclass containers with `summary()`, `to_dict()`, `to_dataframe()`.
3232
- **Estimator aliases**: Short names available (e.g., `DiD`, `CS`, `SA`, `BJS`, `Gardner`, `SDiD`, `TWFE`, `DDD`, `CDiD`, `EDiD`, `Stacked`, `Bacon`).
3333

34-
## Practitioner Workflow (Baker et al. 2025)
34+
## Practitioner Workflow (based on Baker et al. 2025)
3535

3636
For rigorous DiD analysis, follow the 8-step framework in docs/llms-practitioner.txt.
3737
After estimation, call:

docs/llms-practitioner.txt

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
# diff-diff Practitioner Guide (Baker et al. 2025)
1+
# diff-diff Practitioner Guide
22

3-
> This guide maps Baker et al. (2025) "Difference-in-Differences Designs: A
4-
> Practitioner's Guide" to concrete diff-diff API calls. It ensures rigorous
5-
> causal inference by following an 8-step empiricist workflow.
3+
> An 8-step workflow for rigorous Difference-in-Differences analysis, based on
4+
> Baker et al. (2025) "Difference-in-Differences Designs: A Practitioner's
5+
> Guide" and adapted for the diff-diff library. Some steps are reorganized or
6+
> extended relative to the paper (see docs/methodology/REGISTRY.md for details).
67

78
## Instructions for AI Agents
89

@@ -90,7 +91,7 @@ print(f"Difference p-value: {pt_result['p_value']:.4f}")
9091
# Equivalence test (TOST) — tests that trends are meaningfully similar
9192
equiv = equivalence_test_trends(
9293
data, outcome='y', time='period', treatment_group='treated',
93-
pre_periods=[1, 2, 3], threshold=0.5
94+
pre_periods=[1, 2, 3], equivalence_margin=0.5
9495
)
9596
```
9697

@@ -106,26 +107,26 @@ Use this decision tree to select the appropriate estimator:
106107

107108
```
108109
Is treatment adoption staggered (multiple cohorts, different timing)?
109-
├── YES: Do NOT use plain TWFE. Use one of:
110-
├── CallawaySantAnna (CS) most general, doubly robust, recommended default
111-
├── SunAbraham (SA) interaction-weighted, good for event studies
112-
├── ImputationDiD (BJS) most efficient under homogeneous effects
113-
├── TwoStageDiD (Gardner) two-stage with GMM variance
114-
├── StackedDiD (Stacked) sub-experiment approach
115-
└── EfficientDiD (EDiD) — optimal weighting for tighter SEs
116-
117-
├── NO, simple 2x2 design:
118-
└── DifferenceInDifferences (DiD)
119-
120-
├── Few treated units (< 20)?
121-
├── SyntheticDiD (SDiD) synthetic control + DiD hybrid
122-
└── TROP triply robust with factor adjustment
123-
124-
├── Continuous treatment (doses)?
125-
└── ContinuousDiD (CDiD)
126-
127-
└── Two eligibility criteria?
128-
└── TripleDifference (DDD)
110+
|-- YES: Do NOT use plain TWFE. Use one of:
111+
| |-- CallawaySantAnna (CS) -- most general, doubly robust, recommended default
112+
| |-- SunAbraham (SA) -- interaction-weighted, good for event studies
113+
| |-- ImputationDiD (BJS) -- most efficient under homogeneous effects
114+
| |-- TwoStageDiD (Gardner) -- two-stage with GMM variance
115+
| |-- StackedDiD (Stacked) -- sub-experiment approach
116+
| \-- EfficientDiD (EDiD) -- optimal weighting for tighter SEs
117+
|
118+
|-- NO, simple 2x2 design:
119+
| \-- DifferenceInDifferences (DiD)
120+
|
121+
|-- Few treated units (< 20)?
122+
| |-- SyntheticDiD (SDiD) -- synthetic control + DiD hybrid
123+
| \-- TROP -- triply robust with factor adjustment
124+
|
125+
|-- Continuous treatment (doses)?
126+
| \-- ContinuousDiD (CDiD)
127+
|
128+
\-- Two eligibility criteria?
129+
\-- TripleDifference (DDD)
129130
```
130131

131132
Always run BaconDecomposition first if using TWFE, to check for negative
@@ -243,8 +244,9 @@ This step is CRITICAL and most often skipped. Run at least one of:
243244

244245
### HonestDiD (Rambachan & Roth 2023) — recommended
245246
Bounds on the treatment effect under violations of parallel trends.
246-
Works with MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD,
247-
TwoStageDiD, StackedDiD, and EfficientDiD results.
247+
Works with MultiPeriodDiD and CallawaySantAnna results only. For CS,
248+
requires `aggregate='event_study'` or `aggregate='all'` so that event
249+
study effects are available.
248250

249251
```python
250252
from diff_diff import compute_honest_did

docs/llms.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ diff-diff offers 14 estimators covering basic 2x2 DiD, modern staggered adoption
1010
- Source: https://github.com/igerber/diff-diff
1111
- Docs: https://diff-diff.readthedocs.io/en/stable/
1212

13-
## Practitioner Workflow (Baker et al. 2025)
13+
## Practitioner Workflow (based on Baker et al. 2025)
1414

1515
IMPORTANT: For rigorous DiD analysis, follow these 8 steps. Skipping
1616
diagnostic steps produces unreliable results.

docs/methodology/REGISTRY.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1995,6 +1995,24 @@ ContinuousDiD, EfficientDiD):
19951995

19961996
---
19971997

1998+
# Practitioner Guide
1999+
2000+
The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al. (2025)
2001+
"Difference-in-Differences Designs: A Practitioner's Guide" (arXiv:2503.13323), not a
2002+
1:1 mapping of the paper's forward-engineering framework.
2003+
2004+
- **Note:** Parallel trends testing is a separate step (Step 3) rather than embedded in
2005+
the identification assumptions step (paper's Step 2), to ensure AI agents execute it.
2006+
- **Note:** Sources of uncertainty (paper's Step 4) is folded into the estimation step
2007+
(Step 5) with an explicit cluster-count check directive (>= 50 clusters for asymptotic
2008+
SEs, otherwise wild bootstrap). The 50-cluster threshold is a diff-diff convention, not
2009+
from the paper.
2010+
- **Note:** Step 8 is "Robustness & Reporting" (compare estimators, report with/without
2011+
covariates). The paper's Step 8 is "Keep learning" (explore alternative designs). The
2012+
mandatory with/without covariate comparison is a diff-diff convention.
2013+
2014+
---
2015+
19982016
# Version History
19992017

20002018
- **v1.2** (2026-03-24): Added Survey-Aware Bootstrap section (Phase 6)

tests/test_practitioner.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -118,17 +118,16 @@ def mock_trop_results():
118118
@pytest.fixture
119119
def mock_efficient_results():
120120
r = EfficientDiDResults.__new__(EfficientDiDResults)
121-
r.att = 0.6
122-
r.se = 0.15
123-
r.hausman_pretest = None
121+
r.overall_att = 0.6
122+
r.overall_se = 0.15
124123
return r
125124

126125

127126
@pytest.fixture
128127
def mock_continuous_results():
129128
r = ContinuousDiDResults.__new__(ContinuousDiDResults)
130-
r.att = 0.4
131-
r.se = 0.1
129+
r.overall_att = 0.4
130+
r.overall_se = 0.1
132131
return r
133132

134133

@@ -143,32 +142,32 @@ def mock_triple_results():
143142
@pytest.fixture
144143
def mock_sa_results():
145144
r = SunAbrahamResults.__new__(SunAbrahamResults)
146-
r.att = 0.5
147-
r.se = 0.1
145+
r.overall_att = 0.5
146+
r.overall_se = 0.1
148147
return r
149148

150149

151150
@pytest.fixture
152151
def mock_imputation_results():
153152
r = ImputationDiDResults.__new__(ImputationDiDResults)
154-
r.att = 0.5
155-
r.se = 0.1
153+
r.overall_att = 0.5
154+
r.overall_se = 0.1
156155
return r
157156

158157

159158
@pytest.fixture
160159
def mock_two_stage_results():
161160
r = TwoStageDiDResults.__new__(TwoStageDiDResults)
162-
r.att = 0.5
163-
r.se = 0.1
161+
r.overall_att = 0.5
162+
r.overall_se = 0.1
164163
return r
165164

166165

167166
@pytest.fixture
168167
def mock_stacked_results():
169168
r = StackedDiDResults.__new__(StackedDiDResults)
170-
r.att = 0.5
171-
r.se = 0.1
169+
r.overall_att = 0.5
170+
r.overall_se = 0.1
172171
return r
173172

174173

0 commit comments

Comments
 (0)