Skip to content

Commit d8b0f67

Browse files
authored
Merge pull request #420 from igerber/fix-audit-402
Address residual P2s+P3 from re-audit of PR #402
2 parents 7addf14 + 80cb9ae commit d8b0f67

4 files changed

Lines changed: 81 additions & 25 deletions

File tree

diff_diff/guides/llms-full.txt

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -636,21 +636,26 @@ had.fit(
636636
```python
637637
from diff_diff import HeterogeneousAdoptionDiD, did_had_pretest_workflow
638638

639-
# Vet the testable identifying assumptions first:
639+
# `aggregate="overall"` (the default) is two-period-only: did_had_pretest_workflow
640+
# reduces to a single first-difference and HeterogeneousAdoptionDiD.fit hard-rejects
641+
# panels with more than two periods. `aggregate="event_study"` requires a
642+
# multi-period panel. Use distinct data objects for the two regimes.
643+
644+
# Vet the testable identifying assumptions on the two-period panel first:
640645
report = did_had_pretest_workflow(
641-
data, outcome_col='y', unit_col='unit', time_col='t',
646+
data_2p, outcome_col='y', unit_col='unit', time_col='t',
642647
dose_col='d', first_treat_col='first_treat')
643648
print(report.summary())
644649

645-
# Single-period scalar WAS (aggregate="overall" default):
650+
# Single-period scalar WAS (aggregate="overall" default) on the two-period panel:
646651
est = HeterogeneousAdoptionDiD()
647-
results = est.fit(data, outcome_col='y', unit_col='unit',
652+
results = est.fit(data_2p, outcome_col='y', unit_col='unit',
648653
time_col='t', dose_col='d',
649654
first_treat_col='first_treat')
650655
print(results.summary())
651656

652-
# Multi-period per-horizon WAS:
653-
es = est.fit(data, outcome_col='y', unit_col='unit',
657+
# Multi-period per-horizon WAS on the multi-period panel:
658+
es = est.fit(data_mp, outcome_col='y', unit_col='unit',
654659
time_col='t', dose_col='d',
655660
first_treat_col='first_treat',
656661
aggregate='event_study')

diff_diff/practitioner.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -871,14 +871,18 @@ def _handle_had(results: Any):
871871
"workflow still sets pretrends_joint=None, all_pass=False, "
872872
"and a 'joint pre-trends skipped (no earlier pre-period)' "
873873
"verdict suffix - in that case step 2 stays uncovered "
874-
"even on the event-study path. On survey-weighted "
875-
"fits (survey_design= / survey= / weights=) the workflow "
876-
"skips QUG with a UserWarning (permanent Phase 4.5 C0 "
877-
"deferral - extreme order statistics are not smooth "
878-
"functionals of the empirical CDF) and returns a "
879-
"linearity-conditional verdict only - so step 1 coverage "
880-
"is unweighted-only and the reported verdict on weighted "
881-
"fits is conditional on QUG holding by assumption. "
874+
"even on the event-study path. On supported survey-weighted "
875+
"fits (pweight + PSU/FPC under survey_design= / survey= / "
876+
"weights=) the workflow skips QUG with a UserWarning "
877+
"(permanent Phase 4.5 C0 deferral - extreme order statistics "
878+
"are not smooth functionals of the empirical CDF) and returns "
879+
"a linearity-conditional verdict only - so step 1 coverage "
880+
"is unweighted-only and the reported verdict on supported "
881+
"weighted fits is conditional on QUG holding by assumption. "
882+
"Stratified (SurveyDesign(strata=...)) and replicate-weight "
883+
"(BRR/Fay/JK1/JKn/SDR) designs raise NotImplementedError on "
884+
"the linearity kernels and have no pretest workflow path "
885+
"yet - deferred to a follow-up. "
882886
"Assumptions 3 / 5 / 6 (uniform continuity at the "
883887
"boundary, Design 1 sign / WAS_d_lower identification) "
884888
"are NOT testable via pre-trends - the workflow vets only "
@@ -1024,16 +1028,22 @@ def _handle_had_event_study(results: Any):
10241028
"trends_lin=True where the consumed F-2 placebo gets "
10251029
"dropped), pretrends_joint=None, all_pass=False, and the "
10261030
"verdict carries 'joint pre-trends skipped (no earlier "
1027-
"pre-period)' - step 2 stays uncovered. On survey-weighted fits (survey_design= / survey= / "
1028-
"weights=) the workflow skips QUG with a UserWarning "
1029-
"(permanent Phase 4.5 C0 deferral) and returns a "
1030-
"linearity-conditional verdict only - so step 1 coverage "
1031-
"is unweighted-only on the event-study path too, and the "
1032-
"weighted verdict is conditional on QUG holding by "
1033-
"assumption. The joint Stute pre-trends and joint "
1031+
"pre-period)' - step 2 stays uncovered. On supported "
1032+
"survey-weighted fits (pweight + PSU/FPC under "
1033+
"survey_design= / survey= / weights=) the workflow skips "
1034+
"QUG with a UserWarning (permanent Phase 4.5 C0 deferral) "
1035+
"and returns a linearity-conditional verdict only - so "
1036+
"step 1 coverage is unweighted-only on the event-study "
1037+
"path too, and the weighted verdict is conditional on QUG "
1038+
"holding by assumption. Stratified (SurveyDesign("
1039+
"strata=...)) and replicate-weight (BRR/Fay/JK1/JKn/SDR) "
1040+
"designs raise NotImplementedError on the linearity "
1041+
"kernels and have no pretest workflow path on the "
1042+
"event-study path yet - deferred to a follow-up. "
1043+
"The joint Stute pre-trends and joint "
10341044
"homogeneity-linearity tests themselves remain available "
1035-
"under survey weighting via PSU-level Mammen multiplier "
1036-
"bootstrap."
1045+
"under supported survey weighting via PSU-level Mammen "
1046+
"multiplier bootstrap."
10371047
),
10381048
code=(
10391049
"from diff_diff import did_had_pretest_workflow\n"

docs/methodology/REGISTRY.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2552,8 +2552,8 @@ Shipped in `diff_diff/had_pretests.py` as `stute_joint_pretest()` (residuals-in
25522552
- [x] Phase 3: `did_had_pretest_workflow()` composite helper. Two-period `data`-only entry point (Phase 2a overall-path dispatch); reduces panel via `_aggregate_first_difference` and runs all three IMPLEMENTED tests at a shared `alpha`. `seed` forwards to `stute_test` only (QUG and Yatchew are deterministic). Returns `HADPretestReport` with priority-ordered verdict string. Because Phase 3 ships steps 1 + 3 of the paper's four-step workflow but **not** step 2 (Assumption 7 pre-trends test via Equation 18), the fail-to-reject verdict explicitly flags the Assumption 7 gap rather than claiming unconditional TWFE safety: `"QUG and linearity diagnostics fail-to-reject; Assumption 7 pre-trends test NOT run (paper step 2 deferred to Phase 3 follow-up)"`. Verdict priority follows the paper's one-way rule (TWFE admissible only if NO test rejects): **conclusive rejections are the primary verdict and are NEVER hidden by inconclusive status** — any unresolved-step note is appended via `"; additional steps unresolved: ..."` rather than replacing the rejection. The pure `"inconclusive - QUG NaN"` / `"inconclusive - both Stute and Yatchew linearity tests NaN"` forms only fire when NO conclusive test rejects AND a required step is unresolved. The partial-workflow fail-to-reject verdict may carry a `"(Yatchew NaN - skipped)"` (or Stute) suffix when one linearity test is NaN but the other is conclusive (step 3 resolved via the paper's "Stute OR Yatchew" wording). Bundled rejection-reason strings name each failed assumption in the conclusive-rejection case. `all_pass` is `True` iff QUG is conclusive AND at least one of Stute/Yatchew is conclusive AND no conclusive test rejects. **Non-negative-dose contract**: all three raw linearity helpers (`qug_test`, `stute_test`, `yatchew_hr_test`) raise a front-door `ValueError` on any `d < 0`, mirroring the `_validate_had_panel` guard (paper Section 2 HAD support restriction). Multi-period panels pre-slice to `(F-1, F)` before calling; joint-horizon dispatch deferred to Phase 3 follow-up.
25532553
- [ ] Phase 4: Pierce-Schott (2016) replication harness reproduces Figure 2 values.
25542554
- [ ] Phase 4: Full DGP 1/2/3 coverage-rate reproduction from Table 1.
2555-
- [x] Phase 5 (wave 1, PR #402): `practitioner_next_steps()` integration for HAD results - `_handle_had` and `_handle_had_event_study` route both result classes through HAD-specific Baker et al. (2025) step guidance with bidirectional HAD ↔ ContinuousDiD Step-4 routing closure. The `_check_nan_att` helper extends to ndarray `att` (HAD event-study) via `np.all(np.isnan(arr))` semantics; scalar path bit-exact preserved.
2556-
- [x] Phase 5 (wave 1, PR #402): `llms-full.txt` HeterogeneousAdoptionDiD section + result-class blocks + `## HAD Pretests` index + Choosing-an-Estimator row landed; constructor / fit() signatures match the real API (regression-tested via `inspect.signature`); result-class field tables enumerate every public dataclass field (regression-tested via `dataclasses.fields()`); `llms-practitioner.txt` Step 4 decision tree distinguishes ContinuousDiD (per-dose ATT(d), needs never-treated) from HeterogeneousAdoptionDiD (WAS, universal-rollout-compatible).
2555+
- [x] Phase 5 (wave 1, PR #402): `practitioner_next_steps()` integration for HAD results - `_handle_had` and `_handle_had_event_study` route both result classes through HAD-specific Baker et al. (2025) step guidance with bidirectional HAD ↔ ContinuousDiD Step-4 routing closure. The `_check_nan_att` helper extends to ndarray `att` (HAD event-study) via `np.all(np.isnan(arr))` semantics; scalar path bit-exact preserved. The `llms-full.txt` HAD section's documented constructor and `fit()` parameter lists are regression-locked against `inspect.signature(HeterogeneousAdoptionDiD.__init__)` and `HeterogeneousAdoptionDiD.fit` for parameter-name presence (defaults, type annotations, and return-type unions are not pinned by the current test).
2556+
- [x] Phase 5 (wave 1, PR #402): `llms-full.txt` HeterogeneousAdoptionDiD section + result-class blocks + `## HAD Pretests` index + Choosing-an-Estimator row landed; constructor / fit() parameter names are regression-locked against `inspect.signature(HeterogeneousAdoptionDiD.__init__)` and `HeterogeneousAdoptionDiD.fit` (parameter-name presence only - defaults, type annotations, and return-type unions are not pinned); result-class field tables enumerate every public dataclass field (regression-tested via `dataclasses.fields()`); `llms-practitioner.txt` Step 4 decision tree distinguishes ContinuousDiD (per-dose ATT(d), needs never-treated) from HeterogeneousAdoptionDiD (WAS, universal-rollout-compatible).
25572557
- [x] Phase 5 (partial): README catalog one-liner, bundled `llms.txt` `## Estimators` entry, `docs/api/had.rst` (autoclass for the three classes), and `docs/references.rst` citation landed in PR #372 docs refresh.
25582558
- [x] Phase 5 (wave 2 first slice, PR #409): T21 HAD pretest workflow tutorial (`docs/tutorials/21_had_pretest_workflow.ipynb`) — composite pre-test walkthrough for `did_had_pretest_workflow`. Uses a `Uniform[$0.01K, $50K]` dose-distribution variant of T20's brand-campaign panel (true support strictly positive but near-zero, chosen so QUG fails-to-reject `H0: d_lower = 0` in finite sample). Walks through `aggregate="overall"` (Steps 1 + 3 only, verdict explicitly flags Step 2 deferral) and upgrades to `aggregate="event_study"` (joint pre-trends Stute + joint homogeneity Stute close the gap). Side panel exercises both `yatchew_hr_test` null modes (`linearity` vs `mean_independence`). Companion drift-test file `tests/test_t21_had_pretest_workflow_drift.py` (16 tests pinning panel composition, both verdict pivots, structural anchors, deterministic stats, bootstrap p-value tolerance bands per backend, and `HAD(design="auto")` resolution to `continuous_at_zero` on this panel).
25592559
- [ ] Phase 5 (remaining): T22 weighted/survey HAD tutorial - tracked in `TODO.md`.

tests/test_practitioner.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,47 @@ def test_had_step_3_flags_qug_under_survey_deferral(
894894
"assumption."
895895
)
896896

897+
def test_had_step_3_qualifies_supported_survey_scope(
898+
self, mock_had_results, mock_had_event_study_results
899+
):
900+
# Per diff_diff/had_pretests.py:1725-1740 + :1927-1940, only
901+
# pweight + PSU/FPC survey designs are supported on HAD
902+
# pretests. Stratified (SurveyDesign(strata=...)) and
903+
# replicate-weight (BRR/Fay/JK1/JKn/SDR) designs raise
904+
# NotImplementedError on the linearity kernels. Both HAD
905+
# handlers' Step-3 text must call out the supported subset
906+
# and the deferred regimes so agents don't generate
907+
# `practitioner_next_steps` outputs that overstate what the
908+
# workflow will run on a given survey design.
909+
for fixture in (mock_had_results, mock_had_event_study_results):
910+
output = practitioner_next_steps(fixture, verbose=False)
911+
step_3_steps = [s for s in output["next_steps"] if s["baker_step"] == 3]
912+
assert len(step_3_steps) == 1
913+
text = step_3_steps[0].get("why", "").lower()
914+
# Supported subset must be named explicitly.
915+
assert "pweight" in text and "psu" in text and "fpc" in text, (
916+
"Step-3 text must name the supported survey-pretest scope "
917+
"(pweight + PSU/FPC) so agents do not assume any "
918+
"survey_design= path is supported."
919+
)
920+
# Deferred regimes must be flagged explicitly so agents
921+
# know not to attempt them.
922+
assert "stratif" in text, (
923+
"Step-3 text must explicitly note that stratified "
924+
"(SurveyDesign(strata=...)) survey designs are not yet "
925+
"supported on HAD pretests."
926+
)
927+
assert "replicate" in text, (
928+
"Step-3 text must explicitly note that replicate-weight "
929+
"(BRR/Fay/JK1/JKn/SDR) survey designs are not yet "
930+
"supported on HAD pretests."
931+
)
932+
assert "notimplementederror" in text, (
933+
"Step-3 text must name the actual exception raised "
934+
"(NotImplementedError) so agents can match it in "
935+
"error-handling paths."
936+
)
937+
897938
def test_had_step_3_pretest_assumption_labels_correct(self, mock_had_results):
898939
# Per docs/methodology/REGISTRY.md and diff_diff/had_pretests.py
899940
# docstrings:

0 commit comments

Comments
 (0)