Skip to content

Commit 6bdae48

Browse files
igerberclaude
andcommitted
Address thirteenth round of CI review findings on PR #318
- P1 CS ``not_yet_treated`` sample semantics: ``BusinessReport._extract_sample`` no longer maps ``CallawaySantAnnaResults.n_control_units`` to a generic ``n_control`` / "control" label when ``control_group= "not_yet_treated"``. That field counts only never-treated units (REGISTRY.md §CallawaySantAnna), while the actual comparison group in that mode is the dynamic not-yet-treated set at each (g, t) cell. New behavior: ``n_control`` is ``None`` for this mode, ``control_group`` and ``n_never_treated`` surface the real semantics in the schema, and both ``summary()`` and ``full_report ()`` describe the dynamic comparison group instead of misreporting a possibly-zero never-treated tally as "control". Default ``never_treated`` fits still render the fixed count unchanged. - P3 ``_pt_hausman`` remediation hint: skipped-Hausman reason now points to ``precomputed={'parallel_trends': ...}`` (the actual PT precomputed key) rather than the prior misleading ``'sensitivity'`` alias. - P3 source-of-truth wording: ``diagnostic_report.py`` module docstring, ``REPORTING.md``, and ``llms-full.txt`` all now say "no estimator fitting and no variance re-derivation" rather than "no new statistical computation", and explicitly name the raw- data utilities DR may call (``check_parallel_trends``, ``bacon_decompose``, ``EfficientDiD.hausman_pretest``) when the caller supplies panel + column kwargs. Report-layer aggregations remain enumerated in REPORTING.md. - P3 docs consistency: ``docs/api/business_report.rst`` and ``diff_diff/guides/llms-practitioner.txt`` now show the raw-data passthrough kwargs on ``BusinessReport(...)`` alongside the README pattern, with an explicit note that data-dependent checks are skipped otherwise. - Regressions: ``TestCSNotYetTreatedControlGroupSemantics`` covers both the ``not_yet_treated`` path (suppressed ``n_control``, ``control_group`` + ``n_never_treated`` populated, prose mentions "not-yet-treated" / "dynamic") and the default ``never_treated`` path (fixed count preserved). 134 targeted tests passing (BR + DR); guides fingerprint test still clean (18 ``test_guides`` tests pass, confirming the UTF-8 fingerprint in ``llms-full.txt`` remains intact after the prose edit); black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5ee5d37 commit 6bdae48

7 files changed

Lines changed: 163 additions & 20 deletions

File tree

diff_diff/business_report.py

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -470,10 +470,34 @@ def _extract_sample(self) -> Dict[str, Any]:
470470
"""Extract sample metadata from the fitted result."""
471471
r = self._results
472472
survey = self._extract_survey_block()
473+
n_treated = _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None)))
474+
n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None)))
475+
476+
# Control-group semantics. For estimators that expose a
477+
# ``control_group`` kwarg (CS, EfficientDiD), the meaning of
478+
# ``n_control_units`` depends on it. On CallawaySantAnna with
479+
# ``control_group="not_yet_treated"``, ``n_control_units`` counts
480+
# only the never-treated subset, so the actual dynamic
481+
# comparison group can be non-empty even when this count is 0.
482+
# Label the exposed count as never-treated and record the
483+
# active control-group mode so prose can surface the dynamic-
484+
# comparison context instead of misreporting "0 control"
485+
# (round-13 CI review on PR #318).
486+
control_group = getattr(r, "control_group", None)
487+
n_never_treated: Optional[int] = None
488+
n_control: Optional[int] = n_control_units
489+
if isinstance(control_group, str) and control_group == "not_yet_treated":
490+
n_never_treated = n_control_units
491+
# Do not populate a fixed ``n_control`` for this mode: the
492+
# comparison set is dynamic and varies by (g, t) cell.
493+
n_control = None
494+
473495
return {
474496
"n_obs": _safe_int(getattr(r, "n_obs", None)),
475-
"n_treated": _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))),
476-
"n_control": _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))),
497+
"n_treated": n_treated,
498+
"n_control": n_control,
499+
"n_never_treated": n_never_treated,
500+
"control_group": control_group if isinstance(control_group, str) else None,
477501
"n_periods": _safe_int(getattr(r, "n_periods", None)),
478502
"pre_periods": _safe_list_len(getattr(r, "pre_periods", None)),
479503
"post_periods": _safe_list_len(getattr(r, "post_periods", None)),
@@ -1369,21 +1393,32 @@ def _render_summary(schema: Dict[str, Any]) -> str:
13691393
f"pre-period variation."
13701394
)
13711395

1372-
# Sample sentence.
1396+
# Sample sentence. For CS ``control_group="not_yet_treated"`` the
1397+
# fixed control count is suppressed because the comparison group is
1398+
# dynamic; narrate the mode explicitly rather than misreporting a
1399+
# never-treated-only tally as "control" (round-13 CI review).
13731400
sample = schema.get("sample", {}) or {}
13741401
n_obs = sample.get("n_obs")
13751402
n_t = sample.get("n_treated")
13761403
n_c = sample.get("n_control")
1404+
n_nt = sample.get("n_never_treated")
1405+
control_mode = sample.get("control_group")
13771406
if isinstance(n_obs, int):
1378-
sentences.append(
1379-
f"Sample: {n_obs:,} observations"
1380-
+ (
1381-
f" ({n_t:,} treated, {n_c:,} control)"
1382-
if isinstance(n_t, int) and isinstance(n_c, int)
1407+
if isinstance(n_t, int) and isinstance(n_c, int):
1408+
sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).")
1409+
elif control_mode == "not_yet_treated" and isinstance(n_t, int):
1410+
extra = (
1411+
f"; {n_nt:,} never-treated units are also present"
1412+
if isinstance(n_nt, int) and n_nt > 0
13831413
else ""
13841414
)
1385-
+ "."
1386-
)
1415+
sentences.append(
1416+
f"Sample: {n_obs:,} observations ({n_t:,} treated) with a "
1417+
"dynamic not-yet-treated comparison group (the control set "
1418+
f"varies by cohort and period){extra}."
1419+
)
1420+
else:
1421+
sentences.append(f"Sample: {n_obs:,} observations.")
13871422
survey = sample.get("survey")
13881423
if survey and not survey.get("is_trivial"):
13891424
deff = survey.get("design_effect")
@@ -1507,8 +1542,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str:
15071542
lines.append(f"- Observations: {sample['n_obs']:,}")
15081543
if isinstance(sample.get("n_treated"), int):
15091544
lines.append(f"- Treated: {sample['n_treated']:,}")
1545+
# ``n_control`` is only populated for estimators whose control set
1546+
# is a fixed tally. For CS ``control_group="not_yet_treated"`` the
1547+
# comparison group is dynamic per (g, t); report the never-treated
1548+
# count (when non-zero) and the dynamic-comparison mode explicitly.
15101549
if isinstance(sample.get("n_control"), int):
15111550
lines.append(f"- Control: {sample['n_control']:,}")
1551+
elif sample.get("control_group") == "not_yet_treated":
1552+
if isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0:
1553+
lines.append(
1554+
f"- Never-treated units present in the panel: {sample['n_never_treated']:,}"
1555+
)
1556+
lines.append(
1557+
"- Comparison group: dynamic not-yet-treated units "
1558+
"(varies by cohort and period; no fixed control count)"
1559+
)
15121560
survey = sample.get("survey")
15131561
if survey:
15141562
if survey.get("is_trivial"):

diff_diff/diagnostic_report.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,15 @@
1010
1111
- No hard pass/fail gates. Severity is conveyed by natural-language phrasing,
1212
not a traffic-light enum. See ``docs/methodology/REPORTING.md``.
13-
- No new statistical computation. Every reported number is either read from
14-
``results`` or computed by an existing diff-diff utility function.
13+
- No estimator fitting and no variance re-derivation from raw data. Every
14+
effect, SE, p-value, CI, and sensitivity bound is either read from
15+
``results`` or produced by an existing diff-diff utility. May call
16+
``check_parallel_trends`` / ``bacon_decompose`` /
17+
``EfficientDiD.hausman_pretest`` when the caller supplies the panel +
18+
column kwargs. Report-layer cross-period aggregations (joint-Wald /
19+
Bonferroni pre-trends p-value, heterogeneity dispersion over
20+
post-treatment effects) are enumerated in
21+
``docs/methodology/REPORTING.md``.
1522
- Lazy evaluation. ``DiagnosticReport(results, ...)`` is free; ``run_all()``
1623
triggers compute and caches.
1724
- Never prove a null. Pre-trends phrasing uses power information from
@@ -1750,7 +1757,7 @@ def _pt_hausman(self) -> Dict[str, Any]:
17501757
"diagnose a different design than the estimate. "
17511758
"Rerun ``EfficientDiD.hausman_pretest(...)`` "
17521759
"manually with the original fit's kwargs or pass "
1753-
"``precomputed={'sensitivity': ...}`` if you have "
1760+
"``precomputed={'parallel_trends': ...}`` if you have "
17541761
"a pretest result."
17551762
),
17561763
}

diff_diff/guides/llms-full.txt

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1848,8 +1848,13 @@ Power tier (drives BR phrasing for the `no_detected_violation` verdict):
18481848

18491849
### Methodology notes
18501850

1851-
BR and DR perform no new statistical computation — every reported number
1852-
is read from the fitted result or computed by an existing diff-diff
1853-
utility. Both schemas are experimental in the current release; see
1854-
`docs/methodology/REPORTING.md` for phrasing rules, the no-traffic-light
1855-
decision, unit-translation policy, and schema stability policy.
1851+
BR and DR do no estimator fitting and do not re-derive variance from
1852+
raw data — every effect, SE, p-value, CI, and sensitivity bound is
1853+
read from the fitted result or produced by an existing diff-diff
1854+
utility (may call `check_parallel_trends`, `bacon_decompose`, or
1855+
`EfficientDiD.hausman_pretest` when the panel + column kwargs are
1856+
supplied). Report-layer cross-period aggregations are enumerated in
1857+
`docs/methodology/REPORTING.md`. Both schemas are experimental in the
1858+
current release; see that document for phrasing rules, the
1859+
no-traffic-light decision, unit-translation policy, and schema
1860+
stability policy.

diff_diff/guides/llms-practitioner.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,13 +457,21 @@ print(dr.summary()) # overall interpretation paragraph
457457
dr.to_dict() # AI-legible structured schema
458458

459459
# Or let BusinessReport auto-construct a DiagnosticReport and render the
460-
# full stakeholder narrative in one call:
460+
# full stakeholder narrative in one call. Pass ``data`` + the column
461+
# names so data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD
462+
# Hausman pretest) actually run — without them the auto path still
463+
# produces a report but skips those checks with an explicit reason.
461464
br = BusinessReport(
462465
cs_result,
463466
outcome_label='Revenue per user',
464467
outcome_unit='$',
465468
business_question='Did the campaign lift revenue?',
466469
treatment_label='the campaign',
470+
data=data,
471+
outcome='y',
472+
unit='id',
473+
time='t',
474+
first_treat='g',
467475
)
468476
print(br.summary()) # short paragraph block
469477
print(br.full_report()) # structured markdown

docs/api/business_report.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ to surface pre-trends, sensitivity, and other validity checks as part
1414
of the narrative. Pass ``auto_diagnostics=False`` to skip this, or
1515
``diagnostics=<DiagnosticReport>`` to supply an explicit one.
1616

17+
Data-dependent checks (2x2 parallel trends on simple DiD,
18+
Goodman-Bacon decomposition on staggered estimators, the EfficientDiD
19+
Hausman PT-All vs PT-Post pretest) require the raw panel + column
20+
names. Pass ``data``, ``outcome``, ``treatment``, ``unit``, ``time``,
21+
and/or ``first_treat`` to ``BusinessReport`` and they are forwarded
22+
to the auto-constructed ``DiagnosticReport``. Without these kwargs,
23+
those specific checks are skipped with an explicit reason while the
24+
rest of the report still renders.
25+
1726
Methodology deviations (no traffic-light gates, pre-trends verdict
1827
thresholds, power-aware phrasing, unit-translation policy, schema
1928
stability) are documented in :doc:`../methodology/REPORTING`.
@@ -35,6 +44,15 @@ Example
3544
outcome_unit="$",
3645
business_question="Did the loyalty program lift revenue?",
3746
treatment_label="the loyalty program",
47+
# Optional: panel + column names so auto diagnostics can run the
48+
# data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD
49+
# Hausman). Without these the auto path still runs and just
50+
# skips those checks.
51+
data=df,
52+
outcome="revenue",
53+
unit="store",
54+
time="period",
55+
first_treat="first_treat",
3856
)
3957
print(report.summary())
4058

docs/methodology/REPORTING.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@ SE, p-value, CI, and sensitivity bound is either read from the fitted
2121
result or produced by an existing diff-diff utility
2222
(`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`,
2323
`check_parallel_trends`, `compute_deff_diagnostics`,
24-
`compute_pretrends_power`). The report layer **does** compose a few
24+
`compute_pretrends_power`). When the caller passes the raw panel +
25+
column kwargs, `DiagnosticReport` may call those utilities on the
26+
supplied data (2x2 PT via `check_parallel_trends`, Goodman-Bacon
27+
decomposition via `bacon_decompose`, and the EfficientDiD Hausman
28+
PT-All vs PT-Post pretest via `EfficientDiD.hausman_pretest`). The report layer **does** compose a few
2529
cross-period summary statistics from per-period inputs already
2630
produced by the estimator — specifically the joint-Wald / Bonferroni
2731
pre-trends p-value from pre-period event-study coefficients (see

tests/test_business_report.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,59 @@ class _Result:
11001100
), f"cluster column must propagate from fit to Hausman pretest; got {captured}"
11011101

11021102

1103+
class TestCSNotYetTreatedControlGroupSemantics:
1104+
"""Round-13 P1 regression: ``BusinessReport`` must not relabel
1105+
``n_control_units`` as generic "control" for a
1106+
``CallawaySantAnna(control_group='not_yet_treated')`` fit — that
1107+
field counts only never-treated units, while the actual comparison
1108+
group is the dynamic not-yet-treated set at each (g, t) cell.
1109+
"""
1110+
1111+
def test_not_yet_treated_fit_does_not_render_misleading_control_count(self):
1112+
sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7)
1113+
# Fit with the dynamic not-yet-treated comparison mode.
1114+
cs = CallawaySantAnna(base_period="universal", control_group="not_yet_treated").fit(
1115+
sdf,
1116+
outcome="outcome",
1117+
unit="unit",
1118+
time="period",
1119+
first_treat="first_treat",
1120+
aggregate="event_study",
1121+
)
1122+
br = BusinessReport(cs, auto_diagnostics=False)
1123+
sample = br.to_dict()["sample"]
1124+
1125+
# Fixed ``n_control`` must NOT be populated — the comparison set
1126+
# is dynamic per (g, t), not a fixed unit tally.
1127+
assert (
1128+
sample["n_control"] is None
1129+
), f"n_control must be None for not_yet_treated; got {sample['n_control']}"
1130+
# The new fields surface the real semantics.
1131+
assert sample["control_group"] == "not_yet_treated"
1132+
assert sample["n_never_treated"] == getattr(cs, "n_control_units", None)
1133+
1134+
# Both summary and full_report must describe the dynamic
1135+
# comparison group rather than asserting a misleading "control"
1136+
# count.
1137+
summary = br.summary()
1138+
# No "(N treated, N control)" phrasing on this path.
1139+
assert " control)" not in summary
1140+
assert "not-yet-treated" in summary or "dynamic" in summary
1141+
1142+
full = br.full_report()
1143+
assert "- Control:" not in full or "not-yet-treated" in full
1144+
assert "dynamic not-yet-treated" in full or "not-yet-treated" in full
1145+
1146+
def test_never_treated_fit_still_shows_fixed_control_count(self, cs_fit):
1147+
"""Default path (``control_group='never_treated'``) keeps the
1148+
fixed ``n_control`` tally so existing prose is unchanged."""
1149+
fit, _ = cs_fit # default is never_treated
1150+
br = BusinessReport(fit, auto_diagnostics=False)
1151+
sample = br.to_dict()["sample"]
1152+
assert isinstance(sample["n_control"], int)
1153+
assert sample["control_group"] == "never_treated"
1154+
1155+
11031156
class TestBRDataKwargsPassthroughToAutoDR:
11041157
"""Round-12 regression: ``BusinessReport`` now accepts
11051158
``data`` / ``outcome`` / ``treatment`` / ``unit`` / ``time`` /

0 commit comments

Comments
 (0)