Address thirteenth round of CI review findings on PR #318

igerber · claude · igerber · commit 6bdae48a5502 · 2026-04-18T21:07:56.000-04:00
- P1 CS ``not_yet_treated`` sample semantics:
  ``BusinessReport._extract_sample`` no longer maps
  ``CallawaySantAnnaResults.n_control_units`` to a generic
  ``n_control`` / "control" label when
  ``control_group="not_yet_treated"``. That field counts only
  never-treated units (REGISTRY.md §CallawaySantAnna), while the
  actual comparison group in that mode is the dynamic not-yet-treated
  set at each (g, t) cell. New behavior: ``n_control`` is ``None`` for
  this mode, ``control_group`` and ``n_never_treated`` surface the
  real semantics in the schema, and both ``summary()`` and
  ``full_report()`` describe the dynamic comparison group instead of
  misreporting a possibly-zero never-treated tally as "control".
  Default ``never_treated`` fits still render the fixed count
  unchanged.
- P3 ``_pt_hausman`` remediation hint: skipped-Hausman reason now
  points to ``precomputed={'parallel_trends': ...}`` (the actual
  PT precomputed key) rather than the prior misleading
  ``'sensitivity'`` alias.
- P3 source-of-truth wording: ``diagnostic_report.py`` module
  docstring, ``REPORTING.md``, and ``llms-full.txt`` all now say
  "no estimator fitting and no variance re-derivation" rather than
  "no new statistical computation", and explicitly name the
  raw-data utilities DR may call (``check_parallel_trends``,
  ``bacon_decompose``, ``EfficientDiD.hausman_pretest``) when the
  caller supplies panel + column kwargs. Report-layer aggregations
  remain enumerated in REPORTING.md.
- P3 docs consistency: ``docs/api/business_report.rst`` and
  ``diff_diff/guides/llms-practitioner.txt`` now show the raw-data
  passthrough kwargs on ``BusinessReport(...)`` alongside the
  README pattern, with an explicit note that data-dependent checks
  are skipped otherwise.
- Regressions: ``TestCSNotYetTreatedControlGroupSemantics`` covers
  both the ``not_yet_treated`` path (suppressed ``n_control``,
  ``control_group`` + ``n_never_treated`` populated, prose mentions
  "not-yet-treated" / "dynamic") and the default ``never_treated``
  path (fixed count preserved).

134 targeted tests passing (BR + DR); guides fingerprint test
still clean (18 ``test_guides`` tests pass, confirming the UTF-8
fingerprint in ``llms-full.txt`` remains intact after the prose
edit); black / ruff / mypy clean on BR/DR modules.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py
@@ -470,10 +470,34 @@ def _extract_sample(self) -> Dict[str, Any]:
         """Extract sample metadata from the fitted result."""
         r = self._results
         survey = self._extract_survey_block()
+        n_treated = _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None)))
+        n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None)))
+
+        # Control-group semantics. For estimators that expose a
+        # ``control_group`` kwarg (CS, EfficientDiD), the meaning of
+        # ``n_control_units`` depends on it. On CallawaySantAnna with
+        # ``control_group="not_yet_treated"``, ``n_control_units`` counts
+        # only the never-treated subset, so the actual dynamic
+        # comparison group can be non-empty even when this count is 0.
+        # Label the exposed count as never-treated and record the
+        # active control-group mode so prose can surface the dynamic-
+        # comparison context instead of misreporting "0 control"
+        # (round-13 CI review on PR #318).
+        control_group = getattr(r, "control_group", None)
+        n_never_treated: Optional[int] = None
+        n_control: Optional[int] = n_control_units
+        if isinstance(control_group, str) and control_group == "not_yet_treated":
+            n_never_treated = n_control_units
+            # Do not populate a fixed ``n_control`` for this mode: the
+            # comparison set is dynamic and varies by (g, t) cell.
+            n_control = None
+
         return {
             "n_obs": _safe_int(getattr(r, "n_obs", None)),
-            "n_treated": _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))),
-            "n_control": _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))),
+            "n_treated": n_treated,
+            "n_control": n_control,
+            "n_never_treated": n_never_treated,
+            "control_group": control_group if isinstance(control_group, str) else None,
             "n_periods": _safe_int(getattr(r, "n_periods", None)),
             "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)),
             "post_periods": _safe_list_len(getattr(r, "post_periods", None)),
@@ -1369,21 +1393,32 @@ def _render_summary(schema: Dict[str, Any]) -> str:
                 f"pre-period variation."
             )
 
-    # Sample sentence.
+    # Sample sentence. For CS ``control_group="not_yet_treated"`` the
+    # fixed control count is suppressed because the comparison group is
+    # dynamic; narrate the mode explicitly rather than misreporting a
+    # never-treated-only tally as "control" (round-13 CI review).
     sample = schema.get("sample", {}) or {}
     n_obs = sample.get("n_obs")
     n_t = sample.get("n_treated")
     n_c = sample.get("n_control")
+    n_nt = sample.get("n_never_treated")
+    control_mode = sample.get("control_group")
     if isinstance(n_obs, int):
-        sentences.append(
-            f"Sample: {n_obs:,} observations"
-            + (
-                f" ({n_t:,} treated, {n_c:,} control)"
-                if isinstance(n_t, int) and isinstance(n_c, int)
+        if isinstance(n_t, int) and isinstance(n_c, int):
+            sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).")
+        elif control_mode == "not_yet_treated" and isinstance(n_t, int):
+            extra = (
+                f"; {n_nt:,} never-treated units are also present"
+                if isinstance(n_nt, int) and n_nt > 0
                 else ""
             )
-            + "."
-        )
+            sentences.append(
+                f"Sample: {n_obs:,} observations ({n_t:,} treated) with a "
+                "dynamic not-yet-treated comparison group (the control set "
+                f"varies by cohort and period){extra}."
+            )
+        else:
+            sentences.append(f"Sample: {n_obs:,} observations.")
         survey = sample.get("survey")
         if survey and not survey.get("is_trivial"):
             deff = survey.get("design_effect")
@@ -1507,8 +1542,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str:
         lines.append(f"- Observations: {sample['n_obs']:,}")
     if isinstance(sample.get("n_treated"), int):
         lines.append(f"- Treated: {sample['n_treated']:,}")
+    # ``n_control`` is only populated for estimators whose control set
+    # is a fixed tally. For CS ``control_group="not_yet_treated"`` the
+    # comparison group is dynamic per (g, t); report the never-treated
+    # count (when non-zero) and the dynamic-comparison mode explicitly.
     if isinstance(sample.get("n_control"), int):
         lines.append(f"- Control: {sample['n_control']:,}")
+    elif sample.get("control_group") == "not_yet_treated":
+        if isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0:
+            lines.append(
+                f"- Never-treated units present in the panel: {sample['n_never_treated']:,}"
+            )
+        lines.append(
+            "- Comparison group: dynamic not-yet-treated units "
+            "(varies by cohort and period; no fixed control count)"
+        )
     survey = sample.get("survey")
     if survey:
         if survey.get("is_trivial"):
diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py
@@ -10,8 +10,15 @@
 
 - No hard pass/fail gates. Severity is conveyed by natural-language phrasing,
   not a traffic-light enum. See ``docs/methodology/REPORTING.md``.
-- No new statistical computation. Every reported number is either read from
-  ``results`` or computed by an existing diff-diff utility function.
+- No estimator fitting and no variance re-derivation from raw data. Every
+  effect, SE, p-value, CI, and sensitivity bound is either read from
+  ``results`` or produced by an existing diff-diff utility. May call
+  ``check_parallel_trends`` / ``bacon_decompose`` /
+  ``EfficientDiD.hausman_pretest`` when the caller supplies the panel +
+  column kwargs. Report-layer cross-period aggregations (joint-Wald /
+  Bonferroni pre-trends p-value, heterogeneity dispersion over
+  post-treatment effects) are enumerated in
+  ``docs/methodology/REPORTING.md``.
 - Lazy evaluation. ``DiagnosticReport(results, ...)`` is free; ``run_all()``
   triggers compute and caches.
 - Never prove a null. Pre-trends phrasing uses power information from
@@ -1750,7 +1757,7 @@ def _pt_hausman(self) -> Dict[str, Any]:
                     "diagnose a different design than the estimate. "
                     "Rerun ``EfficientDiD.hausman_pretest(...)`` "
                     "manually with the original fit's kwargs or pass "
-                    "``precomputed={'sensitivity': ...}`` if you have "
+                    "``precomputed={'parallel_trends': ...}`` if you have "
                     "a pretest result."
                 ),
             }
diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt
@@ -1848,8 +1848,13 @@ Power tier (drives BR phrasing for the `no_detected_violation` verdict):
 
 ### Methodology notes
 
-BR and DR perform no new statistical computation — every reported number
-is read from the fitted result or computed by an existing diff-diff
-utility. Both schemas are experimental in the current release; see
-`docs/methodology/REPORTING.md` for phrasing rules, the no-traffic-light
-decision, unit-translation policy, and schema stability policy.
+BR and DR do no estimator fitting and do not re-derive variance from
+raw data — every effect, SE, p-value, CI, and sensitivity bound is
+read from the fitted result or produced by an existing diff-diff
+utility (may call `check_parallel_trends`, `bacon_decompose`, or
+`EfficientDiD.hausman_pretest` when the panel + column kwargs are
+supplied). Report-layer cross-period aggregations are enumerated in
+`docs/methodology/REPORTING.md`. Both schemas are experimental in the
+current release; see that document for phrasing rules, the
+no-traffic-light decision, unit-translation policy, and schema
+stability policy.
diff --git a/diff_diff/guides/llms-practitioner.txt b/diff_diff/guides/llms-practitioner.txt
@@ -457,13 +457,21 @@ print(dr.summary())           # overall interpretation paragraph
 dr.to_dict()                  # AI-legible structured schema
 
 # Or let BusinessReport auto-construct a DiagnosticReport and render the
-# full stakeholder narrative in one call:
+# full stakeholder narrative in one call. Pass ``data`` + the column
+# names so data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD
+# Hausman pretest) actually run — without them the auto path still
+# produces a report but skips those checks with an explicit reason.
 br = BusinessReport(
     cs_result,
     outcome_label='Revenue per user',
     outcome_unit='$',
     business_question='Did the campaign lift revenue?',
     treatment_label='the campaign',
+    data=data,
+    outcome='y',
+    unit='id',
+    time='t',
+    first_treat='g',
 )
 print(br.summary())           # short paragraph block
 print(br.full_report())       # structured markdown
diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst
@@ -14,6 +14,15 @@ to surface pre-trends, sensitivity, and other validity checks as part
 of the narrative. Pass ``auto_diagnostics=False`` to skip this, or
 ``diagnostics=<DiagnosticReport>`` to supply an explicit one.
 
+Data-dependent checks (2x2 parallel trends on simple DiD,
+Goodman-Bacon decomposition on staggered estimators, the EfficientDiD
+Hausman PT-All vs PT-Post pretest) require the raw panel + column
+names. Pass ``data``, ``outcome``, ``treatment``, ``unit``, ``time``,
+and/or ``first_treat`` to ``BusinessReport`` and they are forwarded
+to the auto-constructed ``DiagnosticReport``. Without these kwargs,
+those specific checks are skipped with an explicit reason while the
+rest of the report still renders.
+
 Methodology deviations (no traffic-light gates, pre-trends verdict
 thresholds, power-aware phrasing, unit-translation policy, schema
 stability) are documented in :doc:`../methodology/REPORTING`.
@@ -35,6 +44,15 @@ Example
        outcome_unit="$",
        business_question="Did the loyalty program lift revenue?",
        treatment_label="the loyalty program",
+       # Optional: panel + column names so auto diagnostics can run the
+       # data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD
+       # Hausman). Without these the auto path still runs and just
+       # skips those checks.
+       data=df,
+       outcome="revenue",
+       unit="store",
+       time="period",
+       first_treat="first_treat",
    )
    print(report.summary())
 
diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md
@@ -21,7 +21,11 @@ SE, p-value, CI, and sensitivity bound is either read from the fitted
 result or produced by an existing diff-diff utility
 (`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`,
 `check_parallel_trends`, `compute_deff_diagnostics`,
-`compute_pretrends_power`). The report layer **does** compose a few
+`compute_pretrends_power`). When the caller passes the raw panel +
+column kwargs, `DiagnosticReport` may call those utilities on the
+supplied data (2x2 PT via `check_parallel_trends`, Goodman-Bacon
+decomposition via `bacon_decompose`, and the EfficientDiD Hausman
+PT-All vs PT-Post pretest via `EfficientDiD.hausman_pretest`). The report layer **does** compose a few
 cross-period summary statistics from per-period inputs already
 produced by the estimator — specifically the joint-Wald / Bonferroni
 pre-trends p-value from pre-period event-study coefficients (see
diff --git a/tests/test_business_report.py b/tests/test_business_report.py
@@ -1100,6 +1100,59 @@ class _Result:
         ), f"cluster column must propagate from fit to Hausman pretest; got {captured}"
 
 
+class TestCSNotYetTreatedControlGroupSemantics:
+    """Round-13 P1 regression: ``BusinessReport`` must not relabel
+    ``n_control_units`` as generic "control" for a
+    ``CallawaySantAnna(control_group='not_yet_treated')`` fit — that
+    field counts only never-treated units, while the actual comparison
+    group is the dynamic not-yet-treated set at each (g, t) cell.
+    """
+
+    def test_not_yet_treated_fit_does_not_render_misleading_control_count(self):
+        sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7)
+        # Fit with the dynamic not-yet-treated comparison mode.
+        cs = CallawaySantAnna(base_period="universal", control_group="not_yet_treated").fit(
+            sdf,
+            outcome="outcome",
+            unit="unit",
+            time="period",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        br = BusinessReport(cs, auto_diagnostics=False)
+        sample = br.to_dict()["sample"]
+
+        # Fixed ``n_control`` must NOT be populated — the comparison set
+        # is dynamic per (g, t), not a fixed unit tally.
+        assert (
+            sample["n_control"] is None
+        ), f"n_control must be None for not_yet_treated; got {sample['n_control']}"
+        # The new fields surface the real semantics.
+        assert sample["control_group"] == "not_yet_treated"
+        assert sample["n_never_treated"] == getattr(cs, "n_control_units", None)
+
+        # Both summary and full_report must describe the dynamic
+        # comparison group rather than asserting a misleading "control"
+        # count.
+        summary = br.summary()
+        # No "(N treated, N control)" phrasing on this path.
+        assert " control)" not in summary
+        assert "not-yet-treated" in summary or "dynamic" in summary
+
+        full = br.full_report()
+        assert "- Control:" not in full or "not-yet-treated" in full
+        assert "dynamic not-yet-treated" in full or "not-yet-treated" in full
+
+    def test_never_treated_fit_still_shows_fixed_control_count(self, cs_fit):
+        """Default path (``control_group='never_treated'``) keeps the
+        fixed ``n_control`` tally so existing prose is unchanged."""
+        fit, _ = cs_fit  # default is never_treated
+        br = BusinessReport(fit, auto_diagnostics=False)
+        sample = br.to_dict()["sample"]
+        assert isinstance(sample["n_control"], int)
+        assert sample["control_group"] == "never_treated"
+
+
 class TestBRDataKwargsPassthroughToAutoDR:
     """Round-12 regression: ``BusinessReport`` now accepts
     ``data`` / ``outcome`` / ``treatment`` / ``unit`` / ``time`` /