Address twenty-fifth round of CI review findings on PR #318

igerber · claude · igerber · commit 3167f05b75ac · 2026-04-19T10:40:35.000-04:00
P2 code quality (full_report PT label).
``BusinessReport.full_report()`` hard-coded ``joint p = ...`` in the Pre-Trends
section, which mislabeled the 2x2 ``slope_difference`` and
EfficientDiD ``hausman`` single-statistic tests (both emit a single
``p``, not a joint p) and invented a nonexistent label for
design-enforced SDiD ``synthetic_fit`` / TROP ``factor`` paths that
have no p-value at all. ``summary()`` was already method-aware via
``_pt_method_stat_label``; the markdown path now uses the same
helper and omits the parenthetical entirely for
no-p-value methods.

P3 docs. ``REPORTING.md``'s "single-knob alpha" note said
``alpha`` drives both the CI level and the phrasing threshold.
The implementation and regression tests actually preserve the
fit's native CI on alpha mismatch (the stored CI is the only
quantile the underlying estimator supplied; bootstrap
distributions and finite-df analytical variances are not always
retained) and only change the significance phrasing, with an
``alpha_override_preserved`` caveat. Updated the note to describe
the preserved-native-CI fallback and the reason for the
conservative choice.

P3 coverage. Add ``TestFullReportMethodAwarePTLabel`` with three
regressions using the same fake-DR-schema pattern the summary
tests use:

  * ``slope_difference`` -> markdown uses ``p = ...``, not ``joint p``;
  * ``hausman`` -> markdown uses ``p = ...``, not ``joint p``;
  * ``synthetic_fit`` -> markdown omits any p-value label; verdict
    still renders.

233 BR / DR / practitioner tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py
@@ -1820,8 +1820,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str:
         jp = pt.get("joint_p_value")
         verdict = pt.get("verdict")
         tier = pt.get("power_tier")
-        jp_str = f"joint p = {jp:.3g}" if isinstance(jp, (int, float)) else "joint p unavailable"
-        lines.append(f"- Verdict: `{verdict}` ({jp_str})")
+        # Use the method-aware statistic label the summary path already
+        # uses: "joint p" for Wald / Bonferroni event-study, "p" for
+        # slope-difference / Hausman single-statistic tests, and None
+        # for design-enforced SDiD / TROP paths where there is no
+        # p-value at all. Round-25 P2 CI review on PR #318 flagged the
+        # hard-coded "joint p" wording as misdescribing 2x2 / Hausman
+        # fits and inventing a nonexistent p-value for SDiD / TROP.
+        method = pt.get("method")
+        stat_label = _pt_method_stat_label(method)
+        if stat_label and isinstance(jp, (int, float)):
+            lines.append(f"- Verdict: `{verdict}` ({stat_label} = {jp:.3g})")
+        elif stat_label:
+            lines.append(f"- Verdict: `{verdict}` ({stat_label} unavailable)")
+        else:
+            lines.append(f"- Verdict: `{verdict}`")
         if tier:
             lines.append(f"- Power tier: `{tier}`")
         mdv = pt.get("mdv")
diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md
@@ -153,11 +153,24 @@ not new inference.
   coefficients), which would be unsafe in the presence of non-linear
   link functions (Poisson QMLE, logit).
 
-- **Note:** Single-knob `alpha`. BusinessReport exposes only `alpha`
-  (defaults to `results.alpha`); there is no separate
-  `significance_threshold` parameter. `alpha` drives both the CI level
-  (`(1 - alpha) * 100`% interval) and the phrasing tier threshold
-  ("statistically significant at the (1 - alpha) * 100% level").
+- **Note:** Single-knob `alpha` with preserved-native-CI fallback.
+  BusinessReport exposes only `alpha` (defaults to `results.alpha`);
+  there is no separate `significance_threshold` parameter. When the
+  requested `alpha` matches the fit's native level, it drives both the
+  CI level (`(1 - alpha) * 100`% interval) and the phrasing tier
+  threshold ("statistically significant at the (1 - alpha) * 100%
+  level"). When the requested `alpha` differs from the fit's native
+  level (e.g., the user asks for `alpha=0.10` on a result fit with
+  `alpha=0.05`), BusinessReport does NOT recompute the CI at the
+  requested level, because the stored CI is the only quantile the
+  underlying estimator supplied (bootstrap distributions and
+  finite-df analytical variances are not always retained on the
+  result). Instead, the schema preserves the fit's native CI (with its
+  original level) and uses the requested `alpha` only for the
+  significance-phrasing threshold, and emits an
+  `alpha_override_preserved` caveat describing the mismatch. This is
+  the conservative choice: it avoids silently recomputing CIs under
+  assumptions the estimator may not support.
 
 - **Note:** Schema stability policy for the AI-legible `to_dict()`
   surface. New top-level keys count as additive (no version bump); new
diff --git a/tests/test_business_report.py b/tests/test_business_report.py
@@ -921,6 +921,112 @@ def test_dr_summary_uses_hausman_wording_for_efficient_did(self, edid_fit):
         assert "event-study coefficients" not in summary
 
 
+class TestFullReportMethodAwarePTLabel:
+    """Round-25 P2 CI review on PR #318: ``BusinessReport.full_report()``
+    previously hard-coded ``joint p = ...`` in the Pre-Trends section,
+    which mislabels the 2x2 ``slope_difference`` and EfficientDiD
+    ``hausman`` single-statistic tests and invents a nonexistent
+    ``joint p`` label for design-enforced SDiD / TROP paths that have
+    no p-value at all. The markdown path must use the same
+    method-aware label helper the summary path already uses
+    (``_pt_method_stat_label``).
+    """
+
+    @staticmethod
+    def _stub_result_with_method(method: str):
+        from diff_diff.diagnostic_report import DiagnosticReportResults
+
+        class DiDResults:
+            pass
+
+        stub = DiDResults()
+        stub.att = 1.0
+        stub.se = 0.2
+        stub.p_value = 0.001
+        stub.conf_int = (0.6, 1.4)
+        stub.alpha = 0.05
+        stub.n_obs = 100
+        stub.n_treated = 40
+        stub.n_control = 60
+        stub.survey_metadata = None
+        stub.inference_method = "analytical"
+
+        pt_block: dict = {
+            "status": "ran",
+            "method": method,
+            "verdict": "no_detected_violation",
+        }
+        # SDiD's synthetic_fit path has no p-value by design; the other
+        # methods do.
+        if method != "synthetic_fit":
+            pt_block["joint_p_value"] = 0.40
+
+        fake_schema = {
+            "schema_version": "1.0",
+            "estimator": "DiDResults",
+            "headline_metric": {"name": "att", "value": 1.0},
+            "parallel_trends": pt_block,
+            "pretrends_power": {"status": "not_applicable"},
+            "sensitivity": {"status": "not_applicable"},
+            "placebo": {"status": "skipped", "reason": "opt-in"},
+            "bacon": {"status": "not_applicable"},
+            "design_effect": {"status": "not_applicable"},
+            "heterogeneity": {"status": "not_applicable"},
+            "epv": {"status": "not_applicable"},
+            "estimator_native_diagnostics": {"status": "not_applicable"},
+            "skipped": {},
+            "warnings": [],
+            "overall_interpretation": "",
+            "next_steps": [],
+        }
+        fake_dr = DiagnosticReportResults(
+            schema=fake_schema,
+            interpretation="",
+            applicable_checks=("parallel_trends",),
+            skipped_checks={},
+            warnings=(),
+        )
+        return stub, fake_dr
+
+    def _pt_section(self, md: str) -> str:
+        # The Pre-Trends section is delimited by the next ``##`` heading.
+        after = md.split("## Pre-Trends", 1)[1]
+        return after.split("\n## ", 1)[0]
+
+    def test_full_report_slope_difference_uses_single_p_label(self):
+        stub, fake_dr = self._stub_result_with_method("slope_difference")
+        md = BusinessReport(stub, diagnostics=fake_dr).full_report()
+        section = self._pt_section(md)
+        assert "joint p" not in section, (
+            f"2x2 slope_difference is a single-statistic test and must "
+            f"not be labeled ``joint p`` in the markdown. Got: {section!r}"
+        )
+        # The single-statistic label ``p = ...`` must be present.
+        assert "p = 0.4" in section
+
+    def test_full_report_hausman_uses_single_p_label(self):
+        stub, fake_dr = self._stub_result_with_method("hausman")
+        section = self._pt_section(
+            BusinessReport(stub, diagnostics=fake_dr).full_report()
+        )
+        assert "joint p" not in section, (
+            f"EfficientDiD Hausman is a single-statistic test and must "
+            f"not be labeled ``joint p`` in the markdown. Got: {section!r}"
+        )
+        assert "p = 0.4" in section
+
+    def test_full_report_synthetic_fit_omits_p_label(self):
+        stub, fake_dr = self._stub_result_with_method("synthetic_fit")
+        section = self._pt_section(
+            BusinessReport(stub, diagnostics=fake_dr).full_report()
+        )
+        # No p-value of any kind for design-enforced SDiD PT analogue.
+        assert "joint p" not in section
+        assert "p = " not in section
+        # Verdict must still render.
+        assert "Verdict:" in section
+
+
 class TestHausmanPretestPropagatesFitDesign:
     """Round-9 regression: ``_pt_hausman`` must propagate the fitted
     result's ``control_group`` and ``anticipation`` into