Address fourth round of CI review findings on PR #318

igerber · claude · igerber · commit 345f65cc4923 · 2026-04-18T16:58:25.000-04:00
P0 fix:

* **``inference_method == 'wild_bootstrap'`` was not detected as
  bootstrap-like.** My prior bootstrap check caught ``'bootstrap'`` and
  ``variance_method in {bootstrap, jackknife, placebo}`` plus an
  attached ``bootstrap_distribution``, but ``DifferenceInDifferences(
  inference='wild_bootstrap')`` returns ``inference_method='wild_bootstrap'``
  and a percentile-bootstrap CI without necessarily attaching the raw
  distribution. The override path silently replaced that CI with a
  normal-approximation one. Fixed by matching both
  ``'bootstrap'`` and ``'wild_bootstrap'``; the preserved-CI caveat
  now calls out "wild cluster bootstrap" specifically when that path
  triggered. Regression: ``TestWildBootstrapAlphaOverride``.

P1 fix:

* **``_describe_assumption()`` emitted generic DiD PT text for
  ContinuousDiD / TripleDifference / StaggeredTripleDiff**, all of
  which have identifying logic different from ordinary group-time PT
  per ``docs/methodology/REGISTRY.md``. Replaced the generic fallback
  with source-backed branches:

  - ``ContinuousDiDResults``: two-level parallel trends (PT vs Strong
    PT) per Callaway, Goodman-Bacon & Sant'Anna (2024), with explicit
    mention of ATT(d|d), ATT(d), ACRT identification sets.
  - ``TripleDifferenceResults`` / ``StaggeredTripleDiffResults``:
    triple-difference cancellation across the 2x2x2 cells per
    Ortiz-Villavicencio & Sant'Anna (2025); notes that identification
    is weaker than ordinary DiD PT and depends on additive
    separability across the three dimensions.

  The ``parallel_trends_variant`` schema field gains two new values:
  ``"dose_pt_or_strong_pt"`` and ``"triple_difference_cancellation"``.
  Direct regressions in ``TestAssumptionBlockSourceFaithful`` assert
  registry-backed language (attribution phrases + method names) is
  present and generic group-time PT text is absent.

150 targeted tests pass; black, ruff, mypy clean on the new modules.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py
@@ -364,8 +364,11 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An
             )
             variance_method = getattr(r, "variance_method", None)
 
+            # Any resampling-based inference surface (wild cluster bootstrap,
+            # percentile bootstrap, jackknife, placebo) should preserve its
+            # native CI, whether or not the raw distribution is attached.
             bootstrap_like = (
-                inference_method == "bootstrap"
+                inference_method in {"bootstrap", "wild_bootstrap"}
                 or has_bootstrap_dist
                 or variance_method in {"bootstrap", "jackknife", "placebo"}
             )
@@ -375,10 +378,15 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An
                 # Preserve the fitted CI at its native level.
                 alpha_was_honored = False
                 alpha = float(result_alpha)
+                if inference_method == "wild_bootstrap":
+                    inference_label = "wild cluster bootstrap"
+                elif bootstrap_like:
+                    inference_label = "bootstrap"
+                else:
+                    inference_label = "finite-df"
                 alpha_override_caveat = (
                     f"Requested alpha was not honored for the confidence "
-                    f"interval because this fit uses "
-                    f"{'bootstrap' if bootstrap_like else 'finite-df'} "
+                    f"interval because this fit uses {inference_label} "
                     f"inference; the displayed CI remains at the fit's "
                     f"native level ({int(round((1.0 - result_alpha) * 100))}%). "
                     f"The significance phrasing still uses the requested alpha."
@@ -611,6 +619,50 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]:
                 "captured through latent factor loadings."
             ),
         }
+    if estimator_name == "ContinuousDiDResults":
+        # Callaway, Goodman-Bacon & Sant'Anna (2024), two-level PT:
+        # REGISTRY.md §ContinuousDiD > Identification.
+        return {
+            "parallel_trends_variant": "dose_pt_or_strong_pt",
+            "no_anticipation": True,
+            "description": (
+                "ContinuousDiD identifies dose-specific treatment effects "
+                "under two possible parallel-trends conditions (Callaway, "
+                "Goodman-Bacon & Sant'Anna 2024). Parallel Trends (PT) "
+                "assumes untreated potential outcome paths are equal across "
+                "all dose groups and the untreated group (conditional on "
+                "dose), identifying ATT(d|d) and the binarized ATT^loc but "
+                "NOT ATT(d), ACRT, or cross-dose comparisons. Strong "
+                "Parallel Trends (SPT) additionally rules out selection "
+                "into dose on the basis of treatment effects and is "
+                "required to identify the dose-response curve ATT(d), "
+                "marginal effect ACRT(d), and cross-dose contrasts."
+            ),
+        }
+    if estimator_name in {"TripleDifferenceResults", "StaggeredTripleDiffResults"}:
+        # Ortiz-Villavicencio & Sant'Anna (2025) — identification is the
+        # triple-difference cancellation across the 2x2x2 cells, not
+        # ordinary DiD parallel trends; see REGISTRY.md §TripleDifference
+        # and §StaggeredTripleDifference.
+        return {
+            "parallel_trends_variant": "triple_difference_cancellation",
+            "no_anticipation": True,
+            "description": (
+                "Triple-difference identification relies on the DDD "
+                "decomposition (Ortiz-Villavicencio & Sant'Anna 2025): "
+                "the ATT is recovered from `DDD = DiD_A + DiD_B - DiD_C` "
+                "across the Group x Period x Eligibility (or Treatment) "
+                "cells, which differences out group-specific and "
+                "period-specific unobservables without requiring separate "
+                "parallel trends to hold between each cell pair. The "
+                "identifying restriction is therefore weaker than ordinary "
+                "DiD parallel trends but assumes that the residual "
+                "unobservable component is additively separable across the "
+                "three dimensions; practical overlap and common-support "
+                "conditions still apply on the propensity score when "
+                "covariates are used."
+            ),
+        }
     if estimator_name in {
         "CallawaySantAnnaResults",
         "SunAbrahamResults",
@@ -620,7 +672,6 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]:
         "EfficientDiDResults",
         "WooldridgeDiDResults",
         "ChaisemartinDHaultfoeuilleResults",
-        "StaggeredTripleDiffResults",
     }:
         return {
             "parallel_trends_variant": "conditional_or_group_time",
diff --git a/tests/test_business_report.py b/tests/test_business_report.py
@@ -606,6 +606,127 @@ def test_finite_df_fit_preserves_fitted_ci_on_alpha_mismatch(self):
         assert "alpha_override_preserved" in caveat_topics
 
 
+class TestWildBootstrapAlphaOverride:
+    """Regression for the round-4 P0 finding that ``inference='wild_bootstrap'``
+    results were falling through to a normal-approximation recomputation."""
+
+    def test_wild_bootstrap_preserves_fitted_ci(self):
+        class _WildBootstrapStub:
+            def __init__(self):
+                self.att = 1.0
+                self.se = 0.5
+                self.p_value = 0.04
+                # 95% CI produced by the wild cluster bootstrap surface.
+                self.conf_int = (0.10, 1.90)
+                self.alpha = 0.05
+                self.n_obs = 100
+                self.n_treated = 40
+                self.n_control = 60
+                self.inference_method = "wild_bootstrap"
+                self.survey_metadata = None
+                # Wild-boot fits don't necessarily carry a raw distribution;
+                # the inference_method string alone must be enough.
+                self.bootstrap_distribution = None
+
+        stub = _WildBootstrapStub()
+        br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False)
+        h = br.to_dict()["headline"]
+        assert h["ci_level"] == 95, (
+            "Wild cluster bootstrap must preserve fitted CI level on alpha "
+            f"mismatch; got {h['ci_level']}"
+        )
+        assert h["ci_lower"] == pytest.approx(0.10)
+        assert h["ci_upper"] == pytest.approx(1.90)
+        caveats = br.caveats()
+        assert any(c.get("topic") == "alpha_override_preserved" for c in caveats)
+        # Caveat message should call out wild cluster bootstrap specifically.
+        preserved_msg = next(
+            c["message"] for c in caveats if c.get("topic") == "alpha_override_preserved"
+        )
+        assert "wild cluster bootstrap" in preserved_msg
+
+
+class TestAssumptionBlockSourceFaithful:
+    """Regression for the round-4 P1 finding that ``_describe_assumption``
+    was producing generic DiD PT text for ContinuousDiD, TripleDifference,
+    and StaggeredTripleDifference — all of which have different identifying
+    logic per the Methodology Registry."""
+
+    def _stub(self, class_name):
+        cls = type(class_name, (), {})
+        obj = cls()
+        obj.att = 1.0
+        obj.se = 0.1
+        obj.p_value = 0.001
+        obj.conf_int = (0.8, 1.2)
+        obj.alpha = 0.05
+        obj.n_obs = 100
+        obj.n_treated = 40
+        obj.n_control = 60
+        obj.survey_metadata = None
+        obj.event_study_effects = None
+        obj.inference_method = "analytical"
+        return obj
+
+    def test_continuous_did_assumption_uses_two_level_pt(self):
+        br = BusinessReport(self._stub("ContinuousDiDResults"), auto_diagnostics=False)
+        assumption = br.to_dict()["assumption"]
+        assert assumption["parallel_trends_variant"] == "dose_pt_or_strong_pt"
+        desc = assumption["description"]
+        # Registry-backed language: PT vs Strong PT + ACRT mention.
+        assert "Strong Parallel Trends" in desc or "SPT" in desc
+        assert "ATT(d" in desc or "ACRT" in desc
+        assert "Callaway" in desc  # attribution to CGBS 2024
+
+    def test_triple_difference_assumption_uses_ddd_decomposition(self):
+        class TripleDifferenceResults:
+            pass
+
+        obj = TripleDifferenceResults()
+        obj.att = 1.0
+        obj.se = 0.1
+        obj.p_value = 0.001
+        obj.conf_int = (0.8, 1.2)
+        obj.alpha = 0.05
+        obj.n_obs = 100
+        obj.n_treated = 40
+        obj.n_control = 60
+        obj.survey_metadata = None
+        obj.inference_method = "analytical"
+
+        br = BusinessReport(obj, auto_diagnostics=False)
+        assumption = br.to_dict()["assumption"]
+        assert assumption["parallel_trends_variant"] == "triple_difference_cancellation"
+        desc = assumption["description"]
+        assert "DDD" in desc
+        assert "Ortiz-Villavicencio" in desc or "2025" in desc
+
+    def test_staggered_triple_diff_assumption_uses_ddd_not_generic_pt(self):
+        class StaggeredTripleDiffResults:
+            pass
+
+        obj = StaggeredTripleDiffResults()
+        obj.overall_att = 1.0
+        obj.overall_se = 0.1
+        obj.overall_p_value = 0.001
+        obj.overall_conf_int = (0.8, 1.2)
+        obj.alpha = 0.05
+        obj.n_obs = 100
+        obj.n_treated = 40
+        obj.n_control = 60
+        obj.survey_metadata = None
+        obj.event_study_effects = None
+        obj.inference_method = "analytical"
+
+        br = BusinessReport(obj, auto_diagnostics=False)
+        assumption = br.to_dict()["assumption"]
+        assert assumption["parallel_trends_variant"] == "triple_difference_cancellation"
+        desc = assumption["description"]
+        assert "triple-difference" in desc.lower() or "DDD" in desc
+        # Must NOT be the generic group-time PT text.
+        assert "group-time ATT" not in desc
+
+
 class TestFullReportSingleM:
     """Regression: ``full_report()`` must not claim full-grid robustness for a
     single-M HonestDiDResults passthrough. The summary path was fixed earlier;