
Commit 73ef44c

igerber and claude committed
Add BR/DR canonical-dataset validation + two wording fixes
Closes BR/DR foundation gap #4 (real-dataset validation) from the external-positioning gap list in ``project_br_dr_foundation.md``.

Validation artifacts:

- ``docs/validation/validate_br_dr_canonical.py`` runs BusinessReport / DiagnosticReport on Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), and Castle Doctrine (Cheng-Hoekstra 2013, under both CS and SA), dumping summary + full_report + selected to_dict blocks for each.
- ``docs/validation/br_dr_canonical_validation.md`` is the regenerable raw output.
- ``docs/validation/br_dr_canonical_findings.md`` is the hand-written synthesis: direction / verdict / sensitivity tier all match canonical interpretations, with two small wording bugs surfaced and fixed in this PR and two larger gaps queued as follow-up (SA HonestDiD applicability, target-parameter disambiguation).

Wording fixes:

1. Treatment-label capitalization. ``str.capitalize()`` lowercased every character after the first, flattening embedded abbreviations (``"the NJ minimum-wage increase"`` → ``"The nj minimum-wage increase"``) and proper-noun phrases (``"Castle Doctrine law adoption"`` → ``"Castle doctrine law adoption"``). Replaced with a ``_sentence_first_upper`` helper that preserves user-supplied casing.
2. ``breakdown_M == 0`` phrasing. The HonestDiD fragile sentence quoted ``{breakdown_M:.2g}x the pre-period variation``, which renders as a degenerate ``0x`` in the exact-zero case surfaced by Cheng-Hoekstra. At ``breakdown_M <= 0.05`` (covering 0 and near-zero values), both BR's summary and DR's overall_interpretation now say "includes zero even at the smallest parallel-trends violations on the sensitivity grid" instead.

Tests: 5 new regressions in ``TestCanonicalValidationSurfaceFixes`` covering both fixes plus three boundary cases (exact zero, small positive, normal fragile value).
Not in scope: Favara-Imbs (dCDH reversible-treatment dataset not bundled), ImputationDiD / TwoStageDiD on canonical data (needed to exercise the R42 untreated-outcome FE assumption branch on real data), and the SA HonestDiD applicability gap. All tracked in the findings doc for follow-up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent cbb8814 commit 73ef44c

6 files changed

Lines changed: 1446 additions & 13 deletions

File tree

diff_diff/business_report.py

Lines changed: 48 additions & 6 deletions
```diff
@@ -1854,6 +1854,24 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str:
     return "the confidence interval includes zero; the data are consistent with no effect"
 
 
+def _sentence_first_upper(text: str) -> str:
+    """Uppercase only the first character of ``text``, preserving all
+    other casing. Unlike ``str.capitalize()``, which lowercases every
+    character after the first, this keeps user-supplied abbreviations
+    and proper nouns intact.
+
+    Examples
+    --------
+    >>> _sentence_first_upper("the NJ minimum-wage increase")
+    'The NJ minimum-wage increase'
+    >>> _sentence_first_upper("Castle Doctrine law adoption")
+    'Castle Doctrine law adoption'
+    """
+    if not text:
+        return text
+    return text[0].upper() + text[1:]
+
+
 def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str:
     """Return a direction-aware verb for the headline sentence.
@@ -1929,7 +1947,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str:
         # is not actually available.
         ci_str = " (inference unavailable: confidence interval is undefined for this fit)"
     by_clause = f" by {magnitude}" if effect != 0 else ""
-    return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}."
+    # Round-1 BR/DR canonical-validation (2026-04-19): Python's
+    # ``str.capitalize()`` lowercases everything except the first
+    # character, so ``"the NJ minimum-wage increase".capitalize()``
+    # returns ``"The nj minimum-wage increase"`` — flattening the
+    # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger,
+    # Castle Doctrine) carry proper-noun / acronym tokens in the
+    # user-supplied ``treatment_label``, so preserve user casing and
+    # only ensure the first character is uppercase.
+    treatment_sentence = _sentence_first_upper(treatment)
+    return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}."
 
 
 def _render_summary(schema: Dict[str, Any]) -> str:
@@ -2088,11 +2115,26 @@ def _render_summary(schema: Dict[str, Any]) -> str:
                 f"pre-period variation."
             )
         elif isinstance(bkd, (int, float)):
-            sentences.append(
-                f"HonestDiD: the result is fragile — the confidence interval "
-                f"includes zero once violations reach {bkd:.2g}x the "
-                f"pre-period variation."
-            )
+            # Round-1 BR/DR canonical-validation (2026-04-19):
+            # ``breakdown_M`` at or near zero reads as "0x the
+            # pre-period variation" which is a degenerate sentence
+            # (zero-times-anything is zero). The correct wording when
+            # the CI includes zero at the smallest grid point is to
+            # say the result is fragile to essentially any nonzero
+            # violation, not to quote the ``0x`` multiplier.
+            if bkd <= 0.05:
+                sentences.append(
+                    "HonestDiD: the result is fragile — the confidence "
+                    "interval includes zero even at the smallest "
+                    "parallel-trends violations on the sensitivity "
+                    "grid."
+                )
+            else:
+                sentences.append(
+                    f"HonestDiD: the result is fragile — the confidence "
+                    f"interval includes zero once violations reach {bkd:.2g}x "
+                    f"the pre-period variation."
+                )
 
     # Sample sentence. For fits with a dynamic comparison set (CS /
     # ContinuousDiD / StaggeredTripleDiff / EfficientDiD /
```

diff_diff/diagnostic_report.py

Lines changed: 24 additions & 7 deletions
```diff
@@ -3118,13 +3118,30 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str
                 f"pre-period variation."
             )
         else:
-            sentences.append(
-                f"HonestDiD sensitivity: the result is fragile — the "
-                f"confidence interval includes zero once violations reach "
-                f"{bkd:.2g}x the pre-period variation."
-                if isinstance(bkd, (int, float))
-                else ""
-            )
+            # Round-1 BR/DR canonical-validation (2026-04-19): the
+            # "fragile — CI includes zero once violations reach 0x
+            # the pre-period variation" wording is a degenerate
+            # sentence at the ``breakdown_M == 0`` edge case
+            # surfaced by the Cheng-Hoekstra (2013) Castle Doctrine
+            # dataset. Mirror BR's fix: when the breakdown value is
+            # at or near zero, say the CI includes zero at the
+            # smallest grid point rather than quoting a ``0x``
+            # multiplier.
+            if isinstance(bkd, (int, float)):
+                if bkd <= 0.05:
+                    sentences.append(
+                        "HonestDiD sensitivity: the result is fragile — "
+                        "the confidence interval includes zero even at "
+                        "the smallest parallel-trends violations on the "
+                        "sensitivity grid."
+                    )
+                else:
+                    sentences.append(
+                        f"HonestDiD sensitivity: the result is fragile — "
+                        f"the confidence interval includes zero once "
+                        f"violations reach {bkd:.2g}x the pre-period "
+                        f"variation."
+                    )
 
     # Sentence 4: one secondary caveat if present.
     bacon = schema.get("bacon") or {}
```
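The BR and DR hunks implement the same thresholded wording. A minimal standalone sketch of that branch logic (the function name and `prefix` parameter are mine; the strings and the 0.05 threshold follow the diff):

```python
from typing import Any

def fragile_sentence(bkd: Any, prefix: str = "HonestDiD") -> str:
    # Sketch of the breakdown_M wording branch from this commit: at or
    # near zero (<= 0.05), avoid quoting a degenerate "0x" multiplier.
    if not isinstance(bkd, (int, float)):
        return ""
    if bkd <= 0.05:
        return (f"{prefix}: the result is fragile — the confidence interval "
                "includes zero even at the smallest parallel-trends "
                "violations on the sensitivity grid.")
    return (f"{prefix}: the result is fragile — the confidence interval "
            f"includes zero once violations reach {bkd:.2g}x "
            f"the pre-period variation.")

print(fragile_sentence(0.0))   # grid wording, no "0x" multiplier
print(fragile_sentence(1.28))  # quotes "1.3x the pre-period variation"
```

Note that `{bkd:.2g}` renders `0` as plain `0`, which is exactly how the degenerate `0x` surfaced on the Castle Doctrine run.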
docs/validation/br_dr_canonical_findings.md

Lines changed: 175 additions & 0 deletions
# BR / DR canonical-dataset validation findings

This file records divergences observed in ``br_dr_canonical_validation.md`` against canonical literature interpretations. Generated by running ``docs/validation/validate_br_dr_canonical.py`` on the bundled datasets (Card-Krueger 1994, the Callaway-Sant'Anna mpdta benchmark, Castle Doctrine / Cheng-Hoekstra 2013). This closes BR/DR foundation gap #4 — real-dataset validation — from the external-positioning gap list in ``project_br_dr_foundation.md``.

The goal of the validation exercise is to stress-test BR's prose on fits that published applied work has already interpreted, not to reproduce their point estimates exactly (the bundled datasets are either the R `did` package simulated benchmark or the causaldata mirrors, which may differ from the original author data).

## Headline assessment

BR's prose direction, verdicts, and caveat framing match canonical interpretations across all four runs:

- **Card-Krueger**: positive sign, CI includes zero, "data consistent with no effect." Matches the famous Card-Krueger finding of no disemployment.
- **mpdta (CS)**: aggregate ATT negative (-0.021 log-points), pre-trends `no_detected_violation`, HonestDiD `robust_to_M_1.28`. Matches CS tutorial expectations that the fit is robust.
- **Castle Doctrine (CS)**: positive sign (homicides went up), pre-trends `clear_violation` (joint p = 0.003), HonestDiD `fragile` (breakdown_M = 0). Matches Cheng-Hoekstra's escalation finding AND correctly flags the identifying-assumption fragility the staggered rollout produces.
- **Castle Doctrine (SA)**: identical point estimates (as expected — CS and SA are algebraically consistent on this data), same clear PT-violation verdict.

No wrong-sign or wrong-verdict findings surfaced on any of the four runs. The Bacon "already-robust" framing lifted from round-45 reads correctly on the staggered fits (CS and SA on Castle Doctrine and mpdta): the caveat is scoped as a statement about the rollout design, not a switch-estimator recommendation.

## Issues fixed in this PR

Small prose bugs surfaced by the real-data output. Each is a wording fix, not a methodology defect. Both fixes are regression-tested under ``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes``.

### Issue 1 (FIXED): Treatment-label first-word capitalization eats abbreviations

Card-Krueger output:

> The **nj** minimum-wage increase lifted FTE employment by 1.47 FTE …

Castle Doctrine output (CS and SA):

> **Castle doctrine** law adoption worsened Homicide rate (per 100k) …

BR used ``str.capitalize()``, which lowercases every character after the first. For labels starting with an abbreviation (``"the NJ minimum-wage increase"``) or a proper-noun phrase (``"Castle Doctrine law adoption"``), this flattened case in a way that looked wrong in stakeholder-facing prose.

**Fix**: replaced ``str.capitalize()`` with a new ``_sentence_first_upper`` helper that uppercases only the first character and preserves user-supplied casing for everything else.

### Issue 2 (FIXED): ``breakdown_M = 0`` phrasing reads as "0x" (zero times something)

Castle Doctrine (CS):

> HonestDiD: the result is fragile — the confidence interval includes zero once violations reach **0x** the pre-period variation.

When the breakdown M is exactly 0, "reach 0x the pre-period variation" is a degenerate reading — the CI already includes zero under any nonzero pre-trend violation (or even with zero violation, depending on the grid).

**Fix**: when ``breakdown_M <= 0.05``, both BR's summary and DR's overall-interpretation sentence emit "the confidence interval includes zero even at the smallest parallel-trends violations on the sensitivity grid" instead of quoting the ``0x`` multiplier. The 0.05 threshold also covers near-zero values (e.g., 0.03) where the multiplier is equally uninformative to stakeholders.

### Issue 3 (deferred): Outcome-label capitalization mid-sentence

mpdta output:

> … reduced **Log employment** by 0.0214 log-points …

The user-supplied ``outcome_label="Log employment"`` is rendered as-is, which looks awkward mid-sentence. This is stylistic and arguably user-controllable (the user could pass ``outcome_label="log employment"``). Deprioritized unless the Issue 1 fix extends to it trivially. Noted here for follow-up.

## Issues to track as follow-up (out of scope for this PR)

### Follow-up A: ``SunAbrahamResults`` excluded from HonestDiD applicability

The applicability matrix (``diagnostic_report.py``, line ~107) lists ``SunAbrahamResults`` with ``{parallel_trends, pretrends_power, bacon, design_effect, heterogeneity}`` — no ``sensitivity``. But the original plan's applicability matrix in ``project_br_dr_foundation.md`` and the SA methodology surface (event-study coefficients + VCov) both support HonestDiD in principle.

Observed on Castle Doctrine (SA):

> ## Sensitivity (HonestDiD)
>
> - Sensitivity not computed: sensitivity is not applicable to SunAbrahamResults.

Given that SA shows the same PT violation that CS does on this dataset, not having HonestDiD sensitivity on SA is a real usability gap. Fixing it requires adding an SA adapter to ``compute_honest_did`` and expanding the applicability matrix; that is library work beyond BR/DR prose. Belongs in the BR/DR gap-list expansion.

### Follow-up B: Target-parameter clarity (gap-list item #6)

The assumption block on every staggered fit still reads:

> Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation.

But the CS ``overall_att`` for mpdta is a specific weighted average of ``ATT(g, t)`` cells, SA is an IW average, Stacked is a sub-experiment-weighted average, and dCDH is a switchers average. BR's headline reports a single number without disambiguating the estimand. For Baker et al. (2025) practitioner-guide parity, the assumption block should carry the target-parameter clause. Already tracked as gap #6.

### Follow-up C: Card-Krueger effect size differs from the published ATT

Our bundled ``load_card_krueger`` returns an ATT of +1.47 FTE; the published Card-Krueger ATT is ~+0.59 FTE. The direction and the CI-includes-zero verdict match the canonical reading, but the magnitude does not. This is a ``datasets.py`` data-loading question (the causaldata mirror may aggregate differently than the original author sample), not a BR prose bug. Noted here so a future data-validation PR can address it upstream.

## What was validated (summary)

- The end-to-end BR / DR flow runs without errors on 4 canonical datasets.
- The direction of the effect matches the canonical interpretation on all 4.
- The pre-trends verdict tier (no_detected_violation / clear_violation) matches the literature's reading.
- The HonestDiD sensitivity tier (robust vs fragile) matches.
- The Bacon "already-robust" framing from round-45 reads correctly on real staggered data.
- The identifying-assumption source-faithfulness retags from round-42 (BJS / Gardner untreated-outcome FE model) did not surface on these runs because none of the datasets was run through ImputationDiD or TwoStageDiD — follow-up validation should add those.

## Regeneration

```bash
python docs/validation/validate_br_dr_canonical.py
```

The script writes ``br_dr_canonical_validation.md`` (the raw output artifact); this file is the findings synthesis, written by hand from that artifact.
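The three boundary cases named in the commit message (exact zero, small positive, normal fragile value) could be sketched as self-contained regressions like the following. This is a hypothetical illustration: the threshold logic is inlined rather than imported, and the test names are mine; the real suite lives in ``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes`` and exercises the library renderers directly.

```python
import unittest

def fragile_clause(bkd: float) -> str:
    # Inlined copy of the breakdown_M threshold from this commit,
    # not an import from the library.
    if bkd <= 0.05:
        return ("the confidence interval includes zero even at the smallest "
                "parallel-trends violations on the sensitivity grid")
    return (f"the confidence interval includes zero once violations "
            f"reach {bkd:.2g}x the pre-period variation")

class TestBreakdownBoundaries(unittest.TestCase):
    def test_exact_zero_avoids_degenerate_multiplier(self):
        self.assertNotIn("0x", fragile_clause(0.0))

    def test_small_positive_uses_grid_wording(self):
        self.assertIn("smallest parallel-trends violations",
                      fragile_clause(0.03))

    def test_normal_fragile_value_quotes_multiplier(self):
        self.assertIn("0.8x the pre-period variation", fragile_clause(0.8))

# run with: python -m unittest <this module>
```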
