Skip to content

Commit 752f2b6

Browse files
authored
Merge pull request igerber#341 from igerber/br-dr-canonical-validation
Close BR/DR gap igerber#4: canonical-dataset regression guards + wording fixes
2 parents 73eef66 + 6e24014 commit 752f2b6

5 files changed

Lines changed: 826 additions & 13 deletions

File tree

diff_diff/business_report.py

Lines changed: 79 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1854,6 +1854,48 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str:
18541854
return "the confidence interval includes zero; the data are consistent with no effect"
18551855

18561856

1857+
def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]:
1858+
"""If the smallest evaluated M on the HonestDiD sensitivity grid
1859+
already has the robust CI including zero, return that M. Returns
1860+
``None`` when the grid is missing or when the smallest evaluated
1861+
point is still robust — in the latter case ``breakdown_M`` is an
1862+
interpolated threshold between grid points, not a statement about
1863+
the smallest grid point itself.
1864+
1865+
Matches the twin helper in ``diagnostic_report.py``; keep the two
1866+
in sync for cross-surface parity.
1867+
"""
1868+
grid_points = sens.get("grid") or []
1869+
sorted_grid = sorted(
1870+
(p for p in grid_points if isinstance(p.get("M"), (int, float))),
1871+
key=lambda p: p["M"],
1872+
)
1873+
if not sorted_grid:
1874+
return None
1875+
smallest = sorted_grid[0]
1876+
if not smallest.get("robust_to_zero", True):
1877+
return float(smallest["M"])
1878+
return None
1879+
1880+
1881+
def _sentence_first_upper(text: str) -> str:
1882+
"""Uppercase only the first character of ``text``, preserving all
1883+
other casing. Unlike ``str.capitalize()``, which lowercases every
1884+
character after the first, this keeps user-supplied abbreviations
1885+
and proper nouns intact.
1886+
1887+
Examples
1888+
--------
1889+
>>> _sentence_first_upper("the NJ minimum-wage increase")
1890+
'The NJ minimum-wage increase'
1891+
>>> _sentence_first_upper("Castle Doctrine law adoption")
1892+
'Castle Doctrine law adoption'
1893+
"""
1894+
if not text:
1895+
return text
1896+
return text[0].upper() + text[1:]
1897+
1898+
18571899
def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str:
18581900
"""Return a direction-aware verb for the headline sentence.
18591901
@@ -1929,7 +1971,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str:
19291971
# is not actually available.
19301972
ci_str = " (inference unavailable: confidence interval is undefined for this fit)"
19311973
by_clause = f" by {magnitude}" if effect != 0 else ""
1932-
return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}."
1974+
# Round-1 BR/DR canonical-validation (2026-04-19): Python's
1975+
# ``str.capitalize()`` lowercases everything except the first
1976+
# character, so ``"the NJ minimum-wage increase".capitalize()``
1977+
# returns ``"The nj minimum-wage increase"`` — flattening the
1978+
# ``NJ`` abbreviation. Real canonical datasets (Card-Krueger,
1979+
# Castle Doctrine) carry proper-noun / acronym tokens in the
1980+
# user-supplied ``treatment_label``, so preserve user casing and
1981+
# only ensure the first character is uppercase.
1982+
treatment_sentence = _sentence_first_upper(treatment)
1983+
return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}."
19331984

19341985

19351986
def _render_summary(schema: Dict[str, Any]) -> str:
@@ -2088,11 +2139,33 @@ def _render_summary(schema: Dict[str, Any]) -> str:
20882139
f"pre-period variation."
20892140
)
20902141
elif isinstance(bkd, (int, float)):
2091-
sentences.append(
2092-
f"HonestDiD: the result is fragile — the confidence interval "
2093-
f"includes zero once violations reach {bkd:.2g}x the "
2094-
f"pre-period variation."
2095-
)
2142+
# Round-1 BR/DR canonical-validation (2026-04-19) then
2143+
# tightened per CI review on PR #341 R1:
2144+
# ``breakdown_M`` is the smallest M at which the robust
2145+
# CI includes zero (interpolated between grid points) —
2146+
# not a claim about any specific grid point. Earlier fix
2147+
# keyed off ``bkd <= 0.05`` which incorrectly asserted
2148+
# "smallest grid point fails" even for grids that start
2149+
# at M=0 where the smallest evaluated point is still
2150+
# robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The
2151+
# "smallest grid point" wording is only accurate when
2152+
# the smallest evaluated M on the grid itself fails
2153+
# (``robust_to_zero == False``); otherwise fall through
2154+
# to the numeric multiplier.
2155+
smallest_failed_m = _smallest_failing_grid_m(sens)
2156+
if smallest_failed_m is not None:
2157+
sentences.append(
2158+
"HonestDiD: the result is fragile — the confidence "
2159+
"interval includes zero even at the smallest M "
2160+
f"evaluated on the sensitivity grid (M = "
2161+
f"{smallest_failed_m:.2g})."
2162+
)
2163+
else:
2164+
sentences.append(
2165+
f"HonestDiD: the result is fragile — the confidence "
2166+
f"interval includes zero once violations reach {bkd:.2g}x "
2167+
f"the pre-period variation."
2168+
)
20962169

20972170
# Sample sentence. For fits with a dynamic comparison set (CS /
20982171
# ContinuousDiD / StaggeredTripleDiff / EfficientDiD /

diff_diff/diagnostic_report.py

Lines changed: 53 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2780,6 +2780,32 @@ def _collect_pre_period_coefs(
27802780
return results_list, n_dropped_undefined
27812781

27822782

2783+
def _smallest_failing_grid_m_dr(sens: Dict[str, Any]) -> Optional[float]:
2784+
"""Return the smallest evaluated M on the HonestDiD sensitivity
2785+
grid if it already has the robust CI including zero, else ``None``.
2786+
Matches ``business_report._smallest_failing_grid_m`` — both helpers
2787+
must stay in sync for cross-surface parity. See PR #341 R1 review.
2788+
2789+
``breakdown_M`` is an interpolated threshold between grid points,
2790+
so "the smallest grid point fails" is only a valid claim when the
2791+
smallest actually-evaluated M has ``robust_to_zero == False``. On
2792+
a grid that starts at M=0 where the smallest evaluated point is
2793+
still robust, the breakdown value is information about what
2794+
happens between grid points — not at the smallest grid point.
2795+
"""
2796+
grid_points = sens.get("grid") or []
2797+
sorted_grid = sorted(
2798+
(p for p in grid_points if isinstance(p.get("M"), (int, float))),
2799+
key=lambda p: p["M"],
2800+
)
2801+
if not sorted_grid:
2802+
return None
2803+
smallest = sorted_grid[0]
2804+
if not smallest.get("robust_to_zero", True):
2805+
return float(smallest["M"])
2806+
return None
2807+
2808+
27832809
def _pt_verdict(p: Optional[float]) -> str:
27842810
"""Map a pre-trends joint p-value to the three-bin verdict enum.
27852811
@@ -3118,13 +3144,33 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str
31183144
f"pre-period variation."
31193145
)
31203146
else:
3121-
sentences.append(
3122-
f"HonestDiD sensitivity: the result is fragile — the "
3123-
f"confidence interval includes zero once violations reach "
3124-
f"{bkd:.2g}x the pre-period variation."
3125-
if isinstance(bkd, (int, float))
3126-
else ""
3127-
)
3147+
# Round-1 BR/DR canonical-validation (2026-04-19) then
3148+
# tightened per CI review on PR #341 R1: the "smallest
3149+
# grid point" wording is only semantically correct when
3150+
# the smallest M actually evaluated on the sensitivity
3151+
# grid has ``robust_to_zero == False``. ``breakdown_M``
3152+
# is the interpolated threshold between grid points, so
3153+
# a small breakdown value on a grid starting at M=0
3154+
# (where the smallest evaluated point is still robust)
3155+
# would previously have been narrated as "smallest grid
3156+
# point fails" — stronger than the evaluated grid
3157+
# supports. Mirror BR's fix: check the grid directly.
3158+
if isinstance(bkd, (int, float)):
3159+
smallest_failed_m = _smallest_failing_grid_m_dr(sens)
3160+
if smallest_failed_m is not None:
3161+
sentences.append(
3162+
"HonestDiD sensitivity: the result is fragile — "
3163+
"the confidence interval includes zero even at "
3164+
"the smallest M evaluated on the sensitivity "
3165+
f"grid (M = {smallest_failed_m:.2g})."
3166+
)
3167+
else:
3168+
sentences.append(
3169+
f"HonestDiD sensitivity: the result is fragile — "
3170+
f"the confidence interval includes zero once "
3171+
f"violations reach {bkd:.2g}x the pre-period "
3172+
f"variation."
3173+
)
31283174

31293175
# Sentence 4: one secondary caveat if present.
31303176
bacon = schema.get("bacon") or {}

0 commit comments

Comments
 (0)