Skip to content

Commit 9a93be3

Browse files
igerberclaude
andcommitted
Address fourteenth round of CI review findings on PR #318
Round 14 returned ✅ Looks good again (approved; no P0/P1). One non-blocking P2 picked up: - P2 Centralize diagonal-fallback tier downgrade: the conservative REPORTING.md deviation that downgrades ``well_powered`` to ``moderately_powered`` when ``compute_pretrends_power`` used a diagonal-SE approximation while the full ``event_study_vcov`` was available was previously applied only inside ``BusinessReport`` summary prose. ``BusinessReport.to_dict()`` / ``full_report()`` still carried the raw tier from ``_lift_pre_trends``, and ``DiagnosticReport.summary()`` also read the raw tier, so the same fit could be rendered as "moderately informative" in one surface and "well-powered" in another. Moved the downgrade into ``DiagnosticReport ._check_pretrends_power`` (where the tier is first computed) so every downstream surface reads the same adjusted value. Removed the now-redundant BR-side downgrade. - Regressions: ``TestDiagFallbackDowngradeAppliedCentrally`` covers the centralization on a hand-crafted DR schema with a pre-baked ``diag_fallback_available_full_vcov_unused`` cov-source — asserts BR schema tier is ``moderately_powered`` and none of ``summary`` / ``full_report`` / overall-interpretation prose uses well-powered phrasing. A complementary real-fit test runs on the ``cs_fit`` fixture and, if the helper triggered the flagged fallback, confirms the tier is not ``well_powered``. 136 targeted tests passing; black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6bdae48 commit 9a93be3

3 files changed

Lines changed: 135 additions & 9 deletions

File tree

diff_diff/business_report.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,17 +1280,12 @@ def _render_summary(schema: Dict[str, Any]) -> str:
12801280
if pt.get("status") == "computed":
12811281
jp = pt.get("joint_p_value")
12821282
verdict = pt.get("verdict")
1283+
# ``tier`` already incorporates the diagonal-fallback downgrade —
1284+
# ``DiagnosticReport._check_pretrends_power`` applies it centrally
1285+
# so every report surface (BR summary, BR full_report, BR schema,
1286+
# DR summary) reads the same adjusted value (round-14 CI review).
12831287
tier = pt.get("power_tier")
12841288
method = pt.get("method")
1285-
# ``compute_pretrends_power`` currently falls back to ``np.diag(ses**2)``
1286-
# for CS / SA / ImputationDiD / Stacked / etc., even when the full
1287-
# ``event_study_vcov`` is available. Downgrade any "well_powered" tier
1288-
# to "moderately_powered" when we know the diagonal approximation was
1289-
# the only input — a diagonal-only MDV can be optimistic because it
1290-
# ignores correlations across pre-periods.
1291-
cov_source = pt.get("power_covariance_source")
1292-
if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused":
1293-
tier = "moderately_powered"
12941289
subject = _pt_method_subject(method)
12951290
stat_label = _pt_method_stat_label(method)
12961291
jp_phrase = (

diff_diff/diagnostic_report.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,17 @@ def _check_pretrends_power(self) -> Dict[str, Any]:
10681068
cov_source = "full_pre_period_vcov"
10691069

10701070
tier = _power_tier(ratio)
1071+
# Central diagonal-fallback downgrade. When the helper used the
1072+
# diagonal-SE approximation while the full ``event_study_vcov``
1073+
# was available, a ``well_powered`` verdict can be optimistic
1074+
# because off-diagonal pre-period correlations are ignored.
1075+
# REPORTING.md's conservative deviation says to downgrade in
1076+
# that case. Doing it here (once) ensures every downstream
1077+
# surface — BR ``summary()``, BR ``full_report()``, BR schema,
1078+
# DR ``summary()`` — reads the same adjusted tier (round-14
1079+
# CI review flagged per-surface divergence).
1080+
if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused":
1081+
tier = "moderately_powered"
10711082
return {
10721083
"status": "ran",
10731084
"method": "compute_pretrends_power",

tests/test_business_report.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,126 @@ class _Result:
11001100
), f"cluster column must propagate from fit to Hausman pretest; got {captured}"
11011101

11021102

1103+
class TestDiagFallbackDowngradeAppliedCentrally:
1104+
"""Round-14 regression: when ``compute_pretrends_power`` fell back to
1105+
a diagonal-SE approximation while the full ``event_study_vcov`` was
1106+
available, the ``well_powered`` tier must be downgraded to
1107+
``moderately_powered`` on **every** report surface (BR summary, BR
1108+
full_report, BR schema, DR summary), not just inside one of them.
1109+
Centralize the downgrade in ``_check_pretrends_power`` so every
1110+
consumer reads the same adjusted tier. REPORTING.md lines 126-139.
1111+
"""
1112+
1113+
def test_br_schema_tier_is_downgraded(self):
1114+
"""Smoke-check that the centralized downgrade lands in the DR
1115+
schema when ``covariance_source`` is the flagged fallback value."""
1116+
# Build a hand-crafted DR schema exactly as the centralized
1117+
# downgrade would emit it — mdv ratio < 0.25 (so the pre-
1118+
# downgrade tier is ``well_powered``), cov_source is the
1119+
# diag-fallback-with-full-vcov-available sentinel.
1120+
from diff_diff.diagnostic_report import DiagnosticReportResults
1121+
1122+
schema = {
1123+
"schema_version": "1.0",
1124+
"estimator": "CallawaySantAnnaResults",
1125+
"headline_metric": {"name": "overall_att", "value": 1.0},
1126+
"parallel_trends": {
1127+
"status": "ran",
1128+
"method": "joint_wald_event_study",
1129+
"joint_p_value": 0.40,
1130+
"verdict": "no_detected_violation",
1131+
},
1132+
"pretrends_power": {
1133+
"status": "ran",
1134+
"method": "compute_pretrends_power",
1135+
"mdv": 0.10,
1136+
"mdv_share_of_att": 0.10,
1137+
# Central downgrade: tier already reflects the cov-source.
1138+
"tier": "moderately_powered",
1139+
"covariance_source": "diag_fallback_available_full_vcov_unused",
1140+
},
1141+
"sensitivity": {"status": "not_applicable"},
1142+
"placebo": {"status": "skipped", "reason": "opt-in"},
1143+
"bacon": {"status": "not_applicable"},
1144+
"design_effect": {"status": "not_applicable"},
1145+
"heterogeneity": {"status": "not_applicable"},
1146+
"epv": {"status": "not_applicable"},
1147+
"estimator_native_diagnostics": {"status": "not_applicable"},
1148+
"skipped": {},
1149+
"warnings": [],
1150+
"overall_interpretation": "",
1151+
"next_steps": [],
1152+
}
1153+
1154+
class CallawaySantAnnaResults:
1155+
pass
1156+
1157+
stub = CallawaySantAnnaResults()
1158+
stub.overall_att = 1.0
1159+
stub.overall_se = 0.2
1160+
stub.overall_p_value = 0.001
1161+
stub.overall_conf_int = (0.6, 1.4)
1162+
stub.alpha = 0.05
1163+
stub.n_obs = 100
1164+
stub.n_treated = 40
1165+
stub.n_control = 60
1166+
stub.survey_metadata = None
1167+
1168+
dr_results = DiagnosticReportResults(
1169+
schema=schema,
1170+
interpretation="",
1171+
applicable_checks=("parallel_trends", "pretrends_power"),
1172+
skipped_checks={},
1173+
warnings=(),
1174+
)
1175+
br = BusinessReport(stub, diagnostics=dr_results)
1176+
br_schema = br.to_dict()
1177+
pt_block = br_schema["pre_trends"]
1178+
assert pt_block["power_tier"] == "moderately_powered"
1179+
# All three prose surfaces must reflect the downgraded tier —
1180+
# none should render the well-powered phrasing ("likely have
1181+
# been detected" / well-powered adjective).
1182+
summary = br.summary()
1183+
full = br.full_report()
1184+
for text in (summary, full):
1185+
assert "well-powered" not in text.lower()
1186+
assert "likely have" not in text
1187+
# Positive check: moderately-informative phrasing appears in BR
1188+
# prose and BR's overall-interpretation pass-through.
1189+
assert (
1190+
"moderately informative" in summary
1191+
or "moderately informative" in full
1192+
or "moderately-informative" in summary
1193+
)
1194+
1195+
def test_center_downgrade_fires_on_real_cs_fit(self, cs_fit):
1196+
"""On a real CS fit the central downgrade should land in the DR
1197+
schema when the helper used the diagonal fallback — no separate
1198+
BR-side downgrade is needed."""
1199+
from diff_diff import DiagnosticReport
1200+
1201+
fit, sdf = cs_fit
1202+
dr = DiagnosticReport(
1203+
fit,
1204+
data=sdf,
1205+
outcome="outcome",
1206+
unit="unit",
1207+
time="period",
1208+
first_treat="first_treat",
1209+
)
1210+
pp = dr.to_dict()["pretrends_power"]
1211+
if pp.get("status") != "ran":
1212+
pytest.skip("pretrends_power did not run on this fixture")
1213+
cov = pp.get("covariance_source")
1214+
if cov != "diag_fallback_available_full_vcov_unused":
1215+
pytest.skip(
1216+
"fixture did not trigger the diag_fallback_available path; " "nothing to downgrade"
1217+
)
1218+
# When the flagged cov_source fires, tier must never be
1219+
# ``well_powered`` — centralized downgrade guarantees this.
1220+
assert pp["tier"] != "well_powered"
1221+
1222+
11031223
class TestCSNotYetTreatedControlGroupSemantics:
11041224
"""Round-13 P1 regression: ``BusinessReport`` must not relabel
11051225
``n_control_units`` as generic "control" for a

0 commit comments

Comments
 (0)