Address twenty-eighth round of CI review findings on PR #318

igerber · claude · igerber · commit cd85edac0d23 · 2026-04-19T11:32:21.000-04:00
P2 code quality (TROP applicable_checks mismatch). TROP identification
is factor-model-based, not PT-based; the estimator-native
``_pt_factor()`` handler returns ``status="not_applicable"`` and
REPORTING.md routes TROP PT to factor-model diagnostics. Exposing
``parallel_trends`` in ``_APPLICABILITY["TROPResults"]`` advertised a
handler that never runs, leaving callers who gate workflows on
``applicable_checks`` with a contract mismatch. Remove PT from the
TROP applicability set.

P2 methodology (CS repeated-cross-section count labels).
``CallawaySantAnna(panel=False)`` stores treated / control counts as
OBSERVATIONS rather than units
(``staggered_results.py`` lines 183-184 render them as "obs:" in that
mode). BR previously labeled them "units" / "present in the panel",
which misstates the sample composition on RCS fits. Add a
``count_unit`` field to the BR sample schema (derived from
``results.panel``) and branch the summary / full-report rendering:
RCS fits render "never-treated observations" and "present in the
repeated cross-section sample" instead of the panel-mode phrasing.

P3 coverage (survey PT prose / replay propagation). The round-27 fix
added the ``_survey`` method suffix and ``df_denom`` schema field but
did not carry the provenance through the prose / replay helpers:

  * ``_pt_method_subject`` and ``_pt_method_stat_label`` didn't
    recognize ``joint_wald_survey`` / ``joint_wald_event_study_survey``,
    so BR prose fell through to the generic "Pre-treatment data" /
    "joint p" default;
  * ``_lift_pre_trends`` didn't preserve ``df_denom`` in the BR schema,
    so downstream consumers couldn't see the finite-sample correction
    without re-consulting the DR schema;
  * ``_format_precomputed_pt`` didn't carry ``df_denom`` on replay, so
    a survey-aware DR block round-tripped as a chi-square-style
    passthrough.

All three helpers now recognize / preserve the survey variants.

Tests: 7 new regressions.

  * ``TestCSRepeatedCrossSectionCountLabels`` (3 tests): schema flag,
    panel-mode wording, RCS-mode wording;
  * ``TestTROPApplicableChecksExcludesParallelTrends`` (1 test):
    TROP DR exposes no PT in applicable_checks;
  * ``TestSurveyPTProsePropagation`` (2 tests): ``_lift_pre_trends``
    preserves ``df_denom``, and method helpers return "joint p" +
    event-study subject for both survey variants;
  * ``test_precomputed_survey_pt_replay_preserves_df_denom`` (DR, 1
    test): round-trip replay of a ``joint_wald_event_study_survey``
    block preserves ``method``, ``df_denom``, and ``df``.

249 BR / DR / practitioner tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py
@@ -585,6 +585,18 @@ def _extract_sample(self) -> Dict[str, Any]:
                 n_never_treated = n_control_units
                 n_control = None
 
+        # Panel-vs-RCS count semantics. CallawaySantAnnaResults stores
+        # treated/control counts as OBSERVATIONS (not units) when the
+        # fit used ``panel=False`` — ``staggered_results.py L183-L184``
+        # renders those counts as "obs:" rather than "units:". BR
+        # previously labeled them as "units" / "present in the panel",
+        # which misstates the sample composition for repeated cross-
+        # section fits. Carry the flag into the schema so rendering can
+        # branch. Round-28 P2 CI review on PR #318.
+        count_unit = (
+            "observations" if getattr(r, "panel", True) is False else "units"
+        )
+
         sample_block: Dict[str, Any] = {
             "n_obs": _safe_int(getattr(r, "n_obs", None)),
             "n_treated": n_treated,
@@ -595,6 +607,7 @@ def _extract_sample(self) -> Dict[str, Any]:
             "n_periods": _safe_int(getattr(r, "n_periods", None)),
             "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)),
             "post_periods": _safe_list_len(getattr(r, "post_periods", None)),
+            "count_unit": count_unit,
             "survey": survey,
         }
         if n_never_enabled is not None:
@@ -685,6 +698,11 @@ def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]:
         "joint_p_value": pt.get("joint_p_value"),
         "verdict": pt.get("verdict"),
         "n_pre_periods": pt.get("n_pre_periods"),
+        # Carry the denominator df through when the survey F-reference
+        # branch was used so BR consumers can flag the finite-sample
+        # correction without re-consulting the DR schema (round-28 P3
+        # CI review on PR #318).
+        "df_denom": pt.get("df_denom"),
         "power_status": pp.get("status"),
         "power_tier": pp.get("tier"),
         "mdv": pp.get("mdv"),
@@ -1356,7 +1374,19 @@ def _pt_method_subject(method: Optional[str]) -> str:
         return "The pre-period slope-difference test"
     if method == "hausman":
         return "The Hausman PT-All vs PT-Post pretest"
-    if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}:
+    if method in {
+        "joint_wald",
+        "joint_wald_event_study",
+        "joint_wald_no_vcov",
+        "bonferroni",
+        # Survey-aware event-study PT variants use an F reference
+        # distribution with denominator df = ``survey_metadata.df_survey``
+        # (round-27 P1 fix, documented in REPORTING.md). The subject
+        # remains the pre-period event-study coefficients; prose elsewhere
+        # flags the finite-sample correction via ``df_denom``.
+        "joint_wald_survey",
+        "joint_wald_event_study_survey",
+    }:
         return "Pre-treatment event-study coefficients"
     if method == "synthetic_fit":
         return "The synthetic-control pre-treatment fit"
@@ -1368,11 +1398,21 @@ def _pt_method_subject(method: Optional[str]) -> str:
 def _pt_method_stat_label(method: Optional[str]) -> Optional[str]:
     """Return the joint-statistic label appropriate to the PT method.
 
-    Returns ``"joint p"`` for Wald / Bonferroni paths, ``"p"`` for the
-    2x2 slope-difference and Hausman paths (which are single-statistic
-    tests), and ``None`` for design-enforced paths that have no p-value.
+    Returns ``"joint p"`` for Wald / Bonferroni paths (including the
+    survey-aware F-reference variants, which remain joint tests on the
+    pre-period coefficient vector — only the reference distribution
+    changes), ``"p"`` for the 2x2 slope-difference and Hausman paths
+    (single-statistic tests), and ``None`` for design-enforced paths
+    that have no p-value.
     """
-    if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}:
+    if method in {
+        "joint_wald",
+        "joint_wald_event_study",
+        "joint_wald_no_vcov",
+        "bonferroni",
+        "joint_wald_survey",
+        "joint_wald_event_study_survey",
+    }:
         return "joint p"
     if method in {"slope_difference", "hausman"}:
         return "p"
@@ -1716,14 +1756,22 @@ def _render_summary(schema: Dict[str, Any]) -> str:
     n_ne = sample.get("n_never_enabled")
     is_dynamic = sample.get("dynamic_control")
     cg = sample.get("control_group")
+    # Panel-vs-RCS count-unit label. For repeated cross-section fits
+    # (``panel=False`` on CallawaySantAnna), treated / never-treated
+    # tallies are observation counts, not unit counts. Keep the
+    # "N treated" phrasing (the N is still correct), but adjust the
+    # never-enabled / never-treated clause so it does not label
+    # observation counts as "units" for an RCS sample.
+    count_unit = sample.get("count_unit", "units")
+    ne_unit_word = "observations" if count_unit == "observations" else "units"
     if isinstance(n_obs, int):
         if isinstance(n_t, int) and isinstance(n_c, int):
             sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).")
         elif is_dynamic and isinstance(n_t, int):
             if isinstance(n_ne, int) and n_ne > 0:
-                subset_clause = f"; {n_ne:,} never-enabled units are also present"
+                subset_clause = f"; {n_ne:,} never-enabled {ne_unit_word} are also present"
             elif isinstance(n_nt, int) and n_nt > 0:
-                subset_clause = f"; {n_nt:,} never-treated units are also present"
+                subset_clause = f"; {n_nt:,} never-treated {ne_unit_word} are also present"
             else:
                 subset_clause = ""
             # Estimator-specific dynamic-comparison phrasing. StackedDiD
@@ -1904,16 +1952,28 @@ def _render_full_report(schema: Dict[str, Any]) -> str:
         estimator_block.get("class_name") if isinstance(estimator_block, dict) else None
     )
     cg = sample.get("control_group")
+    # Panel-vs-RCS count-unit label for the full report. Mirrors the
+    # summary path: CallawaySantAnna's ``panel=False`` mode stores
+    # counts as observations, not units (round-28 P2).
+    md_count_unit = sample.get("count_unit", "units")
+    md_ne_unit_word = "observations" if md_count_unit == "observations" else "units"
+    md_sample_location = (
+        "in the repeated cross-section sample"
+        if md_count_unit == "observations"
+        else "in the panel"
+    )
     if isinstance(sample.get("n_control"), int):
         lines.append(f"- Control: {sample['n_control']:,}")
     elif sample.get("dynamic_control"):
         if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0:
             lines.append(
-                f"- Never-enabled units present in the panel: {sample['n_never_enabled']:,}"
+                f"- Never-enabled {md_ne_unit_word} present "
+                f"{md_sample_location}: {sample['n_never_enabled']:,}"
             )
         elif isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0:
             lines.append(
-                f"- Never-treated units present in the panel: {sample['n_never_treated']:,}"
+                f"- Never-treated {md_ne_unit_word} present "
+                f"{md_sample_location}: {sample['n_never_treated']:,}"
             )
         if estimator_name == "StackedDiDResults":
             n_distinct = sample.get("n_distinct_controls_trimmed")
diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py
@@ -141,8 +141,14 @@
         {"parallel_trends", "sensitivity", "design_effect", "estimator_native"}
     ),
     "TROPResults": frozenset(
+        # TROP identification is factor-model-based, not parallel-trends-
+        # based: the estimator-native ``_pt_factor()`` handler returns
+        # ``status="not_applicable"``, and REPORTING.md routes TROP PT
+        # to factor-model diagnostics instead. Exposing PT in
+        # ``applicable_checks`` advertised a handler that never runs —
+        # round-28 P2 CI review on PR #318 flagged the contract mismatch
+        # for callers who gate workflows on ``applicable_checks``.
         {
-            "parallel_trends",
             "sensitivity",
             "design_effect",
             "heterogeneity",
@@ -2113,6 +2119,13 @@ def _read(name: str) -> Any:
             out["test_statistic"] = test_statistic
         if df is not None:
             out["df"] = df
+        # Preserve the survey-F denominator df when replaying a schema-
+        # shaped PT block from the default path (round-28 P3 CI review
+        # on PR #318). Without this, the finite-sample correction
+        # recorded on the source block is silently dropped at replay.
+        df_denom = _to_python_float(_read("df_denom"))
+        if df_denom is not None:
+            out["df_denom"] = df_denom
         return out
 
     # -- Headline metric extraction ----------------------------------------
diff --git a/tests/test_business_report.py b/tests/test_business_report.py
@@ -2396,6 +2396,137 @@ def test_efficient_compare_control_groups_persists_after_sensitivity_runs(self):
         )
 
 
+class TestCSRepeatedCrossSectionCountLabels:
+    """Round-28 P2 CI review on PR #318: ``CallawaySantAnna(panel=False)``
+    stores treated / control counts as OBSERVATIONS, not units
+    (``staggered_results.py L183-L184`` renders them as "obs:" in that
+    mode). BR previously labeled them as "units" / "present in the
+    panel", which misstates the sample composition on repeated-cross-
+    section fits. The schema now carries a ``count_unit`` flag and the
+    rendering branches on it.
+    """
+
+    @staticmethod
+    def _stub(panel: bool):
+        class CallawaySantAnnaResults:
+            pass
+
+        stub = CallawaySantAnnaResults()
+        stub.overall_att = 1.0
+        stub.overall_se = 0.2
+        stub.overall_p_value = 0.001
+        stub.overall_conf_int = (0.6, 1.4)
+        stub.alpha = 0.05
+        stub.n_obs = 1000
+        stub.n_treated_units = 200
+        stub.n_control_units = 800
+        stub.survey_metadata = None
+        stub.event_study_effects = None
+        stub.control_group = "not_yet_treated"
+        stub.panel = panel
+        return stub
+
+    def test_schema_exposes_count_unit(self):
+        for panel, expected in [(True, "units"), (False, "observations")]:
+            sample = BusinessReport(
+                self._stub(panel), auto_diagnostics=False
+            ).to_dict()["sample"]
+            assert sample["count_unit"] == expected
+
+    def test_panel_true_renders_unit_wording(self):
+        br = BusinessReport(self._stub(panel=True), auto_diagnostics=False)
+        summary = br.summary()
+        md = br.full_report()
+        assert "never-treated units" in summary
+        assert "present in the panel" in md
+        assert "repeated cross-section sample" not in md
+
+    def test_panel_false_renders_rcs_wording(self):
+        br = BusinessReport(self._stub(panel=False), auto_diagnostics=False)
+        summary = br.summary()
+        md = br.full_report()
+        # RCS-specific wording in both surfaces.
+        assert "never-treated observations" in summary
+        assert "repeated cross-section sample" in md
+        # No misleading "units" or "panel" claims.
+        assert "never-treated units" not in summary
+        assert "present in the panel" not in md
+
+
+class TestTROPApplicableChecksExcludesParallelTrends:
+    """Round-28 P2 CI review on PR #318: TROP identification is
+    factor-model-based; its native PT handler returns
+    ``status="not_applicable"``. Advertising ``parallel_trends`` in
+    ``DiagnosticReport.applicable_checks`` for TROP was a contract
+    mismatch for callers using that set to gate workflows or UI.
+    """
+
+    def test_trop_applicable_checks_omits_parallel_trends(self):
+        from diff_diff import DiagnosticReport
+
+        class TROPResults:
+            pass
+
+        stub = TROPResults()
+        stub.overall_att = 1.0
+        stub.overall_se = 0.2
+        stub.alpha = 0.05
+        stub.n_obs = 100
+
+        dr = DiagnosticReport(stub)
+        assert "parallel_trends" not in dr.applicable_checks, (
+            "TROP PT routes to factor-model diagnostics and is "
+            "not_applicable; it must not appear in applicable_checks."
+        )
+
+
+class TestSurveyPTProsePropagation:
+    """Round-28 P3 CI review on PR #318: the survey F-reference PT
+    variants (``joint_wald_survey``, ``joint_wald_event_study_survey``)
+    must carry through BR's method-aware label helpers so prose uses
+    "joint p" (not the fall-through default) and preserves the
+    ``df_denom`` provenance in the BR schema.
+    """
+
+    def test_lift_pre_trends_preserves_df_denom(self):
+        from diff_diff.business_report import _lift_pre_trends
+
+        fake_dr = {
+            "parallel_trends": {
+                "status": "ran",
+                "method": "joint_wald_event_study_survey",
+                "joint_p_value": 0.35,
+                "df_denom": 30.0,
+                "n_pre_periods": 3,
+                "verdict": "no_detected_violation",
+            },
+            "pretrends_power": {"status": "not_applicable"},
+        }
+        lifted = _lift_pre_trends(fake_dr)
+        assert lifted["method"] == "joint_wald_event_study_survey"
+        assert lifted["df_denom"] == 30.0
+
+    def test_survey_pt_method_stat_label_uses_joint_p(self):
+        from diff_diff.business_report import (
+            _pt_method_stat_label,
+            _pt_method_subject,
+        )
+
+        for method in ("joint_wald_survey", "joint_wald_event_study_survey"):
+            assert _pt_method_stat_label(method) == "joint p", (
+                f"Survey PT variant {method!r} must map to 'joint p' "
+                f"(the joint test remains; only the reference "
+                f"distribution changes)."
+            )
+            assert (
+                _pt_method_subject(method)
+                == "Pre-treatment event-study coefficients"
+            ), (
+                f"Survey PT variant {method!r} must use the event-study "
+                f"subject phrase, not the generic fall-through."
+            )
+
+
 class TestSDiDJackknifeStepPersistsAfterNativeSensitivity:
     """Round-24 P2 CI review on PR #318: the SyntheticDiD practitioner
     step "Leave-one-out influence (jackknife)" must persist after
diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py
@@ -864,6 +864,29 @@ def test_joint_wald_uses_F_reference_when_survey_df_is_finite(self):
         # back to chi-square.
         assert expected_p_survey > expected_p_chi2
 
+    def test_precomputed_survey_pt_replay_preserves_df_denom(self, cs_fit):
+        """Round-28 P3 regression: a schema-shaped PT block carrying the
+        survey ``df_denom`` and ``_survey`` method suffix must round-trip
+        through ``precomputed={"parallel_trends": ...}`` without losing
+        the finite-sample provenance. Previously ``_format_precomputed_pt``
+        dropped ``df_denom``, so replaying a survey-aware DR block
+        silently demoted it to a chi-square-style passthrough.
+        """
+        fit, _ = cs_fit
+        survey_pt = {
+            "method": "joint_wald_event_study_survey",
+            "joint_p_value": 0.18,
+            "test_statistic": 5.2,
+            "df": 3,
+            "df_denom": 20.0,
+        }
+        dr = DiagnosticReport(fit, precomputed={"parallel_trends": survey_pt})
+        pt = dr.to_dict()["parallel_trends"]
+        assert pt["status"] == "ran"
+        assert pt["method"] == "joint_wald_event_study_survey"
+        assert pt["df_denom"] == 20.0
+        assert pt["df"] == 3
+
     def test_joint_wald_ignores_non_finite_survey_df(self):
         """If ``df_survey`` is NaN / inf / non-positive, fall back to
         chi-square (no finite-sample correction available).