Address PR #353 CI review round 2 (1 P1 + 1 P3)

igerber · claude · igerber · commit 84835defc69e · 2026-04-23T20:21:26.000-04:00
P1 - ordered-categorical chronology: raw `t < base_period` /
`t > base_period` comparisons in `joint_pretrends_test`,
`joint_homogeneity_test`, and `did_had_pretest_workflow(aggregate=
"event_study")` silently misorder ordered-categorical time columns
whose lexical and chronological order disagree (e.g. categories
["q1", "q2", "q10"] sort lexically as "q1" < "q10" < "q2"). On
such panels the raw comparison could (a) silently drop valid
pre-period horizons via the raw `<` check, (b) emit a spurious
"joint pre-trends skipped" verdict from the workflow's `earlier_pre`
filter, or (c) raise on valid post-period inputs.

Fix: new private helper `_build_period_rank` returns a
{period_label: chronological_rank} map using the ordered-
categorical category order when applicable, natural sort on
numeric / datetime otherwise. Both wrappers compare period labels
via rank (`rank[t1] < rank[t2]`) instead of raw Python `<`/`>`.
The workflow's `earlier_pre` replaces the raw-< filter with
`list(t_pre_list[:-1])` - `t_pre_list` is already chronologically
sorted by the validator (via its `_sort_key`), so excluding the
last element yields the earlier pre-periods regardless of dtype.

P3 - ordered-categorical regression tests: new
`TestOrderedCategoricalChronology` class (4 tests) with a fixture
using categories `["q1", "q2", "q10", "post"]`. Covers (a) direct
pretrends wrapper picks up both earlier placebos, (b) pretrends
wrapper rejects lexically-ordered-but-chrono-invalid input (e.g.
pre=["q10"], base="q2"), (c) homogeneity wrapper accepts valid
post-period input, (d) workflow event-study dispatch surfaces both
earlier placebos in `pretrends_joint.horizon_labels` without the
false skip note.

123 tests pass (119 + 4 new); black/ruff/mypy clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py
@@ -1640,6 +1640,30 @@ def _validate_multi_period_panel(
     )
 
 
+def _build_period_rank(data: pd.DataFrame, time_col: str) -> Dict[Any, int]:
+    """Build a ``{period_label: chronological_rank}`` map.
+
+    For ordered categorical time columns, uses the declared category
+    order, so e.g. ``["q1", "q2", "q10"]`` ranks chronologically even
+    though it sorts lexically as ``"q1" < "q10" < "q2"``; categories
+    with no rows in ``data`` are ranked too. For other dtypes, uses
+    Python ``sorted`` on the unique labels (natural numeric / datetime
+    order). Object dtypes fall back to lexicographic order - convert
+    such labels to an ordered categorical first if chronology matters
+    (this mirrors the contract in ``_validate_had_panel_event_study``).
+
+    The rank map lets the joint-pretest wrappers compare period labels
+    chronologically via ``rank[t1] < rank[t2]`` instead of raw Python
+    ``t1 < t2``, which would silently misorder ordered-categorical
+    panels (paper Appendix B.2 support contract).
+    """
+    time_dtype = data[time_col].dtype
+    if isinstance(time_dtype, pd.CategoricalDtype) and time_dtype.ordered:
+        return {c: i for i, c in enumerate(time_dtype.categories)}
+    periods = sorted(data[time_col].unique())
+    return {p: i for i, p in enumerate(periods)}
+
+
 def _aggregate_for_joint_test(
     data: pd.DataFrame,
     outcome_col: str,
@@ -2157,25 +2181,32 @@ def joint_pretrends_test(
             f"base_period={base_period!r} must not appear in " f"pre_periods {list(pre_periods)!r}."
         )
 
-    # Ordering check: all pre_periods strictly < base_period (natural
-    # order on the column dtype). We rely on the time column being
-    # comparable (numeric, datetime, or ordered categorical); other
-    # dtypes would silently misorder. The multi-period validator (when
-    # called via the workflow) enforces an ordered dtype; direct callers
-    # get a TypeError here on incomparable types.
-    try:
-        out_of_order = [t for t in pre_periods if not (t < base_period)]
-    except TypeError as exc:
-        raise TypeError(
-            "pre_periods and base_period must be comparable "
-            "(numeric, datetime, or ordered categorical values). "
-            f"Got pre_periods={list(pre_periods)!r}, "
-            f"base_period={base_period!r}."
-        ) from exc
+    # Ordering check: all pre_periods strictly < base_period in
+    # chronological order. Uses `_build_period_rank` to handle ordered-
+    # categorical time columns correctly (raw Python `<` would fail on
+    # categories whose lexical order disagrees with chronology, e.g.
+    # ["q1", "q2", "q10"]). Numeric / datetime dtypes get natural order.
+    period_rank = _build_period_rank(data, time_col)
+    if base_period not in period_rank:
+        raise ValueError(
+            f"base_period={base_period!r} not found in time_col "
+            f"{time_col!r}. Available: "
+            f"{sorted(period_rank.keys(), key=lambda t: period_rank[t])!r}."
+        )
+    missing_pre_in_data = [t for t in pre_periods if t not in period_rank]
+    if missing_pre_in_data:
+        raise ValueError(
+            f"pre_periods entries {missing_pre_in_data!r} not found in "
+            f"time_col {time_col!r}. Available: "
+            f"{sorted(period_rank.keys(), key=lambda t: period_rank[t])!r}."
+        )
+    base_rank = period_rank[base_period]
+    out_of_order = [t for t in pre_periods if period_rank[t] >= base_rank]
     if out_of_order:
         raise ValueError(
-            f"All pre_periods must be strictly < base_period. "
-            f"Violators: {out_of_order!r} (base_period={base_period!r})."
+            f"All pre_periods must be strictly < base_period in "
+            f"chronological order. Violators: {out_of_order!r} "
+            f"(base_period={base_period!r})."
         )
 
     # Event-study validation contract (paper Appendix B.2):
@@ -2341,21 +2372,31 @@ def joint_homogeneity_test(
             f"post_periods {list(post_periods)!r}."
         )
 
-    # Ordering: all post_periods >= base_period (and in fact strictly
-    # greater under the HAD contract where base is the last pre-period).
-    try:
-        out_of_order = [t for t in post_periods if not (t > base_period)]
-    except TypeError as exc:
-        raise TypeError(
-            "post_periods and base_period must be comparable "
-            "(numeric, datetime, or ordered categorical values). "
-            f"Got post_periods={list(post_periods)!r}, "
-            f"base_period={base_period!r}."
-        ) from exc
+    # Ordering: all post_periods strictly > base_period in
+    # chronological order. Uses `_build_period_rank` for ordered-
+    # categorical correctness (raw Python `>` would misorder e.g.
+    # "q10" > "q2").
+    period_rank = _build_period_rank(data, time_col)
+    if base_period not in period_rank:
+        raise ValueError(
+            f"base_period={base_period!r} not found in time_col "
+            f"{time_col!r}. Available: "
+            f"{sorted(period_rank.keys(), key=lambda t: period_rank[t])!r}."
+        )
+    missing_post_in_data = [t for t in post_periods if t not in period_rank]
+    if missing_post_in_data:
+        raise ValueError(
+            f"post_periods entries {missing_post_in_data!r} not found in "
+            f"time_col {time_col!r}. Available: "
+            f"{sorted(period_rank.keys(), key=lambda t: period_rank[t])!r}."
+        )
+    base_rank = period_rank[base_period]
+    out_of_order = [t for t in post_periods if period_rank[t] <= base_rank]
     if out_of_order:
         raise ValueError(
-            f"All post_periods must be strictly > base_period. "
-            f"Violators: {out_of_order!r} (base_period={base_period!r})."
+            f"All post_periods must be strictly > base_period in "
+            f"chronological order. Violators: {out_of_order!r} "
+            f"(base_period={base_period!r})."
         )
 
     # Event-study validation contract (paper Appendix B.2) - twin of
@@ -2595,7 +2636,15 @@ def did_had_pretest_workflow(
         # strictly before base_period). If only the base pre-period is
         # available (len(t_pre_list) == 1), there are no earlier
         # placebos; set pretrends_joint=None and flag in verdict.
-        earlier_pre = [t for t in t_pre_list if t < base_period]
+        # ``t_pre_list`` is returned chronologically sorted by
+        # ``_validate_had_panel_event_study`` (using the column's
+        # ordered-categorical category order or the natural numeric /
+        # datetime order), so taking everything but the last element
+        # gives the earlier pre-periods regardless of dtype. Raw
+        # ``t < base_period`` would misorder ordered-categorical labels
+        # whose lexical and chronological order disagree (e.g. "q10" <
+        # "q2" lexically but > chronologically).
+        earlier_pre = list(t_pre_list[:-1])
         if len(earlier_pre) >= 1:
             pretrends_joint = joint_pretrends_test(
                 data_filtered,
diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py
@@ -2357,6 +2357,130 @@ def test_event_study_all_conclusive_no_reject_admissible(self):
         assert "TWFE admissible under Section 4" in verdict
 
 
+class TestOrderedCategoricalChronology:
+    """R2 P1 regressions: ordered-categorical time columns whose lexical
+    and chronological order disagree (e.g. ``"q10"`` < ``"q2"``
+    lexically but > chronologically). Raw ``t < base_period`` comparisons
+    misorder these panels; the wrappers and workflow must use validated-
+    rank comparisons to apply the test to the intended horizons."""
+
+    @staticmethod
+    def _categorical_panel(
+        G: int = 60,
+        categories=("q1", "q2", "q10", "post"),
+        first_treat="post",
+        seed: int = 501,
+    ) -> pd.DataFrame:
+        """Panel with ordered-categorical time whose lexical order
+        (``"post" < "q1" < "q10" < "q2"``) differs from chronological
+        order (``"q1" < "q2" < "q10" < "post"``)."""
+        cat_type = pd.CategoricalDtype(categories=list(categories), ordered=True)
+        rng = np.random.default_rng(seed)
+        doses = rng.uniform(0.05, 1.0, size=G)
+        rows = []
+        for g in range(G):
+            for t in categories:
+                is_post = t == first_treat
+                d = float(doses[g]) if is_post else 0.0
+                y = 0.1 * g + (0.4 * d if is_post else 0.0) + rng.normal(0.0, 0.1)
+                rows.append({"unit": g, "period": t, "y": y, "d": d})
+        df = pd.DataFrame(rows)
+        df["period"] = df["period"].astype(cat_type)
+        return df
+
+    def test_joint_pretrends_test_uses_chronological_rank(self):
+        """Direct wrapper call with categories ["q1", "q2", "q10"] where
+        the lexical order puts "q10" BEFORE "q2" but chronologically
+        "q10" comes AFTER "q2". Both earlier pre-periods must be accepted
+        against base "q10" without a false out-of-order error."""
+        df = self._categorical_panel()
+        result = joint_pretrends_test(
+            df,
+            "y",
+            "d",
+            "period",
+            "unit",
+            pre_periods=["q1", "q2"],
+            base_period="q10",
+            n_bootstrap=199,
+            seed=3,
+        )
+        assert result.n_horizons == 2
+        assert set(result.horizon_labels) == {"q1", "q2"}
+        # The detrended-outcome residuals are mean-centered; under null
+        # (no pre-trend correlated with D), p is expected to be > 0.05
+        # on this DGP, but only finiteness of the p-value is asserted.
+        assert np.isfinite(result.p_value)
+
+    def test_joint_pretrends_raises_on_lexically_ordered_but_chrono_invalid(self):
+        """With base_period="q2" and pre_periods=["q10"], chronologically
+        q10 > q2 so this is out-of-order - the rank-based check must
+        raise. Raw `<` on the lexical side would INCORRECTLY accept
+        it since "q10" < "q2" lexically."""
+        df = self._categorical_panel()
+        with pytest.raises(ValueError, match="chronological order"):
+            joint_pretrends_test(
+                df,
+                "y",
+                "d",
+                "period",
+                "unit",
+                pre_periods=["q10"],
+                base_period="q2",
+                n_bootstrap=199,
+                seed=0,
+            )
+
+    def test_joint_homogeneity_test_uses_chronological_rank(self):
+        """Homogeneity wrapper twin of the pretrends test. Post-period
+        "post" comes after all pre-periods chronologically; base="q10"
+        is the last pre-period. Lexically "post" < "q10" ("p" < "q"),
+        so a raw ``t > base_period`` check would reject this input."""
+        df = self._categorical_panel()
+        result = joint_homogeneity_test(
+            df,
+            "y",
+            "d",
+            "period",
+            "unit",
+            post_periods=["post"],
+            base_period="q10",
+            n_bootstrap=199,
+            seed=7,
+        )
+        assert result.n_horizons == 1
+        assert result.horizon_labels == ["post"]
+        assert np.isfinite(result.p_value)
+
+    def test_workflow_event_study_ordered_categorical(self):
+        """did_had_pretest_workflow(aggregate="event_study") must pick
+        up BOTH earlier pre-periods ("q1", "q2") from an ordered-
+        categorical panel where a raw ``t < base_period`` filter would
+        silently drop "q2" (lexically "q2" > "q10")."""
+        df = self._categorical_panel()
+        report = did_had_pretest_workflow(
+            df,
+            "y",
+            "d",
+            "period",
+            "unit",
+            aggregate="event_study",
+            n_bootstrap=199,
+            seed=13,
+        )
+        assert report.aggregate == "event_study"
+        assert report.pretrends_joint is not None
+        # t_pre_list = ["q1", "q2", "q10"] chronologically; base = "q10"
+        # (last pre-period); earlier_pre should be ["q1", "q2"] - both
+        # placebo horizons must appear in pretrends_joint.
+        assert set(report.pretrends_joint.horizon_labels) == {"q1", "q2"}
+        assert report.homogeneity_joint is not None
+        assert report.homogeneity_joint.horizon_labels == ["post"]
+        # Verdict does not emit the step-2-skipped flag (both earlier
+        # placebos were found).
+        assert "joint pre-trends skipped" not in report.verdict
+
 class TestHADPretestReportSerialization:
     """Tests for HADPretestReport serialization branching by aggregate."""