Address PR #350 CI review round 2: P1 first_treat cross-validation + P1 ordered time

igerber · claude · igerber · commit 6e321ad74ed0 · 2026-04-22T07:45:02.000-04:00
**P1 (first_treat_col vs dose mismatch):** The last-cohort filter trusted
`first_treat_col` without validating it against the observed dose path.
A swapped or mistyped cohort label could silently retain the wrong
cohort as F_last.

Fix: `_validate_had_panel_event_study` now cross-validates each unit's
declared first_treat against their actual first-positive-dose period:
- declared == 0: unit must have D=0 at every period
- declared == F_g > 0: unit's first period with D>0 must equal F_g
Any mismatch raises `ValueError` with an example unit, declared value,
and actual first-positive period.

**P1 (unordered time labels):** Event-study chronology was inferred via
raw `sorted()` on period labels. For object/string dtypes that falls
back to lexicographic sort, which silently misorders panels like
"pre1"/"pre2"/"post1"/"post2" or month-name labels.

Fix: Event-study path now requires a numeric, datetime, or ordered-
categorical time column. Object/string dtypes raise a front-door
`ValueError` directing users to convert. Ordered categoricals are
sorted by their declared category order (not the underlying string),
via a dtype-aware `_sort_key` defined identically in both the
validator and the multi-period aggregator.

**P3 (docstring):** Class docstring no longer says the event-study
extension is "queued for Phase 2b"; now documents both aggregation
modes with pointers to the respective result classes.

**Tests added:**
- `test_first_treat_col_mismatch_with_dose_raises` pins the cross-
  validation contract.
- `test_unordered_string_time_col_rejected` pins front-door rejection
  of object dtypes.
- `test_ordered_categorical_time_col_accepted` confirms ordered
  categoricals sort by category order and fit successfully.

Minor: added `observed=False` to the categorical-groupby in the balance
check to silence the pandas FutureWarning while preserving behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/had.py b/diff_diff/had.py
@@ -903,6 +903,36 @@ def _validate_had_panel_event_study(
             f"single-period WAS)."
         )
 
+    # Ordered-time-type check. Paper Appendix B.2 event-time horizons
+    # require chronological ordering of periods (anchor at F-1, horizons
+    # e = t - F relative to F). Phase 2a two-period panels can use the
+    # dose invariant alone to distinguish pre from post without needing
+    # chronological order, so string labels ("pre", "post") work there.
+    # For multi-period event-study, multiple pre-periods all have D=0
+    # and multiple post-periods may both have D>0, so dose alone cannot
+    # recover chronology: we must trust the time column's natural order.
+    # Raw lexicographic sort on object/string labels silently misorders
+    # panels like "pre1"/"pre2"/"post1"/"post2" or month-name labels.
+    # Require an explicitly-ordered time representation.
+    time_dtype = data[time_col].dtype
+    if not (
+        pd.api.types.is_numeric_dtype(time_dtype)
+        or pd.api.types.is_datetime64_any_dtype(time_dtype)
+        or (isinstance(time_dtype, pd.CategoricalDtype) and bool(time_dtype.ordered))
+    ):
+        raise ValueError(
+            f"HAD aggregate='event_study' requires an ordered time "
+            f"column. time_col={time_col!r} has dtype={time_dtype!r}, "
+            f"which has no defined chronological order; raw sort would "
+            f"fall back to lexicographic ordering and silently misindex "
+            f"event-time horizons (e.g., 'pre1'/'pre2'/'post1'/'post2' "
+            f"sorts lexicographically but not chronologically). "
+            f"Convert time_col to numeric (e.g., integer year), "
+            f"datetime, or ordered categorical "
+            f"(``pd.Categorical(..., ordered=True, categories=[...])``) "
+            f"before calling fit() with aggregate='event_study'."
+        )
+
     # NaN checks on key columns (before any filter).
     for col in [outcome_col, dose_col, unit_col]:
         if bool(data[col].isna().any()):
@@ -936,6 +966,45 @@ def _validate_had_panel_event_study(
                 f"within unit for {n_bad} unit(s). Each unit must have "
                 f"a single first_treat value across all observed periods."
             )
+        # Cross-validate first_treat_col against observed first-positive-
+        # dose period for every unit. A mislabeled cohort column would
+        # otherwise silently select the wrong cohort as F_last and return
+        # event-study estimates for the wrong units. Contract:
+        #   - declared first_treat == 0: unit must have D == 0 at all t
+        #     (never-treated)
+        #   - declared first_treat == F_g > 0: unit's first period with
+        #     D > 0 must equal F_g
+        df_for_check = data.sort_values([unit_col, time_col])
+        pos_rows = df_for_check.loc[df_for_check[dose_col] > 0]
+        actual_first_pos = pos_rows.groupby(unit_col)[time_col].first()
+        declared_ft = df_for_check.groupby(unit_col)[first_treat_col].first()
+        n_mismatch = 0
+        example_mismatch: Optional[Tuple[Any, Any, Any]] = None
+        for u, declared in declared_ft.items():
+            actual = actual_first_pos.get(u, None)
+            if declared == 0:
+                if actual is not None:
+                    n_mismatch += 1
+                    if example_mismatch is None:
+                        example_mismatch = (u, declared, actual)
+            else:
+                if actual is None or actual != declared:
+                    n_mismatch += 1
+                    if example_mismatch is None:
+                        example_mismatch = (u, declared, actual)
+        if n_mismatch > 0:
+            u, declared, actual = example_mismatch  # type: ignore[misc]
+            raise ValueError(
+                f"first_treat_col={first_treat_col!r} disagrees with the "
+                f"observed dose path for {n_mismatch} unit(s). Example: "
+                f"unit={u!r} declares first_treat={declared!r} but the "
+                f"unit's first period with D>0 is {actual!r} "
+                f"(None means never-treated). A mislabeled cohort column "
+                f"would silently select the wrong cohort as F_last in the "
+                f"last-cohort auto-filter. Fix the first_treat_col values "
+                f"to equal each unit's first positive-dose period (or 0 "
+                f"for never-treated) before calling fit()."
+            )
         # Identify cohorts (nonzero first_treat values).
         # Use pd.unique to preserve dtype; sort with a stable key.
         ft_unique = list(pd.unique(ft_raw))
@@ -1015,8 +1084,9 @@ def _validate_had_panel_event_study(
                 )
 
     # Balanced panel on the (possibly-filtered) data: every unit appears
-    # exactly once per period.
-    counts = data_filtered.groupby([unit_col, time_col]).size()
+    # exactly once per period. ``observed=False`` preserves current
+    # behavior on categorical time columns (pandas' default is changing).
+    counts = data_filtered.groupby([unit_col, time_col], observed=False).size()
     if (counts != 1).any():
         n_bad = int((counts != 1).sum())
         raise ValueError(
@@ -1057,36 +1127,35 @@ def _validate_had_panel_event_study(
             f"zero dose; there is no treatment to estimate."
         )
 
-    # Sort by natural ordering on the time column dtype. Tuple key
-    # ``(x is None, x)`` places None at the end and sorts the rest by
-    # natural order (works for int/float/str/datetime when the dtype is
-    # homogeneous; mixed dtypes would raise at comparison time, which is
-    # the desired failure mode).
-    t_pre_list = sorted(t_pre_list_unsorted, key=lambda x: (x is None, x))
-    t_post_list = sorted(t_post_list_unsorted, key=lambda x: (x is None, x))
+    # Sort by natural ordering on the time column dtype. For ordered
+    # categorical dtypes, use the declared category order (since
+    # ``list(categorical)`` strips the ordered semantics and falls back
+    # to string comparison). For numeric / datetime, use natural Python
+    # order. Tuple key places None at the end.
+    if isinstance(time_dtype, pd.CategoricalDtype) and time_dtype.ordered:
+        _cat_order = {c: i for i, c in enumerate(time_dtype.categories)}
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, _cat_order.get(x, len(_cat_order)))
+
+    else:
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, x)
+
+    t_pre_list = sorted(t_pre_list_unsorted, key=_sort_key)
+    t_post_list = sorted(t_post_list_unsorted, key=_sort_key)
 
     # Contiguity check: all pre < all post in the natural ordering.
     # The HAD dose invariant requires a single transition from all-zero
     # to any-nonzero; interleaved pre/post periods indicate a malformed
     # panel (e.g., dose going back to zero after treatment, or mixing
-    # never-treated units with out-of-order labels).
+    # never-treated units with out-of-order labels). Uses ``_sort_key``
+    # so ordered categoricals respect their declared category order.
     if t_pre_list and t_post_list:
         max_pre = t_pre_list[-1]
         min_post = t_post_list[0]
-        # Check all pre-periods are less than all post-periods via the
-        # natural order. If types are comparable, direct comparison works;
-        # otherwise fall back to the sorted-key view.
-        try:
-            contiguous = max_pre < min_post
-        except TypeError:
-            # Mixed incomparable dtypes (e.g., None vs int after removing
-            # None above). Fall back to sorted-position check.
-            contiguous = True
-            for pre_p in t_pre_list:
-                for post_p in t_post_list:
-                    if not (pre_p < post_p):
-                        contiguous = False
-                        break
+        contiguous = _sort_key(max_pre) < _sort_key(min_post)
         if not contiguous:
             raise ValueError(
                 f"HAD dose invariant violated: pre-periods (all D=0) "
@@ -1318,7 +1387,23 @@ def _aggregate_multi_period_first_differences(
         equal to the LAST pre-period).
     """
     df = data.sort_values([unit_col, time_col]).reset_index(drop=True)
-    all_periods = sorted(t_pre_list + t_post_list, key=lambda x: (x is None, x))
+    # Period sort respects ordered categorical dtypes (matches
+    # ``_validate_had_panel_event_study``). The validator already
+    # enforces a numeric / datetime / ordered-categorical dtype on the
+    # event-study path, so ``_sort_key`` lookups are well-defined here.
+    time_dtype = data[time_col].dtype
+    if isinstance(time_dtype, pd.CategoricalDtype) and time_dtype.ordered:
+        _cat_order = {c: i for i, c in enumerate(time_dtype.categories)}
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, _cat_order.get(x, len(_cat_order)))
+
+    else:
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, x)
+
+    all_periods = sorted(t_pre_list + t_post_list, key=_sort_key)
     # Event-time mapping: natural rank of each period relative to F.
     F_idx = all_periods.index(F)
     period_to_event_time: Dict[Any, int] = {p: (i - F_idx) for i, p in enumerate(all_periods)}
@@ -1604,9 +1689,16 @@ class HeterogeneousAdoptionDiD:
     Weighted-Average-Slope (WAS) estimator with three design-dispatch
     paths: Design 1' (continuous-at-zero), Design 1 continuous-near-
     d_lower, and Design 1 mass-point (2SLS sample-average per paper
-    Section 3.2.4). Phase 2a ships the single-period path only; the
-    multi-period event-study extension (Appendix B.2) is queued for
-    Phase 2b.
+    Section 3.2.4). Two aggregation modes:
+
+    - ``aggregate="overall"`` (Phase 2a, default) returns a single-period
+      :class:`HeterogeneousAdoptionDiDResults` on a two-period panel.
+    - ``aggregate="event_study"`` (Phase 2b, paper Appendix B.2) returns
+      a :class:`HeterogeneousAdoptionDiDEventStudyResults` with per-
+      event-time WAS estimates on a multi-period panel, using a uniform
+      ``F-1`` anchor and pointwise CIs per horizon. Staggered-timing
+      panels auto-filter to the last-treatment cohort plus never-treated
+      units (paper Appendix B.2 prescription).
 
     Parameters
     ----------
diff --git a/tests/test_had.py b/tests/test_had.py
@@ -2727,6 +2727,115 @@ def test_time_varying_post_F_dose_rejected(self):
                 panel, "outcome", "dose", "period", "unit", aggregate="event_study"
             )
 
+    def test_first_treat_col_mismatch_with_dose_raises(self):
+        """first_treat_col disagreeing with observed dose path must raise.
+
+        A mislabeled cohort column would otherwise silently select the
+        wrong cohort as F_last in the last-cohort auto-filter and
+        produce event-study estimates for the wrong units. Covers CI
+        reviewer round 2 P1.
+        """
+        rng = np.random.default_rng(0)
+        G = 40
+        rows = []
+        for g in range(G):
+            # Actual first-positive-dose period: t=3 for half, t=5 for half.
+            F_actual = 3 if g < G // 2 else 5
+            # But deliberately mislabel: swap the first_treat labels so
+            # G/2 units declare 5 when actual is 3, and vice versa.
+            F_declared = 5 if g < G // 2 else 3
+            d_g = float(rng.uniform(0.1, 1.0))
+            for t in range(1, 7):
+                dose = d_g if t >= F_actual else 0.0
+                rows.append(
+                    {
+                        "unit": g,
+                        "period": t,
+                        "dose": dose,
+                        "outcome": rng.standard_normal(),
+                        "first_treat": F_declared,
+                    }
+                )
+        panel = pd.DataFrame(rows)
+        with pytest.raises(ValueError, match="disagrees with the observed dose"):
+            HeterogeneousAdoptionDiD(design="auto").fit(
+                panel,
+                "outcome",
+                "dose",
+                "period",
+                "unit",
+                first_treat_col="first_treat",
+                aggregate="event_study",
+            )
+
+    def test_unordered_string_time_col_rejected(self):
+        """Object/string time columns raise on event-study path.
+
+        Raw sort on arbitrary string labels is lexicographic, not
+        chronological (e.g., 'pre1'/'pre2'/'post1'/'post2' would map
+        to wrong event-time horizons). Covers CI reviewer round 2 P1.
+        """
+        rng = np.random.default_rng(0)
+        G = 50
+        rows = []
+        d_post = rng.uniform(0.0, 1.0, G)
+        d_post[0] = 0.0
+        for g in range(G):
+            for label, dose in [
+                ("pre1", 0.0),
+                ("pre2", 0.0),
+                ("post1", d_post[g]),
+                ("post2", d_post[g]),
+            ]:
+                rows.append(
+                    {
+                        "unit": g,
+                        "period": label,  # object dtype
+                        "dose": dose,
+                        "outcome": rng.standard_normal(),
+                    }
+                )
+        panel = pd.DataFrame(rows)
+        with pytest.raises(ValueError, match="ordered time column|dtype"):
+            HeterogeneousAdoptionDiD(design="auto").fit(
+                panel, "outcome", "dose", "period", "unit", aggregate="event_study"
+            )
+
+    def test_ordered_categorical_time_col_accepted(self):
+        """Ordered categorical time dtype passes the ordered-time check."""
+        rng = np.random.default_rng(0)
+        G = 50
+        labels = ["pre1", "pre2", "post1", "post2"]
+        cat_dtype = pd.CategoricalDtype(categories=labels, ordered=True)
+        rows = []
+        d_post = rng.uniform(0.1, 1.0, G)
+        d_post[0] = 0.0
+        for g in range(G):
+            for label, dose in [
+                ("pre1", 0.0),
+                ("pre2", 0.0),
+                ("post1", d_post[g]),
+                ("post2", d_post[g]),
+            ]:
+                rows.append(
+                    {
+                        "unit": g,
+                        "period": label,
+                        "dose": dose,
+                        "outcome": rng.standard_normal(),
+                    }
+                )
+        panel = pd.DataFrame(rows)
+        panel["period"] = panel["period"].astype(cat_dtype)
+        # Should fit without raising the ordered-time error.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            result = HeterogeneousAdoptionDiD(design="auto").fit(
+                panel, "outcome", "dose", "period", "unit", aggregate="event_study"
+            )
+        # post1 is F; e=-2 (pre1) and e=0 (post1), e=1 (post2) expected.
+        assert result.F == "post1"
+
     def test_staggered_without_first_treat_col_rejected(self):
         """Multi-cohort panel without first_treat_col raises (not silent).