Address PR #350 CI review round 3: P0 chronological cohort sort + P3 docstrings

igerber · claude · igerber · commit cb11a1fdee02 · 2026-04-22T08:33:33.000-04:00
**P0 (cohort sort key):** `_validate_had_panel_event_study` sorted
first_treat_col values with raw Python `(x is None, x)` while `time_col`
was already required to be ordered (numeric/datetime/ordered
categorical). On ordered-categorical staggered panels where
chronological order differs from lexicographic order, `F_last =
cohorts[-1]` silently picked the lexicographically latest cohort,
not the chronologically latest. That keeps the wrong cohort and
returns event-study estimates for the wrong estimand.

Fix: Promoted the dtype-aware `_sort_key` (ordered-categorical uses
declared category index; numeric/datetime use natural order) to the
top of the validator, just after the time-dtype check. Cohort
sorting, pre/post period sorting, contiguity check, and the
staggered-without-first_treat detection all now share this single
`_sort_key`. Removed the duplicate `_sort_key` definition that was
sitting further down in the same function.

**P3 (stale docstrings):**
- `fit()` no longer opens with "two-period panel"; now describes both
  aggregation modes with links to the respective result classes.
- `HeterogeneousAdoptionDiDEventStudyResults.n_units` docstring no
  longer says "only last-cohort units"; it now states that last-cohort
  units PLUS never-treated units are retained.

**Test added:** `test_staggered_ordered_categorical_chooses_chronological_last`
uses categories `["q1", "q2", "q3", "q10"]` where lex max of the two
cohorts (`"q2", "q10"`) is `"q2"` but chronological last is `"q10"`;
asserts the fix picks `"q10"` as `F_last` and retains only the
q10-cohort units.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/had.py b/diff_diff/had.py
@@ -446,7 +446,9 @@ class HeterogeneousAdoptionDiDEventStudyResults:
         fitted data.
     n_units : int
         Number of unique units contributing to the fit. After staggered
-        auto-filter: only last-cohort units.
+        auto-filter: last-cohort units PLUS never-treated (``first_treat = 0``)
+        units retained as the untreated-group comparison per paper
+        Appendix B.2. Only earlier-treated cohorts are dropped.
     inference_method : str
         ``"analytical_nonparametric"`` (continuous designs) or
         ``"analytical_2sls"`` (mass-point). Shared across horizons.
@@ -933,6 +935,25 @@ def _validate_had_panel_event_study(
             f"before calling fit() with aggregate='event_study'."
         )
 
+    # Construct the chronological sort key once, shared across every
+    # downstream ordering: cohort ranking, pre/post period sorting, and
+    # contiguity checks. Ordered categoricals use their declared
+    # category index (``list(categorical)`` strips the ordering and
+    # falls back to string comparison); numeric / datetime use natural
+    # Python order. ``_aggregate_multi_period_first_differences`` is
+    # not a caller of this key: it builds an equivalent one on its own
+    # from the same ``time_dtype``, so keep the two in sync.
+    if isinstance(time_dtype, pd.CategoricalDtype) and time_dtype.ordered:
+        _cat_order = {c: i for i, c in enumerate(time_dtype.categories)}
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, _cat_order.get(x, len(_cat_order)))
+
+    else:
+
+        def _sort_key(x: Any) -> Tuple[bool, Any]:
+            return (x is None, x)
+
     # NaN checks on key columns (before any filter).
     for col in [outcome_col, dose_col, unit_col]:
         if bool(data[col].isna().any()):
@@ -1005,12 +1026,17 @@ def _validate_had_panel_event_study(
                 f"to equal each unit's first positive-dose period (or 0 "
                 f"for never-treated) before calling fit()."
             )
-        # Identify cohorts (nonzero first_treat values).
-        # Use pd.unique to preserve dtype; sort with a stable key.
+        # Identify cohorts (nonzero first_treat values). Sort using
+        # ``_sort_key`` (chronological order from ``time_dtype``), NOT
+        # raw Python sort: first_treat values are period labels and
+        # must rank chronologically so ``F_last = cohorts[-1]`` is the
+        # chronologically latest cohort. Under ordered-categorical time
+        # labels (e.g. month names), raw Python sort is lexicographic
+        # and would silently pick the wrong ``F_last``.
         ft_unique = list(pd.unique(ft_raw))
         cohorts = sorted(
             [v for v in ft_unique if v != 0 and not (isinstance(v, float) and np.isnan(v))],
-            key=lambda x: (x is None, x),
+            key=_sort_key,
         )
         if len(cohorts) == 0:
             raise ValueError(
@@ -1127,22 +1153,9 @@ def _validate_had_panel_event_study(
             f"zero dose; there is no treatment to estimate."
         )
 
-    # Sort by natural ordering on the time column dtype. For ordered
-    # categorical dtypes, use the declared category order (since
-    # ``list(categorical)`` strips the ordered semantics and falls back
-    # to string comparison). For numeric / datetime, use natural Python
-    # order. Tuple key places None at the end.
-    if isinstance(time_dtype, pd.CategoricalDtype) and time_dtype.ordered:
-        _cat_order = {c: i for i, c in enumerate(time_dtype.categories)}
-
-        def _sort_key(x: Any) -> Tuple[bool, Any]:
-            return (x is None, _cat_order.get(x, len(_cat_order)))
-
-    else:
-
-        def _sort_key(x: Any) -> Tuple[bool, Any]:
-            return (x is None, x)
-
+    # Sort using the same ``_sort_key`` already constructed for cohorts
+    # (ordered-categorical uses declared category order; numeric /
+    # datetime use natural Python order).
     t_pre_list = sorted(t_pre_list_unsorted, key=_sort_key)
     t_post_list = sorted(t_post_list_unsorted, key=_sort_key)
 
@@ -1203,10 +1216,8 @@ def _sort_key(x: Any) -> Tuple[bool, Any]:
         first_pos_per_unit = df_sorted.loc[pos_mask_global].groupby(unit_col)[time_col].first()
         cohort_labels = list(first_pos_per_unit.unique())
         if len(cohort_labels) > 1:
-            try:
-                distinct_cohorts = sorted(cohort_labels, key=lambda x: (x is None, x))
-            except TypeError:
-                distinct_cohorts = list(cohort_labels)
+            # Sort chronologically via the validated time-column order.
+            distinct_cohorts = sorted(cohort_labels, key=_sort_key)
             raise ValueError(
                 f"Staggered-timing panel detected (first_treat_col is "
                 f"None): {len(distinct_cohorts)} distinct first-positive-"
@@ -1940,7 +1951,15 @@ def fit(
         survey: Any = None,
         weights: Optional[np.ndarray] = None,
     ) -> HeterogeneousAdoptionDiDResults:
-        """Fit the HAD estimator on a two-period panel.
+        """Fit the HAD estimator.
+
+        ``aggregate="overall"`` (default) fits on a two-period panel and
+        returns a :class:`HeterogeneousAdoptionDiDResults` with the
+        single-period WAS estimate. ``aggregate="event_study"`` fits on
+        a multi-period panel (``T > 2``) and returns a
+        :class:`HeterogeneousAdoptionDiDEventStudyResults` with per-
+        event-time WAS estimates using a uniform ``F-1`` anchor (paper
+        Appendix B.2).
 
         Both the overall and event-study paths are **panel-only**: the paper
         (Section 2) defines HAD on panel or repeated-cross-section data,
diff --git a/tests/test_had.py b/tests/test_had.py
@@ -2727,6 +2727,70 @@ def test_time_varying_post_F_dose_rejected(self):
                 panel, "outcome", "dose", "period", "unit", aggregate="event_study"
             )
 
+    def test_staggered_ordered_categorical_chooses_chronological_last(self):
+        """Staggered filter uses chronological (not lexicographic) last.
+
+        Constructs an ordered-categorical time column where lexicographic
+        and chronological orderings disagree. With category order
+        ``["q1", "q2", "q3", "q10"]``, chronological last is ``"q10"``
+        but lexicographic last is ``"q3"``. If cohorts are ``{"q2", "q10"}``,
+        a raw-sort implementation would pick ``F_last = "q2"`` (lex-max
+        of the two strings); the fixed version must pick ``F_last = "q10"``.
+
+        Covers CI reviewer round 3 P0: cohort sorting must use
+        chronological order from ``time_dtype``, not raw Python sort.
+        """
+        rng = np.random.default_rng(0)
+        G = 80
+        periods = ["q1", "q2", "q3", "q10"]
+        cat_dtype = pd.CategoricalDtype(categories=periods, ordered=True)
+        # Half of units treated at q2 (cohort 1), half at q10 (cohort 2).
+        rows = []
+        for g in range(G):
+            F_g = "q2" if g < G // 2 else "q10"
+            d_g = float(rng.uniform(0.1, 1.0))
+            for p in periods:
+                # Dose = d_g once the period >= F_g in chronological order.
+                chrono_g = periods.index(F_g)
+                chrono_p = periods.index(p)
+                dose = d_g if chrono_p >= chrono_g else 0.0
+                rows.append(
+                    {
+                        "unit": g,
+                        "period": p,
+                        "dose": dose,
+                        "outcome": rng.standard_normal(),
+                        "first_treat": F_g,
+                    }
+                )
+        panel = pd.DataFrame(rows)
+        panel["period"] = panel["period"].astype(cat_dtype)
+        panel["first_treat"] = panel["first_treat"].astype(cat_dtype)
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            result = HeterogeneousAdoptionDiD(design="auto").fit(
+                panel,
+                "outcome",
+                "dose",
+                "period",
+                "unit",
+                first_treat_col="first_treat",
+                aggregate="event_study",
+            )
+
+        # Chronological last cohort = "q10", not lexicographic last ("q3"
+        # is not even a cohort here; lex last of the two cohorts would
+        # be "q2" since "q10" < "q2" lexicographically).
+        assert result.filter_info is not None
+        assert result.filter_info["F_last"] == "q10"
+        assert result.F == "q10"
+        # q2-cohort units (G/2) are dropped; q10-cohort units (G/2)
+        # retained.
+        assert result.n_units == G // 2
+        # Dropped cohorts should list "q2".
+        assert "q2" in result.filter_info["dropped_cohorts"]
+
     def test_first_treat_col_mismatch_with_dose_raises(self):
         """first_treat_col disagreeing with observed dose path must raise.