Address AI review P1/P2 findings for Phase 3 PR B

igerber · claude · igerber · commit f580dae01006 · 2026-04-13T12:43:08.000-04:00
P1 fixes:
- DID^X residualization no longer leaks into per-period path:
  per_period_effects uses raw Y_mat, only multi-horizon path
  sees residualized outcomes
- Added to_dataframe levels for heterogeneity and linear_trends

P2 fixes:
- Covariate coercion no longer mutates caller's DataFrame
- Vectorized residualization (einsum replaces nested loop)
- Heterogeneity test guards against rank-deficient OLS
- Added estimand contract test for controls + L_max=1
- REGISTRY note clarifies per_period_effects stays unadjusted

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -618,21 +618,26 @@ def fit(
                     f"Control column(s) {missing_controls!r} not found in "
                     f"data. Available columns: {list(data.columns)}"
                 )
+            # Work on a copy to avoid mutating the caller's DataFrame
+            data_controls = data[controls].copy()
             for c in controls:
                 try:
-                    data[c] = pd.to_numeric(data[c])
+                    data_controls[c] = pd.to_numeric(data_controls[c])
                 except (ValueError, TypeError) as exc:
                     raise ValueError(
                         f"Could not coerce control column {c!r} to numeric: {exc}"
                     ) from exc
-                n_nan = int(data[c].isna().sum())
+                n_nan = int(data_controls[c].isna().sum())
                 if n_nan > 0:
                     raise ValueError(
                         f"Control column {c!r} contains {n_nan} NaN value(s). "
                         "Drop or impute missing covariates before fitting."
                     )
-            # Aggregate covariates to cell means (same groupby as treatment/outcome)
-            x_cell_agg = data.groupby([group, time], as_index=False)[controls].mean()
+            # Aggregate covariates to cell means (same groupby as treatment/outcome).
+            # Use the coerced copy joined with group/time from original data.
+            x_agg_input = data[[group, time]].copy()
+            x_agg_input[controls] = data_controls[controls].values
+            x_cell_agg = x_agg_input.groupby([group, time], as_index=False)[controls].mean()
             cell = cell.merge(x_cell_agg, on=[group, time], how="left")
 
         # ------------------------------------------------------------------
@@ -948,13 +953,19 @@ def fit(
             )
             _switch_metadata_computed = True
 
-            Y_mat, covariate_diagnostics = _compute_covariate_residualization(
+            Y_mat_residualized, covariate_diagnostics = _compute_covariate_residualization(
                 Y_mat=Y_mat,
                 X_cell=X_cell,
                 N_mat=N_mat,
                 baselines=baselines,
                 first_switch_idx=first_switch_idx_arr,
             )
+            # Keep raw Y_mat for the per-period DID path (which does not
+            # support covariate residualization - it uses binary joiner/leaver
+            # categorization). The residualized matrix is used only by the
+            # per-group multi-horizon path (L_max >= 1).
+            Y_mat_raw = Y_mat
+            Y_mat = Y_mat_residualized
 
         # ------------------------------------------------------------------
         # Step 7c: First-differencing for linear trends (DID^{fd})
@@ -1061,8 +1072,13 @@ def fit(
             a11_minus_zeroed_arr,
         ) = _compute_per_period_dids(
             D_mat=D_mat,
-            Y_mat=Y_mat,
-            N_mat=N_mat,
+            # Use raw (unadjusted) outcomes for per-period DID. Covariate
+            # residualization applies only to the per-group multi-horizon
+            # path (L_max >= 1). The per-period path uses binary
+            # joiner/leaver categorization and is not part of the DID^X
+            # contract (Web Appendix Section 1.2).
+            Y_mat=Y_mat_raw if controls is not None else Y_mat,
+            N_mat=N_mat_orig,
             periods=all_periods,
         )
         if a11_warnings:
@@ -1489,7 +1505,8 @@ def fit(
             U_centered_leavers,
         ) = _compute_cohort_recentered_inputs(
             D_mat=D_mat,
-            Y_mat=Y_mat,
+            # Phase 1 IF uses per-period structure: use raw outcomes
+            Y_mat=Y_mat_raw if controls is not None else Y_mat,
             N_mat=N_mat_orig,
             n_10_t_arr=n_10_t_arr,
             n_00_t_arr=n_00_t_arr,
@@ -2751,12 +2768,17 @@ def _compute_covariate_residualization(
         }
 
         # Residualize Y at levels for all groups with this baseline.
-        # Y_tilde[g, t] = Y[g, t] - X[g, t] @ theta_hat
+        # Vectorized level residualization: Y_tilde[g, t] = Y[g, t] - X[g, t] @ theta_hat
         group_indices = np.where(d_mask)[0]
-        for g in group_indices:
-            for t in range(n_periods):
-                if N_mat[g, t] > 0 and np.all(np.isfinite(X_cell[g, t])):
-                    Y_resid[g, t] = Y_mat[g, t] - float(X_cell[g, t] @ theta_hat)
+        if len(group_indices) > 0:
+            # X_sub: (n_d_groups, n_periods, n_covariates), theta: (n_covariates,)
+            X_sub = X_cell[group_indices]  # (n_d, T, K)
+            adjustment = np.einsum("gtk,k->gt", X_sub, theta_hat)  # (n_d, T)
+            # Mask: only adjust cells that are observed and have finite covariates
+            valid = (N_mat[group_indices] > 0) & np.all(np.isfinite(X_sub), axis=2)
+            Y_resid[group_indices] = np.where(
+                valid, Y_mat[group_indices] - adjustment, Y_mat[group_indices]
+            )
 
     return Y_resid, diagnostics
 
@@ -2902,14 +2924,29 @@ def _compute_heterogeneity_test(
         else:
             design = x_arr
 
+        # Guard: need more observations than parameters
+        n_params = design.shape[1]
+        if n_obs <= n_params:
+            results[l_h] = {
+                "beta": float("nan"), "se": float("nan"),
+                "t_stat": float("nan"), "p_value": float("nan"),
+                "conf_int": (float("nan"), float("nan")),
+                "n_obs": n_obs,
+            }
+            continue
+
         coefs, _residuals, vcov = solve_ols(
             design, dep_arr,
             return_vcov=True,
             rank_deficient_action="warn",
         )
 
         beta_het = float(coefs[0])
-        se_het = float(np.sqrt(vcov[0, 0])) if vcov is not None else float("nan")
+        # NaN-safe: if vcov is None or target coefficient variance is NaN
+        # (rank-deficient), all inference fields are NaN.
+        se_het = float("nan")
+        if vcov is not None and np.isfinite(vcov[0, 0]) and vcov[0, 0] > 0:
+            se_het = float(np.sqrt(vcov[0, 0]))
         t_stat, p_val, ci = safe_inference(beta_het, se_het, alpha=alpha, df=None)
 
         results[l_h] = {
diff --git a/diff_diff/chaisemartin_dhaultfoeuille_results.py b/diff_diff/chaisemartin_dhaultfoeuille_results.py
@@ -1029,6 +1029,28 @@ def to_dataframe(self, level: str = "overall") -> pd.DataFrame:
                 )
             return self.twfe_weights.copy()
 
+        elif level == "heterogeneity":
+            if self.heterogeneity_effects is None:
+                raise ValueError(
+                    "Heterogeneity test results not available. Pass "
+                    "heterogeneity='column_name' to fit()."
+                )
+            rows = []
+            for h, data in sorted(self.heterogeneity_effects.items()):
+                rows.append({"horizon": h, **data})
+            return pd.DataFrame(rows)
+
+        elif level == "linear_trends":
+            if self.linear_trends_effects is None:
+                raise ValueError(
+                    "Linear trends effects not available. Pass "
+                    "trends_linear=True to fit()."
+                )
+            rows = []
+            for h, data in sorted(self.linear_trends_effects.items()):
+                rows.append({"horizon": h, **data})
+            return pd.DataFrame(rows)
+
         else:
             raise ValueError(
                 f"Unknown level: {level!r}. Use 'overall', 'joiners_leavers', "
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -609,7 +609,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param
 
 - **Note (deviation from R DIDmultiplegtDYN):** Phase 1 requires panels with a **balanced baseline** (every group observed at the first global period) and **no interior period gaps**. The Step 5b validation in `fit()` enforces this contract: groups missing the baseline raise `ValueError`; groups with interior gaps are dropped with a `UserWarning`; groups with **terminal missingness** (early exit / right-censoring — observed at the baseline but missing one or more later periods) are retained and contribute from their observed periods only. R `DIDmultiplegtDYN` accepts unbalanced panels with documented missing-treatment-before-first-switch handling. Python's restriction is a Phase 1 limitation: the cohort enumeration uses `D_{g,1}` as the canonical baseline (so the baseline observation must exist) and the first-switch detection walks adjacent observed periods (so interior gaps create ambiguous transition counts). Terminal missingness is supported because the per-period `present = (N_mat[:, t] > 0) & (N_mat[:, t-1] > 0)` guard appears at three sites in the variance computation (`_compute_per_period_dids`, `_compute_full_per_group_contributions`, `_compute_cohort_recentered_inputs`) and cleanly masks out missing transitions without propagating NaN into the arithmetic. **Workaround for unbalanced panels:** pre-process your data to back-fill the baseline (or drop late-entry groups before fitting), or use R `DIDmultiplegtDYN` until a future phase lifts the restriction. The Step 5b `ValueError` and `UserWarning` messages name the offending group IDs so you can locate them quickly.
 
-- **Note (Phase 3 DID^X covariate adjustment):** Implements the residualization-style covariate adjustment from Web Appendix Section 1.2 (Assumption 11). For each baseline treatment value `d`, estimates `theta_hat_d` via OLS of first-differenced outcomes on first-differenced covariates with time FEs, restricted to not-yet-treated observations. Residualizes at levels: `Y_tilde[g,t] = Y[g,t] - X[g,t] @ theta_hat_d`. All downstream DID computations use residualized outcomes. This is NOT doubly-robust, NOT IPW, NOT Callaway-Sant'Anna-style. Plug-in IF (treating `theta_hat` as fixed) is valid by FWL theorem. Requires `L_max >= 1`. Activated via `controls=["col1", "col2"]` in `fit()`.
+- **Note (Phase 3 DID^X covariate adjustment):** When `controls` is set, `per_period_effects` (the Phase 1 per-period DID_M decomposition) remains **unadjusted** (computed on raw outcomes). The covariate residualization applies only to the per-group `DID_{g,l}` path (`L_max >= 1`), which produces `event_study_effects` and `overall_att`. This means `per_period_effects` and `event_study_effects[1]` may diverge when controls are active - by design (the per-period path uses binary joiner/leaver categorization and is not part of the DID^X contract). Implements the residualization-style covariate adjustment from Web Appendix Section 1.2 (Assumption 11). For each baseline treatment value `d`, estimates `theta_hat_d` via OLS of first-differenced outcomes on first-differenced covariates with time FEs, restricted to not-yet-treated observations. Residualizes at levels: `Y_tilde[g,t] = Y[g,t] - X[g,t] @ theta_hat_d`. All downstream per-group `DID_{g,l}` computations (the `L_max >= 1` multi-horizon path) use residualized outcomes; the per-period path does not. This is NOT doubly-robust, NOT IPW, NOT Callaway-Sant'Anna-style. Plug-in IF (treating `theta_hat` as fixed) is valid by FWL theorem. Requires `L_max >= 1`. Activated via `controls=["col1", "col2"]` in `fit()`.
 
 - **Note (Phase 3 DID^{fd} linear trends):** Implements group-specific linear trends from Web Appendix Section 1.3 (Assumption 12, Lemma 6). Uses the Z_mat transformation: `Z[g,t] = Y[g,t] - Y[g,t-1]` (first-differenced outcomes). Since `DID_{g,l}(Z) = DID^{fd}_{g,l}` algebraically, the existing multi-horizon DID code produces trend-adjusted estimates when fed Z_mat. Requires F_g >= 3 (at least 2 pre-switch periods); groups with F_g < 3 are excluded with a `UserWarning`. Cumulated level effects `delta^{fd}_l = sum_{l'=1}^l DID^{fd}_{l'}` stored in `results.linear_trends_effects`. Cumulated SE uses conservative upper bound (sum of per-horizon SEs); cross-horizon covariance from IF vectors is a library extension (paper proves Theorem 1 per-horizon, not cross-horizon). When combined with DID^X, residualization is applied first, then first-differencing (per paper assumption ordering). Activated via `trends_linear=True` in `fit()`.
 
diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py
@@ -2421,6 +2421,36 @@ def test_controls_with_multi_horizon(self):
             assert np.isfinite(r.event_study_effects[h]["effect"])
             assert np.isfinite(r.event_study_effects[h]["se"])
 
+    def test_controls_lmax1_estimand_contract(self):
+        """DID^X with L_max=1: per_period_effects stay raw, overall uses DID^X_1."""
+        df = self._make_panel_with_covariates()
+        est = ChaisemartinDHaultfoeuille(seed=1)
+
+        # Fit without controls for raw per-period baseline
+        r_raw = est.fit(df, "outcome", "group", "period", "treatment")
+        # Fit with controls
+        r_x = est.fit(
+            df, "outcome", "group", "period", "treatment",
+            controls=["X1"], L_max=1,
+        )
+
+        # per_period_effects should be UNADJUSTED (raw Phase 1 DID_M)
+        # because the per-period path does not support covariate adjustment
+        for period_key in r_raw.per_period_effects:
+            if period_key in r_x.per_period_effects:
+                raw_eff = r_raw.per_period_effects[period_key]
+                x_eff = r_x.per_period_effects[period_key]
+                assert raw_eff["did_plus_t"] == pytest.approx(
+                    x_eff["did_plus_t"], abs=1e-10
+                ), f"per_period_effects should be unadjusted at period {period_key}"
+
+        # overall_att should come from event_study_effects[1] (DID^X_1)
+        assert r_x.overall_att == pytest.approx(
+            r_x.event_study_effects[1]["effect"], abs=1e-10
+        )
+        # and should differ from the raw overall_att (covariate effect)
+        assert r_x.overall_att != r_raw.overall_att
+
 
 class TestLinearTrends:
     """DID^{fd} group-specific linear trends (ROADMAP item 3b)."""