Fix CI review P0s: delta dose, placebo sign, sup-t calibration, l=1 consistency

igerber · claude · igerber · commit 7ed7fde5784b · 2026-04-12T13:05:53.000-04:00
- Fix cost-benefit delta to use cumulative dose (sum_{k=0}^{l-1} |D_{g,F_g+k} - D_{g,1}|)
  instead of one-period dose; binary weights now proportional to l * N_l
- Flip dynamic placebo sign to ref-minus-preperiod (Y_{ref} - Y_{backward}),
  matching the Phase 1 convention
- Include l=1 in sup-t bootstrap calibration so bands are truly simultaneous
  over all horizons 1..L_max
- Use per-group DID_{g,l} path for event_study_effects[1] when L_max >= 2,
  making all horizons use a consistent estimand
- Label overall_att as "delta" in summary/to_dataframe when L_max > 1
- Add A11 control-availability warnings for multi-horizon empty control pools

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -1030,6 +1030,21 @@ def fit(
                 T_g=T_g_arr,
                 L_max=L_max,
             )
+            # Surface A11 warnings from multi-horizon computation
+            mh_a11 = multi_horizon_dids.pop("_a11_warnings", None)
+            if mh_a11:
+                warnings.warn(
+                    f"Multi-horizon control-availability violations in "
+                    f"{len(mh_a11)} (group, horizon) pair(s): affected "
+                    f"DID_{{g,l}} values are zeroed but their switcher "
+                    f"counts are retained in N_l (matching the A11 "
+                    f"zero-retention convention). Examples: "
+                    + ", ".join(mh_a11[:3])
+                    + (f" (and {len(mh_a11) - 3} more)" if len(mh_a11) > 3 else ""),
+                    UserWarning,
+                    stacklevel=2,
+                )
+
             multi_horizon_if = _compute_per_group_if_multi_horizon(
                 D_mat=D_mat,
                 Y_mat=Y_mat,
@@ -1051,7 +1066,10 @@ def fit(
 
             multi_horizon_se = {}
             multi_horizon_inference = {}
-            for l_h in range(2, L_max + 1):
+            # Compute inference for ALL horizons 1..L_max (including l=1)
+            # so the event_study_effects dict uses a consistent estimand
+            # (per-group DID_{g,l}) across all horizons.
+            for l_h in range(1, L_max + 1):
                 U_l = multi_horizon_if[l_h]
                 # Cohort IDs for this horizon: (D_{g,1}, F_g, S_g) triples
                 # are the same as Phase 1 (cohort identity depends on first
@@ -1315,7 +1333,12 @@ def fit(
                     [g not in singleton_baseline_set_b for g in all_groups], dtype=bool
                 )
                 mh_boot_inputs = {}
-                for l_h in range(2, L_max + 1):
+                # Include ALL horizons 1..L_max so the sup-t critical
+                # value is calibrated over the same set that receives
+                # cband_conf_int. For l=1, use the per-group IF (not
+                # the Phase 1 per-period IF) so the bootstrap matches
+                # the event_study_effects[1] estimand.
+                for l_h in range(1, L_max + 1):
                     h_data = multi_horizon_dids.get(l_h)
                     if h_data is None or h_data["N_l"] == 0:
                         continue
@@ -1400,22 +1423,24 @@ def fit(
         # ------------------------------------------------------------------
         # Step 20: Build the results dataclass
         # ------------------------------------------------------------------
-        # event_study_effects: l=1 always mirrors the Phase 1 DID_M output.
-        # When L_max >= 2, horizons 2..L_max are populated from the Phase 2
-        # multi-horizon computation.
-        event_study_effects: Dict[int, Dict[str, Any]] = {
-            1: {
-                "effect": overall_att,
-                "se": overall_se,
-                "t_stat": overall_t,
-                "p_value": overall_p,
-                "conf_int": overall_ci,
-                "n_obs": N_S,
+        # event_study_effects: when L_max is None, l=1 mirrors Phase 1
+        # DID_M (per-period path). When L_max >= 2, ALL horizons including
+        # l=1 use the per-group DID_{g,l} path for a consistent estimand.
+        if multi_horizon_inference is not None and 1 in multi_horizon_inference:
+            # Phase 2 mode: use per-group path for all horizons
+            event_study_effects: Dict[int, Dict[str, Any]] = dict(multi_horizon_inference)
+        else:
+            # Phase 1 mode (L_max=None): l=1 from per-period path
+            event_study_effects = {
+                1: {
+                    "effect": overall_att,
+                    "se": overall_se,
+                    "t_stat": overall_t,
+                    "p_value": overall_p,
+                    "conf_int": overall_ci,
+                    "n_obs": N_S,
+                }
             }
-        }
-        if multi_horizon_inference is not None:
-            for l_h, inf_dict in multi_horizon_inference.items():
-                event_study_effects[l_h] = inf_dict
 
         # Phase 2: propagate bootstrap results to event_study_effects
         if bootstrap_results is not None and bootstrap_results.event_study_ses:
@@ -1514,7 +1539,7 @@ def fit(
                 denom = n_data["denominator"]
                 eff = n_data["effect"]
                 # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l
-                se_did_l = multi_horizon_se.get(l_h, float("nan")) if l_h >= 2 else overall_se
+                se_did_l = multi_horizon_se.get(l_h, float("nan"))
                 se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan")
                 t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=None)
                 normalized_effects_out[l_h] = {
@@ -2119,6 +2144,7 @@ def _compute_multi_horizon_dids(
         baseline_f[int(d)] = first_switch_idx[mask]
 
     results: Dict[int, Dict[str, Any]] = {}
+    a11_multi_warnings: List[str] = []
     N_1 = 0  # will be set at l=1 for switcher_fraction
 
     for l in range(1, L_max + 1):  # noqa: E741
@@ -2187,6 +2213,10 @@ def _compute_multi_horizon_dids(
                 # matching the A11 zero-retention convention: the group's
                 # switcher count is still in N_l.
                 did_g_l[g] = 0.0
+                a11_multi_warnings.append(
+                    f"horizon {l}, group_idx {g}: "
+                    f"no baseline-matched controls at outcome period"
+                )
                 continue
 
             ctrl_changes = Y_mat[ctrl_pool, out_idx] - Y_mat[ctrl_pool, ref_idx]
@@ -2206,6 +2236,10 @@ def _compute_multi_horizon_dids(
             "switcher_fraction": N_l / N_1 if N_1 > 0 else float("nan"),
         }
 
+    # Attach A11 warnings to the results for the caller to surface
+    if a11_multi_warnings:
+        results["_a11_warnings"] = a11_multi_warnings  # type: ignore[assignment]
+
     return results
 
 
@@ -2393,8 +2427,9 @@ def _compute_multi_horizon_placebos(
             forward_idx = ref_idx + l
             d_base = int(baselines[g])
 
-            # Switcher's backward outcome change
-            switcher_change = Y_mat[g, backward_idx] - Y_mat[g, ref_idx]
+            # Switcher's backward outcome change: reference minus pre-period
+            # (matching Phase 1 convention: Y_{ref} - Y_{earlier})
+            switcher_change = Y_mat[g, ref_idx] - Y_mat[g, backward_idx]
 
             # Control pool: same baseline, not switched by forward_idx
             ctrl_indices = baseline_groups[d_base]
@@ -2410,7 +2445,7 @@ def _compute_multi_horizon_placebos(
                 pl_g_l[g] = 0.0
                 continue
 
-            ctrl_changes = Y_mat[ctrl_pool, backward_idx] - Y_mat[ctrl_pool, ref_idx]
+            ctrl_changes = Y_mat[ctrl_pool, ref_idx] - Y_mat[ctrl_pool, backward_idx]
             ctrl_avg = float(ctrl_changes.mean())
             pl_g_l[g] = switcher_change - ctrl_avg
 
@@ -2522,9 +2557,14 @@ def _compute_cost_benefit_delta(
         dose_l = 0.0
         for g in np.where(eligible)[0]:
             f_g = first_switch_idx[g]
-            col = f_g - 1 + l
-            if col < D_mat.shape[1]:
-                dose_l += abs(float(D_mat[g, col] - baselines[g]))
+            # Cumulative dose: delta^D_{g,l} = sum_{k=0}^{l-1} |D_{g,F_g+k} - D_{g,1}|
+            # For binary treatment this equals l (each period contributes 1).
+            cum_dose = 0.0
+            for k in range(l):
+                col_k = f_g + k
+                if col_k < D_mat.shape[1]:
+                    cum_dose += abs(float(D_mat[g, col_k] - baselines[g]))
+            dose_l += cum_dose
         per_horizon_dose[l] = dose_l
         total_dose += dose_l
 
@@ -2572,9 +2612,12 @@ def _compute_cost_benefit_delta(
                     if switch_direction[g] != direction:
                         continue
                     f_g = first_switch_idx[g]
-                    col = f_g - 1 + l
-                    if col < D_mat.shape[1]:
-                        dose_l += abs(float(D_mat[g, col] - baselines[g]))
+                    cum_dose = 0.0
+                    for k in range(l):
+                        col_k = f_g + k
+                        if col_k < D_mat.shape[1]:
+                            cum_dose += abs(float(D_mat[g, col_k] - baselines[g]))
+                    dose_l += cum_dose
                 dir_horizon_dose[l] = dose_l
                 dir_dose += dose_l
 
diff --git a/diff_diff/chaisemartin_dhaultfoeuille_results.py b/diff_diff/chaisemartin_dhaultfoeuille_results.py
@@ -505,16 +505,22 @@ def summary(self, alpha: Optional[float] = None) -> str:
                 ]
             )
 
-        # --- Overall DID_M ---
+        # --- Overall ---
+        overall_label = (
+            "Cost-Benefit Delta"
+            if self.L_max is not None and self.L_max >= 2
+            else "DID_M (Contemporaneous-Switch ATT)"
+        )
+        overall_row_label = "delta" if self.L_max is not None and self.L_max >= 2 else "DID_M"
         lines.extend(
             [
                 thin,
-                "DID_M (Contemporaneous-Switch ATT)".center(width),
+                overall_label.center(width),
                 thin,
                 header_row,
                 thin,
                 _format_inference_row(
-                    "DID_M",
+                    overall_row_label,
                     self.overall_att,
                     self.overall_se,
                     self.overall_t_stat,
@@ -772,7 +778,9 @@ def to_dataframe(self, level: str = "overall") -> pd.DataFrame:
             return pd.DataFrame(
                 [
                     {
-                        "estimand": "DID_M",
+                        "estimand": (
+                            "delta" if self.L_max is not None and self.L_max >= 2 else "DID_M"
+                        ),
                         "effect": self.overall_att,
                         "se": self.overall_se,
                         "t_stat": self.overall_t_stat,
diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py
@@ -1836,12 +1836,12 @@ def test_L_max_populates_event_study_effects(self, data):
             assert "n_obs" in entry
             assert entry["n_obs"] > 0
 
-    def test_did_l_equals_did_m_at_l1(self, data):
-        """event_study_effects[1] must equal DID_M from Phase 1."""
+    def test_did_l1_uses_per_group_path_when_L_max(self, data):
+        """When L_max >= 2, event_study_effects[1] uses the per-group
+        DID_{g,1} path (consistent with horizons 2..L_max), which may
+        differ from the Phase 1 per-period DID_M. The per-period DID_M
+        is still available via the L_max=None path."""
         est = ChaisemartinDHaultfoeuille(placebo=False, twfe_diagnostic=False)
-        r_none = est.fit(
-            data, outcome="outcome", group="group", time="period", treatment="treatment"
-        )
         r_multi = est.fit(
             data,
             outcome="outcome",
@@ -1850,7 +1850,9 @@ def test_did_l_equals_did_m_at_l1(self, data):
             treatment="treatment",
             L_max=3,
         )
-        assert r_multi.event_study_effects[1]["effect"] == pytest.approx(r_none.overall_att)
+        # event_study_effects[1] is populated and finite
+        assert np.isfinite(r_multi.event_study_effects[1]["effect"])
+        assert np.isfinite(r_multi.event_study_effects[1]["se"])
 
     def test_N_l_decreases_with_horizon(self, data):
         """n_obs generally decreases for far horizons."""
diff --git a/tests/test_chaisemartin_dhaultfoeuille_parity.py b/tests/test_chaisemartin_dhaultfoeuille_parity.py
@@ -269,12 +269,16 @@ def test_parity_leavers_only_multi_horizon(self, golden_values):
 
     def test_parity_mixed_single_switch_multi_horizon(self, golden_values):
         self._check_multi_horizon(
-            golden_values, "mixed_single_switch_multi_horizon",
-            L_max=5, rtol=self.MIXED_POINT_RTOL,
+            golden_values,
+            "mixed_single_switch_multi_horizon",
+            L_max=5,
+            rtol=self.MIXED_POINT_RTOL,
         )
 
     def test_parity_joiners_only_long_multi_horizon(self, golden_values):
         self._check_multi_horizon(
-            golden_values, "joiners_only_long_multi_horizon",
-            L_max=5, rtol=self.POINT_RTOL,
+            golden_values,
+            "joiners_only_long_multi_horizon",
+            L_max=5,
+            rtol=self.POINT_RTOL,
         )