Fix M2 gradient scaling: use np.sum instead of np.mean over control subsets

igerber · claude · igerber · commit c0c7e5a591ad · 2026-03-30T16:01:25.000-04:00
The M2 gradient terms in PS nuisance corrections used np.mean() over
control subsets, introducing an extra 1/n_c divisor. R's DRDID computes
M2 as colMeans() over the full n-sample (zeros for treated), then divides
by mean(w.cont) — the n's cancel, giving sum(w*resid*X)/sum(w). With
our Hajek-normalized weights (w_norm = w/sum(w)), np.sum(w_norm*resid*X)
directly yields sum(w*resid*X)/sum(w), matching R after cancellation.
The single /n on the correction line remains as the psi-to-phi conversion.

Applied at all 5 PS correction sites (panel survey IPW/DR, panel
non-survey DR, RCS IPW, RCS DR).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py
@@ -2072,8 +2072,10 @@ def _ipw_estimation(
                 asy_lin_rep_psi = score_ps @ H_psi_inv
 
                 att_control_weighted = np.sum(weights_control_norm * control_change)
-                # R: M2 = colMeans(w.cont * (y - att) * X)
-                M2 = np.mean(
+                # R: M2 = colMeans(w.cont * (y - att) * X) / mean(w.cont)
+                # np.sum (not mean): subset sum with normalized weights matches
+                # R's full-sample colMeans/mean(w) after cancellation
+                M2 = np.sum(
                     (weights_control_norm * (control_change - att_control_weighted))[:, None]
                     * X_all_int[n_t:],
                     axis=0,
@@ -2331,7 +2333,7 @@ def _doubly_robust(
                     asy_lin_rep_psi = score_ps @ H_psi_inv
 
                     dr_resid_control = m_control - control_change
-                    M2_dr = np.mean(
+                    M2_dr = np.sum(
                         ((weights_control / sw_t_sum) * dr_resid_control)[:, None]
                         * X_all_int[n_t:],
                         axis=0,
@@ -2394,7 +2396,7 @@ def _doubly_robust(
                     asy_lin_rep_psi = score_ps @ H_psi_inv
 
                     dr_resid_control = m_control - control_change
-                    M2_dr = np.mean(
+                    M2_dr = np.sum(
                         ((weights_control / n_t) * dr_resid_control)[:, None] * X_all_int[n_t:],
                         axis=0,
                     )
@@ -3152,8 +3154,8 @@ def _ipw_estimation_rc(
         cs_slice = slice(n_gt + n_gs + n_ct, None)
 
         M2 = np.zeros(X_all_int.shape[1])
-        M2 += np.mean(ipw_resid_ct[:, None] * X_all_int[ct_slice], axis=0)
-        M2 -= np.mean(ipw_resid_cs[:, None] * X_all_int[cs_slice], axis=0)
+        M2 += np.sum(ipw_resid_ct[:, None] * X_all_int[ct_slice], axis=0)
+        M2 -= np.sum(ipw_resid_cs[:, None] * X_all_int[cs_slice], axis=0)
 
         # psi-scale correction, convert to phi
         inf_all = inf_all + (asy_lin_rep_psi @ M2) / n_all
@@ -3469,12 +3471,12 @@ def _doubly_robust_rc(
 
         M2 = np.zeros(X_all_int.shape[1])
         if sum_w_ipw_ct > 0:
-            M2 -= np.mean(
+            M2 -= np.sum(
                 ((w_ipw_ct * dr_resid_ct / sum_w_ipw_ct)[:, None] * X_all_int[ct_slice]),
                 axis=0,
             )
         if sum_w_ipw_cs > 0:
-            M2 += np.mean(
+            M2 += np.sum(
                 ((w_ipw_cs * dr_resid_cs / sum_w_ipw_cs)[:, None] * X_all_int[cs_slice]),
                 axis=0,
             )