Address PR #113 Round 4 feedback: enforce simultaneous adoption and fix NaN handling

igerber · claude · igerber · commit a7a115de6523 · 2026-01-26T08:43:33.000-05:00
- Add staggered adoption check in _fit_joint() that raises ValueError
  when units are first treated at different periods
- Fix Rust solve_joint NaN weight masking: observations with NaN outcomes
  now get zero effective weight instead of having values imputed to 0.0
- Fix Rust average_treated initialization: use NaN instead of 0.0 so
  periods with all-NaN treated data are excluded from unit distance
- Update methodology registry to reflect enforced simultaneous adoption
- Add test_joint_rejects_staggered_adoption test

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/trop.py b/diff_diff/trop.py
@@ -1318,6 +1318,21 @@ def _fit_joint(
         if n_pre_periods < 2:
             raise ValueError("Need at least 2 pre-treatment periods")
 
+        # Check for staggered adoption (joint method requires simultaneous treatment)
+        first_treat_by_unit = []
+        for i in treated_unit_idx:
+            treated_periods_i = np.where(D[:, i] == 1)[0]
+            if len(treated_periods_i) > 0:
+                first_treat_by_unit.append(treated_periods_i[0])
+
+        unique_starts = sorted(set(first_treat_by_unit))
+        if len(unique_starts) > 1:
+            raise ValueError(
+                f"method='joint' requires simultaneous treatment adoption, but your data "
+                f"shows staggered adoption (units first treated at periods {unique_starts}). "
+                f"Use method='twostep' which properly handles staggered adoption designs."
+            )
+
         # LOOCV grid search for tuning parameters
         # Use Rust backend when available for parallel LOOCV (5-10x speedup)
         best_lambda = None
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -636,9 +636,11 @@ For joint method, LOOCV works as follows:
 - Faster computation for large panels
 
 **Assumptions**:
-- **Simultaneous adoption**: Bootstrap and jackknife variance estimation assume fixed
-  `treated_periods` across all resamples. Treatment timing is inferred once from the
-  data and held constant. For staggered adoption designs, use `method="twostep"`.
+- **Simultaneous adoption (enforced)**: The joint method requires all treated units
+  to receive treatment at the same time. A `ValueError` is raised if staggered
+  adoption is detected (units first treated at different periods). Treatment timing is
+  inferred once and held constant for bootstrap/jackknife variance estimation.
+  For staggered adoption designs, use `method="twostep"`.
 
 **Reference**: Adapted from reference implementation. See also Athey et al. (2025).
 
diff --git a/rust/src/trop.rs b/rust/src/trop.rs
@@ -1075,7 +1075,8 @@ fn compute_joint_weights(
     let n_pre = n_periods.saturating_sub(treated_periods);
 
     // Compute average treated trajectory
-    let mut average_treated = Array1::<f64>::zeros(n_periods);
+    // Initialize to NaN so periods with all-NaN treated data stay NaN (excluded from RMSE)
+    let mut average_treated = Array1::<f64>::from_elem(n_periods, f64::NAN);
     if !treated_unit_idx.is_empty() {
         for t in 0..n_periods {
             let mut sum = 0.0;
@@ -1089,6 +1090,7 @@ fn compute_joint_weights(
             if count > 0 {
                 average_treated[t] = sum / count as f64;
             }
+            // If count == 0, average_treated[t] stays NaN (correctly excluded)
         }
     }
 
@@ -1163,7 +1165,8 @@ fn solve_joint_no_lowrank(
 
     for t in 0..n_periods {
         for i in 0..n_units {
-            let w = delta[[t, i]];
+            // NaN outcomes get zero weight (not imputed to 0.0 with active weight)
+            let w = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
             let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
 
             sum_w += w;
@@ -1196,7 +1199,8 @@ fn solve_joint_no_lowrank(
             if sum_w_by_unit[i] > 1e-10 {
                 let mut num = 0.0;
                 for t in 0..n_periods {
-                    let w = delta[[t, i]];
+                    // NaN outcomes get zero weight
+                    let w = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
                     let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
                     num += w * (y_ti - mu - beta[t] - tau * d[[t, i]]);
                 }
@@ -1209,7 +1213,8 @@ fn solve_joint_no_lowrank(
             if sum_w_by_period[t] > 1e-10 {
                 let mut num = 0.0;
                 for i in 0..n_units {
-                    let w = delta[[t, i]];
+                    // NaN outcomes get zero weight
+                    let w = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
                     let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
                     num += w * (y_ti - mu - alpha[i] - tau * d[[t, i]]);
                 }
@@ -1222,7 +1227,8 @@ fn solve_joint_no_lowrank(
         let mut denom_tau = 0.0;
         for t in 0..n_periods {
             for i in 0..n_units {
-                let w = delta[[t, i]];
+                // NaN outcomes get zero weight
+                let w = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
                 let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
                 let d_ti = d[[t, i]];
                 if d_ti > 0.5 {  // Only treated observations contribute
@@ -1239,7 +1245,8 @@ fn solve_joint_no_lowrank(
         let mut num_mu = 0.0;
         for t in 0..n_periods {
             for i in 0..n_units {
-                let w = delta[[t, i]];
+                // NaN outcomes get zero weight
+                let w = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
                 let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
                 num_mu += w * (y_ti - alpha[i] - beta[t] - tau * d[[t, i]]);
             }
@@ -1279,21 +1286,20 @@ fn solve_joint_with_lowrank(
         let l_old = l.clone();
 
         // Step 1: Fix L, solve for (mu, alpha, beta, tau)
-        // Adjusted outcome: Y - L
+        // Adjusted outcome: Y - L (preserve NaN so solve_joint_no_lowrank masks weights)
         let y_adj = Array2::from_shape_fn((n_periods, n_units), |(t, i)| {
-            let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
-            y_ti - l[[t, i]]
+            y[[t, i]] - l[[t, i]]  // NaN - finite = NaN (preserves NaN info)
         });
 
         let (mu, alpha, beta, tau) = solve_joint_no_lowrank(&y_adj.view(), d, delta)?;
 
         // Step 2: Fix (mu, alpha, beta, tau), update L
-        // Residual: R = Y - mu - alpha - beta - tau*D
+        // Residual: R = Y - mu - alpha - beta - tau*D (preserve NaN)
         let mut r = Array2::<f64>::zeros((n_periods, n_units));
         for t in 0..n_periods {
             for i in 0..n_units {
-                let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
-                r[[t, i]] = y_ti - mu - alpha[i] - beta[t] - tau * d[[t, i]];
+                // NaN - finite = NaN (will be masked in gradient step)
+                r[[t, i]] = y[[t, i]] - mu - alpha[i] - beta[t] - tau * d[[t, i]];
             }
         }
 
@@ -1302,15 +1308,20 @@ fn solve_joint_with_lowrank(
         let eta = if delta_max > 0.0 { 1.0 / delta_max } else { 1.0 };
 
         // gradient_step = L + eta * delta * (R - L)
+        // NaN outcomes get zero weight so they don't affect gradient
         let mut gradient_step = Array2::<f64>::zeros((n_periods, n_units));
         for t in 0..n_periods {
             for i in 0..n_units {
+                // Mask delta for NaN outcomes
+                let delta_ti = if y[[t, i]].is_finite() { delta[[t, i]] } else { 0.0 };
                 let delta_norm = if delta_max > 0.0 {
-                    delta[[t, i]] / delta_max
+                    delta_ti / delta_max
                 } else {
-                    delta[[t, i]]
+                    delta_ti
                 };
-                gradient_step[[t, i]] = l[[t, i]] + delta_norm * (r[[t, i]] - l[[t, i]]);
+                // r[[t,i]] may be NaN, but delta_norm=0 for NaN obs, so contribution=0
+                let r_contrib = if r[[t, i]].is_finite() { r[[t, i]] } else { 0.0 };
+                gradient_step[[t, i]] = l[[t, i]] + delta_norm * (r_contrib - l[[t, i]]);
             }
         }
 
@@ -1324,10 +1335,9 @@ fn solve_joint_with_lowrank(
         }
     }
 
-    // Final solve with converged L
+    // Final solve with converged L (preserve NaN so solve_joint_no_lowrank masks weights)
     let y_adj = Array2::from_shape_fn((n_periods, n_units), |(t, i)| {
-        let y_ti = if y[[t, i]].is_finite() { y[[t, i]] } else { 0.0 };
-        y_ti - l[[t, i]]
+        y[[t, i]] - l[[t, i]]  // NaN - finite = NaN (preserves NaN info)
     });
     let (mu, alpha, beta, tau) = solve_joint_no_lowrank(&y_adj.view(), d, delta)?;
 
diff --git a/tests/test_trop.py b/tests/test_trop.py
@@ -3157,3 +3157,31 @@ def test_joint_unit_no_valid_pre_gets_zero_weight(self, simple_panel_data):
 
         assert np.isfinite(results.att), "ATT should be finite even with unit having no pre-period data"
         assert np.isfinite(results.se), "SE should be finite"
+
+    def test_joint_rejects_staggered_adoption(self):
+        """Joint method raises ValueError for staggered adoption data.
+
+        The joint method assumes all treated units receive treatment at the
+        same time. With staggered adoption (units first treated at different
+        periods), the method's weights and variance estimation are invalid.
+        """
+        # Create data with staggered treatment (units treated at different times)
+        data = []
+        np.random.seed(42)
+        for i in range(10):
+            # Units 0-2 first treated at t=5, units 3-4 first treated at t=7
+            first_treat = 5 if i < 3 else 7
+            is_treated_unit = i < 5  # Units 0-4 are treated, 5-9 are control
+            for t in range(10):
+                treated = 1 if is_treated_unit and t >= first_treat else 0
+                data.append({
+                    'unit': i,
+                    'time': t,
+                    'outcome': np.random.randn(),
+                    'treated': treated
+                })
+        df = pd.DataFrame(data)
+
+        trop = TROP(method="joint")
+        with pytest.raises(ValueError, match="staggered adoption"):
+            trop.fit(df, 'outcome', 'treated', 'unit', 'time')