Address PR #113 Round 2 feedback: true NaN exclusion and jackknife fixes

igerber · claude · igerber · commit f58d690ad7cc · 2026-01-26T07:31:37.000-05:00
- Fix _solve_joint_with_lowrank to mask delta for NaN observations
  (ensures NaN Y values don't contribute to gradient step)
- Fix jackknife to use true leave-one-out via weight zeroing
  (removes incorrect imputation with column means)
- Handle units with no valid pre-period data by setting delta_unit=0
  (previously got max weight due to dist=0)
- Document simultaneous adoption assumption for joint method variance
- Correct notebook weight normalization statement (not "sum to one")
- Add tests for true NaN exclusion and jackknife variation behavior

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/trop.py b/diff_diff/trop.py
@@ -933,10 +933,17 @@ def _compute_joint_weights(
         if n_pre == 0:
             raise ValueError("No pre-treatment periods")
 
-        # Use valid count per unit (avoid division by zero)
-        valid_count = np.maximum(valid_count, 1)
-        dist_unit = np.sqrt(sum_sq / valid_count)
+        # Track units with no valid pre-period data
+        no_valid_pre = valid_count == 0
+
+        # Use valid count per unit (avoid division by zero for calculation)
+        valid_count_safe = np.maximum(valid_count, 1)
+        dist_unit = np.sqrt(sum_sq / valid_count_safe)
+
+        # Units with no valid pre-period data get zero weight
+        # (dist is undefined; it evaluates to 0 here, so delta_unit is zeroed explicitly below)
         delta_unit = np.exp(-lambda_unit * dist_unit)
+        delta_unit[no_valid_pre] = 0.0
 
         # Outer product: (n_periods x n_units)
         delta = np.outer(delta_time, delta_unit)
@@ -1154,6 +1161,12 @@ def _solve_joint_with_lowrank(
         # The solver will also zero weights for NaN observations
         Y_safe = np.where(np.isfinite(Y), Y, 0.0)
 
+        # Mask delta to exclude NaN outcomes from estimation
+        # This ensures NaN observations don't contribute to the gradient step
+        nan_mask = ~np.isfinite(Y)
+        delta_masked = delta.copy()
+        delta_masked[nan_mask] = 0.0
+
         # Initialize L = 0
         L = np.zeros((n_periods, n_units))
 
@@ -1162,23 +1175,25 @@ def _solve_joint_with_lowrank(
 
             # Step 1: Fix L, solve for (mu, alpha, beta, tau)
             # Adjusted outcome: Y - L (using NaN-safe Y)
+            # Pass masked delta to exclude NaN observations from WLS
             Y_adj = Y_safe - L
-            mu, alpha, beta, tau = self._solve_joint_no_lowrank(Y_adj, D, delta)
+            mu, alpha, beta, tau = self._solve_joint_no_lowrank(Y_adj, D, delta_masked)
 
             # Step 2: Fix (mu, alpha, beta, tau), update L
             # Residual: R = Y - mu - alpha - beta - tau*D (using NaN-safe Y)
             R = Y_safe - mu - alpha[np.newaxis, :] - beta[:, np.newaxis] - tau * D
 
             # Weighted proximal step for L (soft-threshold SVD)
-            # Normalize weights
-            delta_max = np.max(delta)
+            # Normalize weights (using masked delta to exclude NaN observations)
+            delta_max = np.max(delta_masked)
             if delta_max > 0:
-                delta_norm = delta / delta_max
+                delta_norm = delta_masked / delta_max
             else:
-                delta_norm = delta
+                delta_norm = delta_masked
 
             # Weighted average between current L and target R
             # L_next = L + delta_norm * (R - L), then soft-threshold
+            # NaN observations have delta_norm=0, so they don't influence L update
             gradient_step = L + delta_norm * (R - L)
 
             # Soft-threshold singular values
@@ -1223,6 +1238,15 @@ def _fit_joint(
         -------
         TROPResults
             Estimation results.
+
+        Notes
+        -----
+        Bootstrap and jackknife variance estimation assume simultaneous treatment
+        adoption (fixed `treated_periods` across resamples). The treatment timing
+        is inferred from the data once and held constant for all bootstrap/jackknife
+        iterations. For staggered adoption designs where treatment timing varies
+        across units, use `method="twostep"` which computes observation-specific
+        weights that naturally handle heterogeneous timing.
         """
         # Data setup (same as twostep method)
         all_units = sorted(data[unit].unique())
@@ -1730,26 +1754,24 @@ def _jackknife_variance_joint(
         treated_unit_idx = np.where(np.any(D == 1, axis=0))[0]
 
         for leave_out in treated_unit_idx:
-            # Create mask excluding this unit
+            # True leave-one-out: zero the delta weight for the left-out unit
+            # This excludes the unit from estimation without imputation
             Y_jack = Y.copy()
             D_jack = D.copy()
-            Y_jack[:, leave_out] = np.nan
-            D_jack[:, leave_out] = 0
-
-            # Replace NaN with column mean for stability
-            col_means = np.nanmean(Y_jack, axis=0)
-            for i in range(n_units):
-                nan_mask = np.isnan(Y_jack[:, i])
-                Y_jack[nan_mask, i] = col_means[i] if np.isfinite(col_means[i]) else 0.0
+            D_jack[:, leave_out] = 0  # Mark as not treated for weight computation
 
             try:
-                # Compute weights
+                # Compute weights (left-out unit's column is still weighted here; zeroed below)
                 delta = self._compute_joint_weights(
                     Y_jack, D_jack, lambda_time, lambda_unit,
                     treated_periods, n_units, n_periods
                 )
 
-                # Fit model
+                # Zero the delta weight for the left-out unit
+                # This ensures the unit doesn't contribute to estimation
+                delta[:, leave_out] = 0.0
+
+                # Fit model (left-out unit has zero weight, truly excluded)
                 if lambda_nn >= 1e10:
                     _, _, _, tau = self._solve_joint_no_lowrank(Y_jack, D_jack, delta)
                 else:
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -635,6 +635,11 @@ For joint method, LOOCV works as follows:
 - Single model fit per λ combination vs. N_treated fits
 - Faster computation for large panels
 
+**Assumptions**:
+- **Simultaneous adoption**: Bootstrap and jackknife variance estimation assume fixed
+  `treated_periods` across all resamples. Treatment timing is inferred once from the
+  data and held constant. For staggered adoption designs, use `method="twostep"`.
+
 **Reference**: Adapted from reference implementation. See also Athey et al. (2025).
 
 **Requirements checklist:**
diff --git a/docs/tutorials/10_trop.ipynb b/docs/tutorials/10_trop.ipynb
@@ -3,29 +3,7 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "# Triply Robust Panel (TROP) Estimator\n",
-    "\n",
-    "This notebook demonstrates the **Triply Robust Panel (TROP)** estimator (Athey, Imbens, Qu & Viviano, 2025), which combines three robustness components:\n",
-    "\n",
-    "1. **Nuclear Norm Regularized Factor Model**: Estimates interactive fixed effects via matrix completion with nuclear norm penalty\n",
-    "2. **Exponential Distance-Based Unit Weights**: ω_j = exp(-λ_unit × dist(j,i)) where dist(j,i) is the root mean squared difference in outcomes between units j and i, computed only on periods where both units are untreated and excluding the target period t (Equation 3 in the paper)\n",
-    "3. **Exponential Time Decay Weights**: θ_s = exp(-λ_time × |s-t|) weighting by proximity to treatment\n",
-    "\n",
-    "**Weight Normalization**: Following the paper, the observation-specific weights ω and θ are treated as probability weights that effectively sum to one within each treated observation's counterfactual estimation.\n",
-    "\n",
-    "TROP is particularly useful when:\n",
-    "- There may be unobserved time-varying confounders with factor structure\n",
-    "- Standard DiD or SDID may be biased due to latent factors\n",
-    "- You want robust inference under factor confounding\n",
-    "\n",
-    "We'll cover:\n",
-    "1. When to use TROP\n",
-    "2. Basic estimation with LOOCV tuning\n",
-    "3. Understanding tuning parameters\n",
-    "4. Examining factor structure\n",
-    "5. Comparing TROP vs SDID"
-   ]
+   "source": "# Triply Robust Panel (TROP) Estimator\n\nThis notebook demonstrates the **Triply Robust Panel (TROP)** estimator (Athey, Imbens, Qu & Viviano, 2025), which combines three robustness components:\n\n1. **Nuclear Norm Regularized Factor Model**: Estimates interactive fixed effects via matrix completion with nuclear norm penalty\n2. **Exponential Distance-Based Unit Weights**: ω_j = exp(-λ_unit × dist(j,i)) where dist(j,i) is the root mean squared difference in outcomes between units j and i, computed only on periods where both units are untreated and excluding the target period t (Equation 3 in the paper)\n3. **Exponential Time Decay Weights**: θ_s = exp(-λ_time × |s-t|) weighting by proximity to treatment\n\n**Weights**: The observation-specific weights ω and θ are importance weights that control the relative contribution of each observation to counterfactual estimation. Higher weights indicate more relevant observations for the target counterfactual.\n\nTROP is particularly useful when:\n- There may be unobserved time-varying confounders with factor structure\n- Standard DiD or SDID may be biased due to latent factors\n- You want robust inference under factor confounding\n\nWe'll cover:\n1. When to use TROP\n2. Basic estimation with LOOCV tuning\n3. Understanding tuning parameters\n4. Examining factor structure\n5. Comparing TROP vs SDID"
   },
   {
    "cell_type": "code",
@@ -693,7 +671,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "## Summary\n\nKey takeaways for TROP:\n\n1. **Best use cases**: Factor confounding, unobserved time-varying confounders with interactive effects\n2. **Factor estimation**: Nuclear norm regularization with LOOCV for tuning\n3. **Three tuning parameters**: λ_time, λ_unit, λ_nn selected automatically via LOOCV\n4. **Unit weights**: Exponential distance-based weighting of control units, where distance is computed as RMS outcome difference on control periods excluding the target period\n5. **Time weights**: Exponential decay weighting of pre-treatment periods\n6. **Weight normalization**: Weights are treated as probability weights that sum to one\n7. **Estimation methods**:\n   - `method='twostep'` (default): Per-observation estimation, allows heterogeneous effects\n   - `method='joint'`: Single scalar treatment effect, faster but assumes homogeneity\n\n**When to use TROP vs SDID**:\n- Use **SDID** when parallel trends is plausible and factors are not a concern\n- Use **TROP** when you suspect factor confounding (regional shocks, economic cycles, latent factors)\n- Running both provides a useful robustness check\n\n**When to use twostep vs joint method**:\n- Use **twostep** (default) for maximum flexibility and heterogeneous treatment effects\n- Use **joint** for faster estimation when effects are expected to be homogeneous\n\n**Reference**:\n- Athey, S., Imbens, G. W., Qu, Z., & Viviano, D. (2025). Triply Robust Panel Estimators. *Working Paper*. https://arxiv.org/abs/2508.21536"
+   "source": "## Summary\n\nKey takeaways for TROP:\n\n1. **Best use cases**: Factor confounding, unobserved time-varying confounders with interactive effects\n2. **Factor estimation**: Nuclear norm regularization with LOOCV for tuning\n3. **Three tuning parameters**: λ_time, λ_unit, λ_nn selected automatically via LOOCV\n4. **Unit weights**: Exponential distance-based weighting of control units, where distance is computed as RMS outcome difference on control periods excluding the target period\n5. **Time weights**: Exponential decay weighting of pre-treatment periods\n6. **Weights**: Importance weights controlling relative contribution of observations (higher = more relevant)\n7. **Estimation methods**:\n   - `method='twostep'` (default): Per-observation estimation, allows heterogeneous effects\n   - `method='joint'`: Single scalar treatment effect, faster but assumes homogeneity\n\n**When to use TROP vs SDID**:\n- Use **SDID** when parallel trends is plausible and factors are not a concern\n- Use **TROP** when you suspect factor confounding (regional shocks, economic cycles, latent factors)\n- Running both provides a useful robustness check\n\n**When to use twostep vs joint method**:\n- Use **twostep** (default) for maximum flexibility and heterogeneous treatment effects\n- Use **joint** for faster estimation when effects are expected to be homogeneous\n\n**Reference**:\n- Athey, S., Imbens, G. W., Qu, Z., & Viviano, D. (2025). Triply Robust Panel Estimators. *Working Paper*. https://arxiv.org/abs/2508.21536"
   },
   {
    "cell_type": "code",
@@ -710,26 +688,7 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "## Summary\n",
-    "\n",
-    "Key takeaways for TROP:\n",
-    "\n",
-    "1. **Best use cases**: Factor confounding, unobserved time-varying confounders with interactive effects\n",
-    "2. **Factor estimation**: Nuclear norm regularization with LOOCV for tuning\n",
-    "3. **Three tuning parameters**: λ_time, λ_unit, λ_nn selected automatically via LOOCV\n",
-    "4. **Unit weights**: Exponential distance-based weighting of control units, where distance is computed as RMS outcome difference on control periods excluding the target period\n",
-    "5. **Time weights**: Exponential decay weighting of pre-treatment periods\n",
-    "6. **Weight normalization**: Weights are treated as probability weights that sum to one\n",
-    "\n",
-    "**When to use TROP vs SDID**:\n",
-    "- Use **SDID** when parallel trends is plausible and factors are not a concern\n",
-    "- Use **TROP** when you suspect factor confounding (regional shocks, economic cycles, latent factors)\n",
-    "- Running both provides a useful robustness check\n",
-    "\n",
-    "**Reference**:\n",
-    "- Athey, S., Imbens, G. W., Qu, Z., & Viviano, D. (2025). Triply Robust Panel Estimators. *Working Paper*. https://arxiv.org/abs/2508.21536"
-   ]
+   "source": "## Summary\n\nKey takeaways for TROP:\n\n1. **Best use cases**: Factor confounding, unobserved time-varying confounders with interactive effects\n2. **Factor estimation**: Nuclear norm regularization with LOOCV for tuning\n3. **Three tuning parameters**: λ_time, λ_unit, λ_nn selected automatically via LOOCV\n4. **Unit weights**: Exponential distance-based weighting of control units, where distance is computed as RMS outcome difference on control periods excluding the target period\n5. **Time weights**: Exponential decay weighting of pre-treatment periods\n6. **Weights**: Importance weights controlling relative contribution of observations (higher = more relevant)\n\n**When to use TROP vs SDID**:\n- Use **SDID** when parallel trends is plausible and factors are not a concern\n- Use **TROP** when you suspect factor confounding (regional shocks, economic cycles, latent factors)\n- Running both provides a useful robustness check\n\n**Reference**:\n- Athey, S., Imbens, G. W., Qu, Z., & Viviano, D. (2025). Triply Robust Panel Estimators. *Working Paper*. https://arxiv.org/abs/2508.21536"
   }
  ],
  "metadata": {
diff --git a/tests/test_trop.py b/tests/test_trop.py