Address PR #110 feedback round 8: three LOOCV/validation fixes

igerber · claude · igerber · commit 6273674e9e88 · 2026-01-25T14:23:00.000-05:00
Issue 1: Final LOOCV score infinity conversion
- Convert inf values before calling loocv_score_for_params in Rust
- Ensures final score uses same converted values that LOOCV evaluated
- λ_time/λ_unit=∞ → 0.0, λ_nn=∞ → 1e10

Issue 2: Rust LOOCV failed observation metadata
- Extend loocv_score_for_params to return Option<(usize, usize)>
- Track first failed observation (t, i) for informative warnings
- Python now includes coordinates in LOOCV failure warnings

Issue 3: D matrix validation for unbalanced panels
- Track missing values before fillna(0) with missing_mask
- Only validate monotonicity between observed periods
- Missing data no longer triggers false absorbing-state violations

Tests: 4 new tests in TestPR110FeedbackRound8 class
Docs: Updated REGISTRY.md with unbalanced panel support

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/trop.py b/diff_diff/trop.py
@@ -900,21 +900,32 @@ def fit(
             .reindex(index=all_periods, columns=all_units)
             .values
         )
-        D = (
+
+        # For D matrix, track missing values BEFORE fillna to support unbalanced panels
+        # Issue 3 fix: Missing observations should not trigger spurious violations
+        D_raw = (
             data.pivot(index=time, columns=unit, values=treatment)
             .reindex(index=all_periods, columns=all_units)
-            .fillna(0)
-            .astype(int)
-            .values
         )
+        missing_mask = pd.isna(D_raw).values  # True where originally missing
+        D = D_raw.fillna(0).astype(int).values
 
         # Validate D is monotonic non-decreasing per unit (absorbing state)
         # D[t, i] must satisfy: once D=1, it must stay 1 for all subsequent periods
         # Vectorized check: diff(D, axis=0) should never be negative
+        # Issue 3 fix: Only check transitions where BOTH periods are observed
         d_diff = np.diff(D, axis=0)
-        if np.any(d_diff < 0):
+
+        # Valid transition mask: neither the current nor next period is missing
+        # missing_mask[:-1] = source period missing, missing_mask[1:] = target period missing
+        valid_transition = ~(missing_mask[:-1] | missing_mask[1:])
+
+        # Only flag violations where both periods are observed
+        violations = (d_diff < 0) & valid_transition
+
+        if np.any(violations):
             # Find which units violate the absorbing state constraint
-            violating_units_mask = np.any(d_diff < 0, axis=0)
+            violating_units_mask = np.any(violations, axis=0)
             violating_unit_ids = [all_units[i] for i in np.where(violating_units_mask)[0]]
             raise ValueError(
                 f"Treatment indicator is not an absorbing state for units: {violating_unit_ids}. "
@@ -977,31 +988,43 @@ def fit(
                 lambda_unit_arr = np.array(self.lambda_unit_grid, dtype=np.float64)
                 lambda_nn_arr = np.array(self.lambda_nn_grid, dtype=np.float64)
 
-                best_lt, best_lu, best_ln, best_score, n_valid, n_attempted = _rust_loocv_grid_search(
+                result = _rust_loocv_grid_search(
                     Y, D.astype(np.float64), control_mask_u8,
                     time_dist_matrix,
                     lambda_time_arr, lambda_unit_arr, lambda_nn_arr,
                     self.max_loocv_samples, self.max_iter, self.tol,
                     self.seed if self.seed is not None else 0
                 )
+                # Unpack result - 7 values including optional first_failed_obs
+                best_lt, best_lu, best_ln, best_score, n_valid, n_attempted, first_failed_obs = result
                 # Only accept finite scores - infinite means all fits failed
                 if np.isfinite(best_score):
                     best_lambda = (best_lt, best_lu, best_ln)
                 # else: best_lambda stays None, triggering defaults fallback
                 # Emit warnings consistent with Python implementation
                 if n_valid == 0:
+                    # Include failed observation coordinates if available (Issue 2 fix)
+                    obs_info = ""
+                    if first_failed_obs is not None:
+                        t_idx, i_idx = first_failed_obs
+                        obs_info = f" First failure at observation ({t_idx}, {i_idx})."
                     warnings.warn(
                         f"LOOCV: All {n_attempted} fits failed for "
                         f"λ=({best_lt}, {best_lu}, {best_ln}). "
-                        "Returning infinite score.",
+                        f"Returning infinite score.{obs_info}",
                         UserWarning
                     )
                 elif n_attempted > 0 and (n_attempted - n_valid) > 0.1 * n_attempted:
                     n_failed = n_attempted - n_valid
+                    # Include failed observation coordinates if available
+                    obs_info = ""
+                    if first_failed_obs is not None:
+                        t_idx, i_idx = first_failed_obs
+                        obs_info = f" First failure at observation ({t_idx}, {i_idx})."
                     warnings.warn(
                         f"LOOCV: {n_failed}/{n_attempted} fits failed for "
                         f"λ=({best_lt}, {best_lu}, {best_ln}). "
-                        "This may indicate numerical instability.",
+                        f"This may indicate numerical instability.{obs_info}",
                         UserWarning
                     )
             except Exception as e:
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -550,8 +550,10 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]²
   - Handling: Raises `ValueError` with list of violating unit IDs and remediation guidance
   - Error message includes: "convert to absorbing state: D[t, i] = 1 for all t >= first treatment period"
   - **Rationale**: Event-style D (0→1→0) silently biases ATT; runtime validation prevents misuse
+  - **Unbalanced panels**: Missing unit-period observations are allowed. Monotonicity validation only checks transitions between observed periods. A unit with D=1 at t=3 and missing data at t=5 is NOT flagged as a violation (the apparent 1→0 transition is due to missing data, not a real violation).
 - Wrong D specification: if user provides event-style D (only first treatment period),
   the absorbing-state validation will raise ValueError with helpful guidance
+- **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages
 
 **Reference implementation(s):**
 - Authors' replication code (forthcoming)
@@ -566,6 +568,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]²
 - [x] ATT averages over all D==1 cells (general assignment patterns)
 - [x] No post_periods parameter (D matrix determines treatment timing)
 - [x] D matrix semantics documented (absorbing state, not event indicator)
+- [x] Unbalanced panels supported (missing observations don't trigger false violations)
 
 ---
 
diff --git a/rust/src/trop.rs b/rust/src/trop.rs
@@ -220,7 +220,7 @@ fn univariate_loocv_search(
                 },
             };
 
-            let (score, _) = loocv_score_for_params(
+            let (score, _, _) = loocv_score_for_params(
                 y, d, control_mask, time_dist, control_obs,
                 lambda_time, lambda_unit, lambda_nn,
                 max_iter, tol,
@@ -318,9 +318,10 @@ fn cycling_parameter_search(
 /// * `seed` - Random seed for subsampling
 ///
 /// # Returns
-/// (best_lambda_time, best_lambda_unit, best_lambda_nn, best_score, n_valid, n_attempted)
+/// (best_lambda_time, best_lambda_unit, best_lambda_nn, best_score, n_valid, n_attempted, first_failed_obs)
 /// where n_valid and n_attempted are the counts for the best parameter combination,
 /// allowing Python to emit warnings when >10% of fits fail.
+/// first_failed_obs is Some((t, i)) if a fit failed during final score computation, None otherwise.
 #[pyfunction]
 #[pyo3(signature = (y, d, control_mask, time_dist_matrix, lambda_time_grid, lambda_unit_grid, lambda_nn_grid, max_loocv_samples, max_iter, tol, seed))]
 #[allow(clippy::too_many_arguments)]
@@ -337,7 +338,7 @@ pub fn loocv_grid_search<'py>(
     max_iter: usize,
     tol: f64,
     seed: u64,
-) -> PyResult<(f64, f64, f64, f64, usize, usize)> {
+) -> PyResult<(f64, f64, f64, f64, usize, usize, Option<(usize, usize)>)> {
     let y_arr = y.as_array();
     let d_arr = d.as_array();
     let control_mask_arr = control_mask.as_array();
@@ -383,14 +384,24 @@ pub fn loocv_grid_search<'py>(
         max_iter, tol, 10,
     );
 
-    // Compute final score
-    let (best_score, n_valid) = loocv_score_for_params(
+    // Convert infinity values BEFORE computing final score (Issue 1 fix)
+    // Per paper Equations 2-3:
+    // - λ_time/λ_unit=∞ → uniform weights → use 0.0
+    // - λ_nn=∞ → infinite penalty → L≈0 (factor model disabled) → use 1e10
+    // This ensures final score computation matches what LOOCV evaluated.
+    let best_time_eff = if best_time.is_infinite() { 0.0 } else { best_time };
+    let best_unit_eff = if best_unit.is_infinite() { 0.0 } else { best_unit };
+    let best_nn_eff = if best_nn.is_infinite() { 1e10 } else { best_nn };
+
+    // Compute final score with converted values
+    let (best_score, n_valid, first_failed) = loocv_score_for_params(
         &y_arr, &d_arr, &control_mask_arr, &time_dist_arr, &control_obs,
-        best_time, best_unit, best_nn,
+        best_time_eff, best_unit_eff, best_nn_eff,
         max_iter, tol,
     );
 
-    Ok((best_time, best_unit, best_nn, best_score, n_valid, n_attempted))
+    // Return ORIGINAL grid values (for user visibility) but score computed with converted
+    Ok((best_time, best_unit, best_nn, best_score, n_valid, n_attempted, first_failed))
 }
 
 /// Get sampled control observations for LOOCV.
@@ -429,7 +440,8 @@ fn get_control_observations(
 /// Compute LOOCV score for a specific parameter combination.
 ///
 /// # Returns
-/// (score, n_valid) - the LOOCV score and number of successful fits
+/// (score, n_valid, first_failed_obs) - the LOOCV score, number of successful fits,
+/// and the first failed observation (t, i) if any fit failed, None otherwise.
 #[allow(clippy::too_many_arguments)]
 fn loocv_score_for_params(
     y: &ArrayView2<f64>,
@@ -442,7 +454,7 @@ fn loocv_score_for_params(
     lambda_nn: f64,
     max_iter: usize,
     tol: f64,
-) -> (f64, usize) {
+) -> (f64, usize, Option<(usize, usize)>) {
     let n_periods = y.nrows();
     let n_units = y.ncols();
 
@@ -484,17 +496,18 @@ fn loocv_score_for_params(
             None => {
                 // Per Equation 5: Q(λ) must sum over ALL D==0 cells
                 // Any failure means this λ cannot produce valid estimates for all cells
-                return (f64::INFINITY, n_valid);
+                // Return the failed observation (t, i) for warning metadata
+                return (f64::INFINITY, n_valid, Some((t, i)));
             }
         }
     }
 
     if n_valid == 0 {
-        (f64::INFINITY, 0)
+        (f64::INFINITY, 0, None)
     } else {
         // Return SUM of squared pseudo-treatment effects per Equation 5 (page 8):
         // Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]²
-        (tau_sq_sum, n_valid)
+        (tau_sq_sum, n_valid, None)
     }
 }
 
diff --git a/tests/test_trop.py b/tests/test_trop.py