Address PR #115 AI review: NaN propagation, fallback, and performance

igerber · claude · igerber · commit f3b1566ad53f · 2026-01-26T19:22:15.000-05:00
P0: Return f64::NAN instead of 0.0 in TROP bootstrap when < 2 samples
P1: Add Python fallback in _solve_ols_rust for numerical instability
P2: Gate expensive O(n³) residual check behind LU pivot ratio detection

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
@@ -251,10 +251,10 @@ def _solve_ols_rust(
     cluster_ids: Optional[np.ndarray] = None,
     return_vcov: bool = True,
     return_fitted: bool = False,
-) -> Union[
+) -> Optional[Union[
     Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
     Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
-]:
+]]:
     """
     Rust backend implementation of solve_ols for full-rank matrices.
 
@@ -296,15 +296,30 @@ def _solve_ols_rust(
         Fitted values if return_fitted=True.
     vcov : np.ndarray, optional
         Variance-covariance matrix if return_vcov=True.
+    None
+        If Rust backend detects numerical instability and caller should
+        fall back to Python backend.
     """
     # Convert cluster_ids to int64 for Rust (handles string/categorical IDs)
     if cluster_ids is not None:
         cluster_ids = _factorize_cluster_ids(cluster_ids)
 
-    # Call Rust backend
-    coefficients, residuals, vcov = _rust_solve_ols(
-        X, y, cluster_ids=cluster_ids, return_vcov=return_vcov
-    )
+    # Call Rust backend with fallback on numerical instability
+    try:
+        coefficients, residuals, vcov = _rust_solve_ols(
+            X, y, cluster_ids=cluster_ids, return_vcov=return_vcov
+        )
+    except ValueError as e:
+        error_msg = str(e).lower()
+        if "numerically unstable" in error_msg or "singular" in error_msg:
+            warnings.warn(
+                f"Rust backend detected numerical instability: {e}. "
+                "Falling back to Python backend.",
+                UserWarning,
+                stacklevel=3,
+            )
+            return None  # Signal caller to use Python fallback
+        raise
 
     # Convert to numpy arrays
     coefficients = np.asarray(coefficients)
@@ -499,6 +514,7 @@ def solve_ols(
     # Routing strategy:
     # - Full-rank + Rust available → fast Rust backend (SVD-based solve)
     # - Rank-deficient → Python backend (proper NA handling, valid SEs)
+    # - Rust numerical instability → Python fallback (via None return)
     # - No Rust → Python backend (works for all cases)
     if HAS_RUST_BACKEND and _rust_solve_ols is not None and not is_rank_deficient:
         result = _solve_ols_rust(
@@ -508,6 +524,19 @@ def solve_ols(
             return_fitted=return_fitted,
         )
 
+        # Check for None: Rust backend detected numerical instability and
+        # signaled us to fall back to Python backend
+        if result is None:
+            return _solve_ols_numpy(
+                X, y,
+                cluster_ids=cluster_ids,
+                return_vcov=return_vcov,
+                return_fitted=return_fitted,
+                rank_deficient_action=rank_deficient_action,
+                column_names=column_names,
+                _precomputed_rank_info=None,  # Force fresh rank detection
+            )
+
         # Check for NaN vcov: Rust SVD may detect rank-deficiency that QR missed
         # for ill-conditioned matrices (QR and SVD have different numerical properties).
         # When this happens, fall back to Python's R-style handling.
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
@@ -286,8 +286,12 @@ fn ndarray_to_faer(arr: &Array2<f64>) -> faer::Mat<f64> {
 /// Invert a symmetric positive-definite matrix.
 ///
 /// Uses LU decomposition with partial pivoting. Includes both NaN/Inf check
-/// and residual-based verification to catch near-singular matrices that
-/// produce finite but numerically inaccurate results.
+/// and conditional residual-based verification to catch near-singular matrices
+/// that produce finite but numerically inaccurate results.
+///
+/// Performance optimization: The expensive O(n³) residual check (A * A⁻¹ - I)
+/// is only performed when LU pivot ratios suggest potential instability. For
+/// well-conditioned matrices (the common case), this check is skipped.
 fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
     let n = a.nrows();
 
@@ -323,31 +327,51 @@ fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
         ));
     }
 
-    // Verify inversion accuracy by checking ||A * A^{-1} - I||_max
-    // For near-singular matrices, this residual will be large even if
-    // the result contains no NaN/Inf values
-    let a_times_inv = a_faer.as_ref() * &x_faer;
-    let mut max_residual = 0.0_f64;
+    // Check pivot ratio to detect potential instability.
+    // The diagonal of U contains the pivots from LU factorization.
+    // A small pivot ratio (min/max) indicates potential numerical instability.
+    let u_factor = lu.U();
+    let mut max_pivot = 0.0_f64;
+    let mut min_pivot = f64::INFINITY;
     for i in 0..n {
-        for j in 0..n {
-            let expected = if i == j { 1.0 } else { 0.0 };
-            let residual = (a_times_inv[(i, j)] - expected).abs();
-            max_residual = max_residual.max(residual);
+        let pivot = u_factor[(i, i)].abs();
+        if pivot > 0.0 {
+            max_pivot = max_pivot.max(pivot);
+            min_pivot = min_pivot.min(pivot);
         }
     }
+    let pivot_ratio = if max_pivot > 0.0 { min_pivot / max_pivot } else { 0.0 };
+
+    // Only perform expensive residual check if pivots suggest potential instability.
+    // Threshold of 1e-10 catches truly problematic matrices while avoiding
+    // unnecessary O(n³) computation for well-conditioned cases.
+    if pivot_ratio < 1e-10 {
+        // Verify inversion accuracy by checking ||A * A^{-1} - I||_max
+        // For near-singular matrices, this residual will be large even if
+        // the result contains no NaN/Inf values
+        let a_times_inv = a_faer.as_ref() * &x_faer;
+        let mut max_residual = 0.0_f64;
+        for i in 0..n {
+            for j in 0..n {
+                let expected = if i == j { 1.0 } else { 0.0 };
+                let residual = (a_times_inv[(i, j)] - expected).abs();
+                max_residual = max_residual.max(residual);
+            }
+        }
 
-    // Threshold: detect truly singular matrices while allowing ill-conditioned ones
-    // Ill-conditioned matrices (high condition number) can have residuals up to ~1e-4
-    // while still producing usable results. Use 1e-4 * n as threshold.
-    let threshold = 1e-4 * (n as f64);
-    if max_residual > threshold {
-        return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-            format!(
-                "Matrix inversion numerically unstable (residual={:.2e} > threshold={:.2e}). \
-                 Design matrix may be near-singular.",
-                max_residual, threshold
-            )
-        ));
+        // Threshold: detect truly singular matrices while allowing ill-conditioned ones
+        // Ill-conditioned matrices (high condition number) can have residuals up to ~1e-4
+        // while still producing usable results. Use 1e-4 * n as threshold.
+        let threshold = 1e-4 * (n as f64);
+        if max_residual > threshold {
+            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                format!(
+                    "Matrix inversion numerically unstable (residual={:.2e} > threshold={:.2e}). \
+                     Design matrix may be near-singular.",
+                    max_residual, threshold
+                )
+            ));
+        }
     }
 
     // Convert back to ndarray
@@ -445,7 +469,8 @@ mod tests {
             [1.0, 1.0 + 1e-15],  // Nearly identical rows
         ];
 
-        // Should fail due to numerical instability
+        // Should fail due to numerical instability (small pivot ratio triggers
+        // residual check which detects the inversion error)
         let result = invert_symmetric(&a);
         assert!(result.is_err(), "Near-singular matrix inversion should fail");
 
diff --git a/rust/src/trop.rs b/rust/src/trop.rs
@@ -1036,8 +1036,9 @@ pub fn bootstrap_trop_variance<'py>(
         .collect();
 
     // Compute standard error
+    // Return NaN when < 2 samples to properly propagate undefined inference
     let se = if bootstrap_estimates.len() < 2 {
-        0.0
+        f64::NAN
     } else {
         let n = bootstrap_estimates.len() as f64;
         let mean = bootstrap_estimates.iter().sum::<f64>() / n;
@@ -1701,8 +1702,9 @@ pub fn bootstrap_trop_variance_joint<'py>(
         .collect();
 
     // Compute standard error
+    // Return NaN when < 2 samples to properly propagate undefined inference
     let se = if bootstrap_estimates.len() < 2 {
-        0.0
+        f64::NAN
     } else {
         let n = bootstrap_estimates.len() as f64;
         let mean = bootstrap_estimates.iter().sum::<f64>() / n;