Revert untested Rust backend changes

claude · claude · commit 9e576c06fb41 · 2026-01-13T12:15:47.000Z
The Rust backend optimizations (Cholesky factorization, reduced allocations)
could not be tested in CI because the OpenBLAS library is not available there.
This commit reverts them so that v2.0.1 stays focused on tested Python
improvements only.

The Rust optimizations are re-opened as unchecked items in TODO.md for future
implementation once proper testing infrastructure is available.
diff --git a/TODO.md b/TODO.md
@@ -92,8 +92,8 @@ Enhancements for `honest_did.py`:
 
 Deferred from PR #58 code review (can be done post-merge):
 
-- [x] **Matrix inversion efficiency** (`rust/src/linalg.rs`): ~~Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve~~ (completed in v2.0.1)
-- [x] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): ~~Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice.~~ Now allocates directly into pre-allocated buffer. (completed in v2.0.1)
+- [ ] **Matrix inversion efficiency** (`rust/src/linalg.rs:180-194`): Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve
+- [ ] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice. Should allocate directly into ndarray.
 - [ ] **Consider static BLAS linking** (`rust/Cargo.toml`): Currently requires system BLAS libraries. Consider `openblas-static` or `intel-mkl-static` features for easier distribution.
 
 ---
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "diff_diff_rust"
-version = "2.0.1"
+version = "2.0.0"
 edition = "2021"
 description = "Rust backend for diff-diff DiD library"
 license = "MIT"
diff --git a/rust/src/bootstrap.rs b/rust/src/bootstrap.rs
@@ -51,20 +51,19 @@ pub fn generate_bootstrap_weights_batch<'py>(
 ///
 /// E[w] = 0, Var[w] = 1
 fn generate_rademacher_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<f64> {
-    // Pre-allocate flat array directly (single allocation instead of Vec<Vec<f64>> + flatten)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    // Generate weights in parallel using rayon
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                *val = if rng.gen::<bool>() { 1.0 } else { -1.0 };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| if rng.gen::<bool>() { 1.0 } else { -1.0 })
+                .collect()
+        })
+        .collect();
+
+    // Convert to ndarray
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -84,24 +83,23 @@ fn generate_mammen_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array
     // Probability of negative value
     let prob_neg = (sqrt5 + 1.0) / (2.0 * sqrt5); // ≈ 0.724
 
-    // Pre-allocate flat array directly (single allocation)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                *val = if rng.gen::<f64>() < prob_neg {
-                    val_neg
-                } else {
-                    val_pos
-                };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| {
+                    if rng.gen::<f64>() < prob_neg {
+                        val_neg
+                    } else {
+                        val_pos
+                    }
+                })
+                .collect()
+        })
+        .collect();
+
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -120,33 +118,32 @@ fn generate_webb_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<
     // Equal probability for each of 6 values: 1/6 each
     let prob = 1.0 / 6.0;
 
-    // Pre-allocate flat array directly (single allocation)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                let u = rng.gen::<f64>();
-                *val = if u < prob {
-                    -val1
-                } else if u < 2.0 * prob {
-                    -val2
-                } else if u < 3.0 * prob {
-                    -val3
-                } else if u < 4.0 * prob {
-                    val3
-                } else if u < 5.0 * prob {
-                    val2
-                } else {
-                    val1
-                };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| {
+                    let u = rng.gen::<f64>();
+                    if u < prob {
+                        -val1
+                    } else if u < 2.0 * prob {
+                        -val2
+                    } else if u < 3.0 * prob {
+                        -val3
+                    } else if u < 4.0 * prob {
+                        val3
+                    } else if u < 5.0 * prob {
+                        val2
+                    } else {
+                        val1
+                    }
+                })
+                .collect()
+        })
+        .collect();
+
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
@@ -6,7 +6,7 @@
 //! - Cluster-robust variance-covariance estimation
 
 use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
-use ndarray_linalg::{Cholesky, LeastSquaresSvd, Solve, UPLO};
+use ndarray_linalg::{LeastSquaresSvd, Solve};
 use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
 use pyo3::prelude::*;
 use std::collections::HashMap;
@@ -190,30 +190,18 @@ fn compute_robust_vcov_internal(
     }
 }
 
-/// Invert a symmetric positive-definite matrix using Cholesky factorization.
-///
-/// For symmetric positive-definite matrices like X'X, Cholesky factorization
-/// (A = L L') is more efficient than general LU decomposition, requiring
-/// approximately half the operations.
+/// Invert a symmetric positive-definite matrix.
 fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
     let n = a.nrows();
-
-    // Compute Cholesky factorization: A = L L' where L is lower triangular
-    let factorized = a.cholesky(UPLO::Lower)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
-            format!("Cholesky factorization failed (matrix may not be positive-definite): {}", e)
-        ))?;
-
-    // Solve A * A^{-1} = I by solving for each column of the identity
     let mut result = Array2::<f64>::zeros((n, n));
+
+    // Solve A * x_i = e_i for each column of the identity matrix
     for i in 0..n {
         let mut e_i = Array1::<f64>::zeros(n);
         e_i[i] = 1.0;
 
-        let col = factorized.solve(&e_i)
-            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                format!("Matrix inversion failed: {}", e)
-            ))?;
+        let col = a.solve(&e_i)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("Matrix inversion failed: {}", e)))?;
 
         result.column_mut(i).assign(&col);
     }