Address PR #105 feedback: fix Webb weights, lazy R fixture, test perf

igerber · claude · igerber · commit 4c2e53058cfa · 2026-01-24T12:00:25.000-05:00
- Fix Rust Webb bootstrap weights to match NumPy implementation:
  - Correct values: ±√(3/2), ±1, ±√(1/2) (was using wrong values)
  - Correct probabilities: [1,2,3,3,2,1]/12 (was uniform)
  - Add 3 Rust unit tests for Webb weight verification
  - Both backends now produce variance ≈ 0.833

- Add lazy R availability fixture to avoid import-time latency:
  - New tests/conftest.py with session-scoped r_available fixture
  - Support DIFF_DIFF_R=skip environment variable
  - Test collection now completes in <1s (was ~2s with subprocess)

- Improve test performance:
  - Add @pytest.mark.slow marker for thorough bootstrap tests
  - Reduce bootstrap iterations from 199 to 99 where sufficient
  - Add slow marker definition to pyproject.toml

- Documentation updates:
  - METHODOLOGY_REVIEW.md: Correct Webb variance to 0.833
  - TODO.md: Log 7 pre-existing NaN handling issues
  - CLAUDE.md: Document Rust test troubleshooting (PyO3 linking)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -53,6 +53,36 @@ DIFF_DIFF_BACKEND=rust pytest
 pytest tests/test_rust_backend.py -v
 ```
 
+#### Troubleshooting Rust Tests (PyO3 Linking)
+
+If `cargo test` fails with `library 'pythonX.Y' not found`, PyO3 cannot find the Python library. This commonly happens on macOS when using the system Python (which lacks development headers in expected locations).
+
+**Solution**: Use a Python environment with proper library paths (e.g., conda, Homebrew, or pyenv):
+
+```bash
+# Using miniconda (example path - adjust for your system)
+cd rust
+PYO3_PYTHON=/path/to/miniconda3/bin/python3 \
+DYLD_LIBRARY_PATH="/path/to/miniconda3/lib" \
+cargo test
+
+# Using Homebrew Python
+PYO3_PYTHON=/opt/homebrew/bin/python3 \
+DYLD_LIBRARY_PATH="/opt/homebrew/lib" \
+cargo test
+```
+
+**Environment variables:**
+- `PYO3_PYTHON`: Path to Python interpreter with development headers
+- `DYLD_LIBRARY_PATH` (macOS) / `LD_LIBRARY_PATH` (Linux): Path to `libpythonX.Y.dylib`/`.so`
+
+**Verification**: All 22 Rust tests should pass, including bootstrap weight tests:
+```
+test bootstrap::tests::test_webb_variance_approx_correct ... ok
+test bootstrap::tests::test_webb_values_correct ... ok
+test bootstrap::tests::test_webb_mean_approx_zero ... ok
+```
+
 ## Architecture
 
 ### Module Structure
diff --git a/METHODOLOGY_REVIEW.md b/METHODOLOGY_REVIEW.md
@@ -148,8 +148,9 @@ Each estimator in diff-diff should be periodically reviewed to ensure:
 **Deviations from R's did::att_gt():**
 1. **NaN for invalid inference**: When SE is non-finite or zero, Python returns NaN for
    t_stat/p_value rather than potentially erroring. This is a defensive enhancement.
-2. **Webb weights variance**: Webb's 6-point distribution has Var(w) ≈ 0.72, not 1.0.
-   This is the correct theoretical variance for this distribution.
+2. **Webb weights variance**: Webb's 6-point distribution with values ±√(3/2), ±1, ±√(1/2)
+   and probabilities [1,2,3,3,2,1]/12 has Var(w) ≈ 0.833 (=10/12), not 1.0.
+   This is the correct theoretical variance matching the NumPy and Rust implementations.
 
 ---
 
diff --git a/TODO.md b/TODO.md
@@ -52,6 +52,28 @@ Target: < 1000 lines per module for maintainability.
 | `pretrends.py` | 1160 | Acceptable |
 | `bacon.py` | 1027 | OK |
 
+### NaN Handling for Undefined t-statistics
+
+Several estimators return `0.0` for t-statistic when SE is 0 or undefined. This is incorrect—a t-stat of 0 implies a null effect, whereas `np.nan` correctly indicates undefined inference.
+
+**Pattern to fix**: `t_stat = effect / se if se > 0 else 0.0` → `t_stat = effect / se if se > 0 else np.nan`
+
+| Location | Line | Current Code |
+|----------|------|--------------|
+| `diagnostics.py` | 665 | `t_stat = original_att / se if se > 0 else 0.0` |
+| `diagnostics.py` | 786 | `t_stat = mean_effect / se if se > 0 else 0.0` |
+| `sun_abraham.py` | 603 | `overall_t = overall_att / overall_se if overall_se > 0 else 0.0` |
+| `sun_abraham.py` | 626 | `overall_t = overall_att / overall_se if overall_se > 0 else 0.0` |
+| `sun_abraham.py` | 643 | `eff_val / se_val if se_val > 0 else 0.0` |
+| `sun_abraham.py` | 881 | `t_stat = agg_effect / agg_se if agg_se > 0 else 0.0` |
+| `triple_diff.py` | 601 | `t_stat = att / se if se > 0 else 0.0` |
+
+**Priority**: Medium - affects inference reporting in edge cases.
+
+**Note**: CallawaySantAnna was fixed in PR #97 to use `np.nan`. These other estimators should follow the same pattern.
+
+---
+
 ### Standard Error Consistency
 
 Different estimators compute SEs differently. Consider unified interface.
diff --git a/pyproject.toml b/pyproject.toml
@@ -71,6 +71,9 @@ python-packages = ["diff_diff"]
 testpaths = ["tests"]
 python_files = "test_*.py"
 addopts = "-v --tb=short"
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
 
 [tool.black]
 line-length = 100
diff --git a/rust/src/bootstrap.rs b/rust/src/bootstrap.rs
@@ -118,17 +118,29 @@ fn generate_mammen_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array
 /// Six-point distribution that matches additional moments:
 /// E[w] = 0, E[w²] = 10/12 ≈ 0.833, E[w³] = 0, E[w⁴] = 10/12 ≈ 0.833 (as implemented)
 ///
-/// Values: ±√(3/2), ±√(1/2), ±√(1/6) with specific probabilities
+/// Values: ±√(3/2), ±√(2/2)=±1, ±√(1/2) with probabilities [1,2,3,3,2,1]/12
+/// This matches the NumPy implementation in staggered_bootstrap.py
 fn generate_webb_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<f64> {
-    // Webb 6-point values
-    let val1 = (3.0_f64 / 2.0).sqrt(); // √(3/2) ≈ 1.225
-    let val2 = (1.0_f64 / 2.0).sqrt(); // √(1/2) ≈ 0.707
-    let val3 = (1.0_f64 / 6.0).sqrt(); // √(1/6) ≈ 0.408
+    // Webb 6-point values (matching NumPy implementation)
+    let val1 = (3.0_f64 / 2.0).sqrt(); // √(3/2) ≈ 1.2247
+    let val2 = 1.0_f64; // √(2/2) = 1.0
+    let val3 = (1.0_f64 / 2.0).sqrt(); // √(1/2) ≈ 0.7071
 
-    // Lookup table for direct index computation (replaces 6-way if-else)
-    // Equal probability: u in [0, 1/6) -> -val1, [1/6, 2/6) -> -val2, etc.
+    // Values in order: -val1, -val2, -val3, val3, val2, val1
     let weights_table = [-val1, -val2, -val3, val3, val2, val1];
 
+    // Cumulative probabilities for [1,2,3,3,2,1]/12
+    // Probs: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12
+    // Cumulative: 1/12, 3/12, 6/12, 9/12, 11/12, 12/12
+    let cum_probs = [
+        1.0 / 12.0,  // P(bucket 0) = 1/12
+        3.0 / 12.0,  // P(bucket <= 1) = 3/12
+        6.0 / 12.0,  // P(bucket <= 2) = 6/12 = 0.5
+        9.0 / 12.0,  // P(bucket <= 3) = 9/12 = 0.75
+        11.0 / 12.0, // P(bucket <= 4) = 11/12
+        // bucket 5 is implicit (u >= 11/12)
+    ];
+
     // Pre-allocate output array - eliminates double allocation
     let mut weights = Array2::<f64>::zeros((n_bootstrap, n_units));
 
@@ -142,9 +154,20 @@ fn generate_webb_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
             for elem in row.iter_mut() {
                 let u = rng.gen::<f64>();
-                // Direct bucket computation: multiply by 6 and floor to get index 0-5
-                // Clamp to 5 to handle edge case where u == 1.0
-                let bucket = ((u * 6.0).floor() as usize).min(5);
+                // Find bucket using cumulative probabilities
+                let bucket = if u < cum_probs[0] {
+                    0
+                } else if u < cum_probs[1] {
+                    1
+                } else if u < cum_probs[2] {
+                    2
+                } else if u < cum_probs[3] {
+                    3
+                } else if u < cum_probs[4] {
+                    4
+                } else {
+                    5
+                };
                 *elem = weights_table[bucket];
             }
         });
@@ -225,4 +248,59 @@ mod tests {
         // Different seeds should produce different results
         assert_ne!(weights1, weights2);
     }
+
+    #[test]
+    fn test_webb_mean_approx_zero() {
+        let weights = generate_webb_batch(10000, 1, 42);
+        let mean: f64 = weights.iter().sum::<f64>() / weights.len() as f64;
+
+        // With 10000 samples, mean should be close to 0
+        assert!(
+            mean.abs() < 0.1,
+            "Webb mean should be close to 0, got {}",
+            mean
+        );
+    }
+
+    #[test]
+    fn test_webb_variance_approx_correct() {
+        // Webb's 6-point distribution with values ±√(3/2), ±1, ±√(1/2)
+        // and probabilities [1,2,3,3,2,1]/12 should have variance close to
+        // the theoretical value of 10/12 ≈ 0.833
+        let weights = generate_webb_batch(10000, 100, 42);
+        let n = weights.len() as f64;
+        let mean: f64 = weights.iter().sum::<f64>() / n;
+        let variance: f64 = weights.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
+
+        // Theoretical variance = 2 * (1/12 * 3/2 + 2/12 * 1 + 3/12 * 1/2) = 10/12 ≈ 0.833
+        // Allow some statistical variance in the estimate
+        assert!(
+            (variance - 0.833).abs() < 0.05,
+            "Webb variance should be ~0.833 (matching NumPy), got {}",
+            variance
+        );
+    }
+
+    #[test]
+    fn test_webb_values_correct() {
+        // Verify that Webb weights only take the expected 6 values
+        let weights = generate_webb_batch(100, 1000, 42);
+
+        let val1 = (3.0_f64 / 2.0).sqrt(); // ≈ 1.2247
+        let val2 = 1.0_f64;
+        let val3 = (1.0_f64 / 2.0).sqrt(); // ≈ 0.7071
+
+        let expected_values = [-val1, -val2, -val3, val3, val2, val1];
+
+        for w in weights.iter() {
+            let matches_expected = expected_values
+                .iter()
+                .any(|&expected| (*w - expected).abs() < 1e-10);
+            assert!(
+                matches_expected,
+                "Webb weight {} is not one of the expected values",
+                w
+            );
+        }
+    }
 }
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,83 @@
+"""
+Pytest configuration and shared fixtures for diff-diff tests.
+
+This module provides shared fixtures including lazy R availability checking
+to avoid import-time subprocess latency.
+"""
+
+import os
+import subprocess
+
+import pytest
+
+
+# =============================================================================
+# R Availability Fixtures (Lazy Loading)
+# =============================================================================
+
+_r_available_cache = None
+
+
+def _check_r_available() -> bool:
+    """
+    Check if R and the did package are available (cached).
+
+    This is called lazily when the r_available fixture is first used,
+    not at module import time, to avoid subprocess latency during test collection.
+
+    Returns
+    -------
+    bool
+        True if R and did package are available, False otherwise.
+    """
+    global _r_available_cache
+    if _r_available_cache is None:
+        # Allow environment override (matches DIFF_DIFF_BACKEND pattern)
+        r_env = os.environ.get("DIFF_DIFF_R", "auto").lower()
+        if r_env == "skip":
+            _r_available_cache = False
+        else:
+            try:
+                result = subprocess.run(
+                    ["Rscript", "-e", "library(did); cat('OK')"],
+                    capture_output=True,
+                    text=True,
+                    timeout=30,
+                )
+                _r_available_cache = result.returncode == 0 and "OK" in result.stdout
+            except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+                _r_available_cache = False
+    return _r_available_cache
+
+
+@pytest.fixture(scope="session")
+def r_available():
+    """
+    Lazy check for R availability.
+
+    This fixture is session-scoped and cached, so R availability is only
+    checked once per test session, and only when a test actually needs it.
+
+    Returns
+    -------
+    bool
+        True if R and did package are available.
+    """
+    return _check_r_available()
+
+
+@pytest.fixture
+def require_r(r_available):
+    """
+    Skip test if R is not available.
+
+    Use this fixture in tests that require R:
+
+    ```python
+    def test_comparison_with_r(require_r):
+        # This test will be skipped if R is not available
+        ...
+    ```
+    """
+    if not r_available:
+        pytest.skip("R or did package not available")
diff --git a/tests/test_methodology_callaway.py b/tests/test_methodology_callaway.py