Add sparse size guard, remove stale SA params, add tests for PR #165 round 2

igerber · claude · igerber · commit 3b2e40d82db0 · 2026-02-17T13:16:56.000-05:00
- Add _SPARSE_DENSE_THRESHOLD in two_stage.py with a .tocsc() + per-column .getcol()
  fallback for large FE matrices; apply the same pattern in the bootstrap module
- Remove min_pre_periods/min_post_periods from README SunAbraham table
- Add test_removed_params_raise_typeerror for SunAbraham
- Add test_sparse_fallback_path for TwoStageDiD dense/sparse equivalence

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -2064,8 +2064,6 @@ SunAbraham(
 | `time` | str | Time period column |
 | `first_treat` | str | Column with first treatment period (0 for never-treated) |
 | `covariates` | list | Covariate column names |
-| `min_pre_periods` | int | Minimum pre-treatment periods to include |
-| `min_post_periods` | int | Minimum post-treatment periods to include |
 
 ### SunAbrahamResults
 
diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py
@@ -29,6 +29,11 @@
 from scipy import sparse
 from scipy.sparse.linalg import factorized as sparse_factorized
 
+# Maximum dense size (rows x columns) of a sparse matrix before cluster aggregation
+# falls back to the per-column path. 10M float64 elements ≈ 80 MB peak allocation.
+# Above this, per-column .getcol() trades throughput for bounded memory.
+_SPARSE_DENSE_THRESHOLD = 10_000_000
+
 from diff_diff.linalg import solve_ols
 from diff_diff.two_stage_bootstrap import TwoStageDiDBootstrapMixin
 from diff_diff.two_stage_results import TwoStageBootstrapResults, TwoStageDiDResults  # noqa: F401 (re-export)
@@ -1222,15 +1227,19 @@ def _compute_gmm_variance(
         unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True)
         G = len(unique_clusters)
 
-        # Convert sparse to dense once for efficient cluster aggregation.
-        # Total memory touched is identical to per-column .getcol().toarray();
-        # only peak allocation differs (full matrix vs one column at a time).
-        # For panels with >100K FE columns, consider reverting to per-column
-        # .getcol() to limit peak memory.
-        weighted_X10_dense = weighted_X10.toarray()
+        n_elements = weighted_X10.shape[0] * weighted_X10.shape[1]
         c_by_cluster = np.zeros((G, p))
-        for j_col in range(p):
-            np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
+        if n_elements > _SPARSE_DENSE_THRESHOLD:
+            # Per-column path: limits peak memory for large FE matrices
+            weighted_X10_csc = weighted_X10.tocsc()
+            for j_col in range(p):
+                col_data = weighted_X10_csc.getcol(j_col).toarray().ravel()
+                np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data)
+        else:
+            # Dense path: faster for moderate-size matrices
+            weighted_X10_dense = weighted_X10.toarray()
+            for j_col in range(p):
+                np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
 
         # 3. Per-cluster Stage 2 scores: X'_{2g} eps_{2g}
         weighted_X2 = X_2 * eps_2[:, None]  # (n x k) dense
diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py
@@ -15,6 +15,7 @@
 
 from diff_diff.linalg import solve_ols
 from diff_diff.staggered_bootstrap import _generate_bootstrap_weights_batch
+from diff_diff.two_stage import _SPARSE_DENSE_THRESHOLD
 from diff_diff.two_stage_results import TwoStageBootstrapResults
 
 __all__ = [
@@ -106,12 +107,19 @@ def _compute_cluster_S_scores(
         unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True)
         G = len(unique_clusters)
 
-        # Convert sparse to dense once (see _compute_gmm_variance for memory note).
-        # For panels with >100K FE columns, consider per-column .getcol() instead.
-        weighted_X10_dense = weighted_X10.toarray()
+        n_elements = weighted_X10.shape[0] * weighted_X10.shape[1]
         c_by_cluster = np.zeros((G, p))
-        for j_col in range(p):
-            np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
+        if n_elements > _SPARSE_DENSE_THRESHOLD:
+            # Per-column path: limits peak memory for large FE matrices
+            weighted_X10_csc = weighted_X10.tocsc()
+            for j_col in range(p):
+                col_data = weighted_X10_csc.getcol(j_col).toarray().ravel()
+                np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data)
+        else:
+            # Dense path: faster for moderate-size matrices
+            weighted_X10_dense = weighted_X10.toarray()
+            for j_col in range(p):
+                np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
 
         weighted_X2 = X_2 * eps_2[:, None]
         s2_by_cluster = np.zeros((G, k))
diff --git a/tests/test_sun_abraham.py b/tests/test_sun_abraham.py
@@ -1442,6 +1442,15 @@ def test_never_treated_inf_encoding(self):
             f"SE differs: inf={results_inf.overall_se}, zero={results_zero.overall_se}"
         )
 
+    def test_removed_params_raise_typeerror(self):
+        """Removed min_pre_periods/min_post_periods raise TypeError."""
+        data = generate_staggered_data(n_units=30, n_periods=6, seed=42)
+        sa = SunAbraham(n_bootstrap=0)
+        with pytest.raises(TypeError, match="unexpected keyword argument"):
+            sa.fit(data, "outcome", "unit", "time", "first_treat", min_pre_periods=2)
+        with pytest.raises(TypeError, match="unexpected keyword argument"):
+            sa.fit(data, "outcome", "unit", "time", "first_treat", min_post_periods=2)
+
     def test_all_never_treated_inf_raises(self):
         """Test that all-never-treated data with np.inf encoding raises ValueError."""
         data = generate_staggered_data(n_units=100, n_periods=10, n_cohorts=3, seed=42)
diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py
@@ -1145,3 +1145,35 @@ def test_print_summary(self, capsys):
         results.print_summary()
         captured = capsys.readouterr()
         assert "Two-Stage DiD" in captured.out
+
+    def test_sparse_fallback_path(self):
+        """Size guard falls back to per-column path and produces same results."""
+        import diff_diff.two_stage as ts_mod
+
+        data = generate_test_data(n_units=50, n_periods=6, seed=42)
+
+        # Run with normal (high) threshold — uses dense path
+        result_dense = TwoStageDiD().fit(
+            data, outcome="outcome", unit="unit", time="time", first_treat="first_treat"
+        )
+
+        # Patch two_stage's threshold to 1 to force its per-column path (bootstrap copy unaffected)
+        orig = ts_mod._SPARSE_DENSE_THRESHOLD
+        try:
+            ts_mod._SPARSE_DENSE_THRESHOLD = 1
+            result_sparse = TwoStageDiD().fit(
+                data,
+                outcome="outcome",
+                unit="unit",
+                time="time",
+                first_treat="first_treat",
+            )
+        finally:
+            ts_mod._SPARSE_DENSE_THRESHOLD = orig
+
+        np.testing.assert_allclose(
+            result_dense.overall_att, result_sparse.overall_att, rtol=1e-10
+        )
+        np.testing.assert_allclose(
+            result_dense.overall_se, result_sparse.overall_se, rtol=1e-10
+        )