|
29 | 29 | from scipy import sparse |
30 | 30 | from scipy.sparse.linalg import factorized as sparse_factorized |
31 | 31 |
|
| 32 | +# Maximum number of elements before falling back to per-column sparse aggregation. |
| 33 | +# 10M float64 elements ≈ 80 MB peak allocation. Above this, per-column .getcol() |
| 34 | +# trades throughput for bounded memory. |
| 35 | +_SPARSE_DENSE_THRESHOLD = 10_000_000 |
| 36 | + |
32 | 37 | from diff_diff.linalg import solve_ols |
33 | 38 | from diff_diff.two_stage_bootstrap import TwoStageDiDBootstrapMixin |
34 | 39 | from diff_diff.two_stage_results import TwoStageBootstrapResults, TwoStageDiDResults # noqa: F401 (re-export) |
@@ -1222,15 +1227,19 @@ def _compute_gmm_variance( |
1222 | 1227 | unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True) |
1223 | 1228 | G = len(unique_clusters) |
1224 | 1229 |
|
1225 | | - # Convert sparse to dense once for efficient cluster aggregation. |
1226 | | - # Total memory touched is identical to per-column .getcol().toarray(); |
1227 | | - # only peak allocation differs (full matrix vs one column at a time). |
1228 | | - # For panels with >100K FE columns, consider reverting to per-column |
1229 | | - # .getcol() to limit peak memory. |
1230 | | - weighted_X10_dense = weighted_X10.toarray() |
| 1230 | + n_elements = weighted_X10.shape[0] * weighted_X10.shape[1] |
1231 | 1231 | c_by_cluster = np.zeros((G, p)) |
1232 | | - for j_col in range(p): |
1233 | | - np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col]) |
| 1232 | + if n_elements > _SPARSE_DENSE_THRESHOLD: |
| 1233 | + # Per-column path: limits peak memory for large FE matrices |
| 1234 | + weighted_X10_csc = weighted_X10.tocsc() |
| 1235 | + for j_col in range(p): |
| 1236 | + col_data = weighted_X10_csc.getcol(j_col).toarray().ravel() |
| 1237 | + np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data) |
| 1238 | + else: |
| 1239 | + # Dense path: faster for moderate-size matrices |
| 1240 | + weighted_X10_dense = weighted_X10.toarray() |
| 1241 | + for j_col in range(p): |
| 1242 | + np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col]) |
1234 | 1243 |
|
1235 | 1244 | # 3. Per-cluster Stage 2 scores: X'_{2g} eps_{2g} |
1236 | 1245 | weighted_X2 = X_2 * eps_2[:, None] # (n x k) dense |
|
0 commit comments