Skip to content

Commit 75beec7

Browse files
igerberclaude
andcommitted
Codex CI R8 P1: cluster-aware sparse density gate + sparse+cluster regressions
Two related P1s on R8 (both about the sparse+cluster interaction). P1 (Performance) — density gate is now cluster-aware The R6 density gate counted ALL spatial in-range pairs before the cluster mask was applied, but the actual CSR matrix stores only within-cluster pairs (the inner row-loop drops cross-cluster neighbors before populating row/col/data lists). On large clustered panels — exactly the use case sparse+cluster is supposed to win — spatial density could be ~100% while within-cluster density is tiny, and we'd spuriously fall back to dense O(n²) work. Fix: when the unrefined spatial density exceeds the threshold AND `cluster_codes is not None`, refine by counting only within-cluster in-range pairs (per-cluster kd-trees, summed). The refinement runs only when needed (the gate is already cleared if spatial density is below threshold), so the per-cluster sub-tree builds are amortized. Stores the projected coordinates as `tree_coords` once on the haversine branch (was `xyz` locally before; renaming makes the per-cluster build symmetric across metrics). P1 (Documentation/Tests) — sparse+cluster regression coverage Sparse coverage previously exercised unclustered dense-vs-sparse parity; cluster coverage previously exercised dense/estimator behavior — but the combined sparse+cluster path was load-bearing and untested. Adds three regressions: - test_sparse_with_cluster_matches_dense: cross-sectional 600-row panel, 8 clusters, dense vs forced-sparse parity at atol=1e-10. - test_sparse_with_cluster_panel_matches_dense: 3-period block- decomposed panel, 200 units, 5 time-invariant clusters, dense vs forced-sparse parity at atol=1e-10. - test_sparse_density_gate_cluster_aware: tight spatial cluster (100% global density) + 50 small clusters → within-cluster density << 30%. Locks the refinement contract: NO density warning, sparse path executes, result matches dense. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 04a5fa1 commit 75beec7

2 files changed

Lines changed: 160 additions & 0 deletions

File tree

diff_diff/conley.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,11 +457,13 @@ def _compute_spatial_bartlett_meat_sparse(
457457
query_r = 2.0 * np.sin(arc_radians / 2.0)
458458
query_r *= 1.0 + 1e-12
459459
tree = cKDTree(xyz)
460+
tree_coords = xyz # used for per-cluster sub-trees in the density gate
460461
elif metric == "euclidean":
461462
# Small relative epsilon for symmetry with the haversine branch.
462463
# Bartlett's <=-vs-< boundary is moot since kernel is exactly 0 at u=1.
463464
query_r = cutoff * (1.0 + 1e-12)
464465
tree = cKDTree(coords)
466+
tree_coords = coords
465467
else:
466468
raise ValueError(
467469
"sparse Conley path requires metric in {'haversine', 'euclidean'}; "
@@ -475,6 +477,25 @@ def _compute_spatial_bartlett_meat_sparse(
475477
# memory than dense float64). Codex CI R6 P2.
476478
n_pairs_in_range = int(tree.count_neighbors(tree, r=query_r, p=2.0))
477479
density = n_pairs_in_range / float(n * n)
480+
if density > density_threshold and cluster_codes is not None:
481+
# Cluster-aware refinement: the actual CSR nnz reflects within-
482+
# cluster pairs only (the inner loop drops cross-cluster neighbors
483+
# before adding to the sparse matrix). On large clustered panels,
484+
# spatial density can be ~100% while within-cluster density is
485+
# tiny — without refinement we'd spuriously fall back to dense
486+
# exactly when sparse helps most. Refine by counting per-cluster.
487+
# Codex CI R8 P1: the unrefined density check was cluster-blind.
488+
refined_pairs = 0
489+
for c in np.unique(cluster_codes):
490+
in_c = cluster_codes == c
491+
n_c = int(in_c.sum())
492+
if n_c <= 1:
493+
# Singleton cluster contributes only the diagonal self-pair.
494+
refined_pairs += n_c
495+
continue
496+
sub_tree = cKDTree(tree_coords[in_c])
497+
refined_pairs += int(sub_tree.count_neighbors(sub_tree, r=query_r, p=2.0))
498+
density = refined_pairs / float(n * n)
478499
if density > density_threshold:
479500
warnings.warn(
480501
f"Conley sparse path: neighbor density {density:.1%} exceeds "

tests/test_conley_vcov.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3125,6 +3125,145 @@ def test_sparse_density_gate_does_not_trigger_below_threshold(self):
31253125
len(density_warnings) == 0
31263126
), "Density gate should not have triggered at low density"
31273127

3128+
def test_sparse_with_cluster_matches_dense(self):
3129+
"""Sparse + combined cluster kernel matches the dense path bit-for-bit
3130+
at atol=1e-10 (cross-sectional). Codex CI R8 P1: load-bearing
3131+
sparse+cluster regression that was missing prior."""
3132+
rng = np.random.default_rng(seed=181)
3133+
n = 600
3134+
coords = rng.uniform(0.0, 100.0, size=(n, 2))
3135+
cluster_ids = rng.integers(0, 8, size=n) # 8 clusters
3136+
X = np.column_stack([np.ones(n), rng.standard_normal(n), rng.standard_normal(n)])
3137+
y = X @ np.array([1.0, 1.5, -0.5]) + rng.standard_normal(n) * 0.4
3138+
coefs, *_ = np.linalg.lstsq(X, y, rcond=None)
3139+
residuals = y - X @ coefs
3140+
bread = X.T @ X
3141+
V_dense = _compute_conley_vcov(
3142+
X,
3143+
residuals,
3144+
coords,
3145+
15.0,
3146+
"euclidean",
3147+
"bartlett",
3148+
bread,
3149+
cluster_ids=cluster_ids,
3150+
_conley_sparse=False,
3151+
)
3152+
V_sparse = _compute_conley_vcov(
3153+
X,
3154+
residuals,
3155+
coords,
3156+
15.0,
3157+
"euclidean",
3158+
"bartlett",
3159+
bread,
3160+
cluster_ids=cluster_ids,
3161+
_conley_sparse=True,
3162+
)
3163+
np.testing.assert_allclose(V_sparse, V_dense, atol=1e-10, rtol=1e-10)
3164+
3165+
def test_sparse_with_cluster_panel_matches_dense(self):
3166+
"""Sparse + cluster + panel block-decomposed sandwich matches the
3167+
dense path on a 3-period panel with time-invariant cluster.
3168+
Codex CI R8 P1."""
3169+
rng = np.random.default_rng(seed=191)
3170+
n_units = 200
3171+
T = 3
3172+
unit = np.repeat(np.arange(n_units), T)
3173+
time = np.tile(np.arange(T), n_units)
3174+
n = n_units * T
3175+
# Time-invariant coords + cluster (per-unit)
3176+
unit_coords = rng.uniform(0.0, 100.0, size=(n_units, 2))
3177+
coords = unit_coords[unit]
3178+
cluster_per_unit = rng.integers(0, 5, size=n_units)
3179+
cluster_ids = cluster_per_unit[unit]
3180+
X = np.column_stack([np.ones(n), rng.standard_normal(n), rng.standard_normal(n)])
3181+
y = X @ np.array([1.0, 1.5, -0.3]) + rng.standard_normal(n) * 0.4
3182+
coefs, *_ = np.linalg.lstsq(X, y, rcond=None)
3183+
residuals = y - X @ coefs
3184+
bread = X.T @ X
3185+
V_dense = _compute_conley_vcov(
3186+
X,
3187+
residuals,
3188+
coords,
3189+
15.0,
3190+
"euclidean",
3191+
"bartlett",
3192+
bread,
3193+
time=time,
3194+
unit=unit,
3195+
lag_cutoff=1,
3196+
cluster_ids=cluster_ids,
3197+
_conley_sparse=False,
3198+
)
3199+
V_sparse = _compute_conley_vcov(
3200+
X,
3201+
residuals,
3202+
coords,
3203+
15.0,
3204+
"euclidean",
3205+
"bartlett",
3206+
bread,
3207+
time=time,
3208+
unit=unit,
3209+
lag_cutoff=1,
3210+
cluster_ids=cluster_ids,
3211+
_conley_sparse=True,
3212+
)
3213+
np.testing.assert_allclose(V_sparse, V_dense, atol=1e-10, rtol=1e-10)
3214+
3215+
def test_sparse_density_gate_cluster_aware(self):
3216+
"""High global spatial density + many small clusters: within-cluster
3217+
density is low, so the sparse path should NOT spuriously fall back
3218+
to dense. Tests the cluster-aware refinement of the density gate.
3219+
Codex CI R8 P1."""
3220+
# Tight spatial cluster of points → 100% global spatial density,
3221+
# but split into many small disjoint clusters so post-mask density
3222+
# is well below 30%.
3223+
rng = np.random.default_rng(seed=199)
3224+
n = 200
3225+
coords = rng.uniform(0.0, 5.0, size=(n, 2)) # tight cluster
3226+
# 50 clusters of 4 points each → within-cluster nnz = 4*4 = 16 per
3227+
# cluster, total = 50*16 = 800 = 800/(200*200) = 2% density << 30%.
3228+
cluster_ids = np.repeat(np.arange(50), 4)
3229+
X = np.column_stack([np.ones(n), rng.standard_normal(n)])
3230+
y = X @ np.array([1.0, 1.5]) + rng.standard_normal(n) * 0.4
3231+
coefs, *_ = np.linalg.lstsq(X, y, rcond=None)
3232+
residuals = y - X @ coefs
3233+
bread = X.T @ X
3234+
with warnings.catch_warnings(record=True) as w:
3235+
warnings.simplefilter("always")
3236+
V_sparse = _compute_conley_vcov(
3237+
X,
3238+
residuals,
3239+
coords,
3240+
1e6, # cutoff > data span → 100% global density
3241+
"euclidean",
3242+
"bartlett",
3243+
bread,
3244+
cluster_ids=cluster_ids,
3245+
_conley_sparse=True,
3246+
)
3247+
density_warnings = [msg for msg in w if "exceeds threshold" in str(msg.message)]
3248+
assert len(density_warnings) == 0, (
3249+
"Density gate spuriously fell back to dense even though "
3250+
"within-cluster density is low — cluster-aware refinement "
3251+
"is not working."
3252+
)
3253+
# Result must equal the dense path (sparse path executed correctly)
3254+
V_dense = _compute_conley_vcov(
3255+
X,
3256+
residuals,
3257+
coords,
3258+
1e6,
3259+
"euclidean",
3260+
"bartlett",
3261+
bread,
3262+
cluster_ids=cluster_ids,
3263+
_conley_sparse=False,
3264+
)
3265+
np.testing.assert_allclose(V_sparse, V_dense, atol=1e-10, rtol=1e-10)
3266+
31283267
def test_sparse_haversine_cutoff_at_exactly_half_earth_circumference(self):
31293268
"""Cutoff = π·R_earth: chord radius = 2 (sphere diameter); all
31303269
pairs are included. Bartlett at u=1 returns 0, so the antipodal

0 commit comments

Comments
 (0)