Skip to content

Commit 200745c

Browse files
igerberclaude
andcommitted
Address CI review: fix logit coef expansion, reject string masks
P1 fixes from CI AI review (PR #238): - solve_logit(): track original column count and expand returned beta back to p+1 length after effective-sample column dropping. Previously returned a shortened vector breaking the solver contract. - subpopulation(): reject string/object masks that would silently coerce non-empty strings to True, defining the wrong domain. - REGISTRY.md: add Note entries for estimator-level replicate limitations (SunAbraham rejection, CS/ContinuousDiD/EfficientDiD bootstrap rejection) Tests: assert beta length p+1 after zero-weight rank-deficient solve, assert string mask raises ValueError. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 14f2110 commit 200745c

4 files changed

Lines changed: 57 additions & 13 deletions

File tree

diff_diff/linalg.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,6 +1186,10 @@ def solve_logit(
11861186
f"got '{rank_deficient_action}'"
11871187
)
11881188

1189+
# Track original column count for coefficient expansion at the end
1190+
k_original = X_with_intercept.shape[1]
1191+
eff_dropped_original: list = [] # indices in original column space
1192+
11891193
# Validate effective weighted sample when weights have zeros
11901194
if weights is not None and np.any(weights == 0):
11911195
pos_mask = weights > 0
@@ -1207,8 +1211,7 @@ def solve_logit(
12071211
f"{X_eff.shape[1]} parameters. Cannot identify logistic model."
12081212
)
12091213
# Check rank deficiency on positive-weight rows — full design may
1210-
# be full rank due to zero-weight padding. Use effective-sample rank
1211-
# result to drive column dropping (same flow as full-sample check).
1214+
# be full rank due to zero-weight padding.
12121215
eff_rank_info = _detect_rank_deficiency(X_eff)
12131216
if len(eff_rank_info[1]) > 0:
12141217
n_dropped_eff = len(eff_rank_info[1])
@@ -1226,13 +1229,12 @@ def solve_logit(
12261229
UserWarning,
12271230
stacklevel=2,
12281231
)
1229-
# Use the effective-sample rank info for column dropping
1230-
# (overrides the full-sample check below which may show no deficiency)
1231-
_eff_rank, _eff_dropped, _eff_pivot = eff_rank_info
1232-
X_with_intercept = np.delete(X_with_intercept, _eff_dropped, axis=1)
1232+
# Drop columns and track original indices for final expansion
1233+
eff_dropped_original = list(eff_rank_info[1])
1234+
X_with_intercept = np.delete(X_with_intercept, eff_rank_info[1], axis=1)
12331235
k = X_with_intercept.shape[1]
12341236

1235-
# Check rank deficiency once before iterating
1237+
# Check rank deficiency once before iterating (on possibly-shrunk matrix)
12361238
rank_info = _detect_rank_deficiency(X_with_intercept)
12371239
rank, dropped_cols, _ = rank_info
12381240
if len(dropped_cols) > 0:
@@ -1328,10 +1330,20 @@ def solve_logit(
13281330
stacklevel=2,
13291331
)
13301332

1331-
# Expand beta back to full size if columns were dropped
1332-
if len(dropped_cols) > 0:
1333-
beta_full = np.zeros(k)
1334-
beta_full[kept_cols] = beta_solve
1333+
# Expand beta back to original column count, accounting for columns
1334+
# dropped in both the effective-sample check and the full-sample check
1335+
if len(dropped_cols) > 0 or len(eff_dropped_original) > 0:
1336+
# First expand from X_solve columns back to post-eff-drop columns
1337+
beta_post_eff = np.zeros(k)
1338+
beta_post_eff[kept_cols] = beta_solve
1339+
1340+
# Then expand from post-eff-drop columns back to original columns
1341+
if len(eff_dropped_original) > 0:
1342+
beta_full = np.zeros(k_original)
1343+
kept_original = [i for i in range(k_original) if i not in eff_dropped_original]
1344+
beta_full[kept_original] = beta_post_eff
1345+
else:
1346+
beta_full = beta_post_eff
13351347
else:
13361348
beta_full = beta_solve
13371349

diff_diff/survey.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,18 @@ def subpopulation(
432432
"Subpopulation mask contains None/NA values. "
433433
"Provide a boolean mask with no missing values."
434434
)
435+
# Reject string/object masks — non-empty strings coerce to True
436+
# which silently defines the wrong domain
437+
if any(isinstance(v, str) for v in raw_mask):
438+
raise ValueError(
439+
"Subpopulation mask has object dtype with string values. "
440+
"Provide a boolean or numeric (0/1) mask, not strings."
441+
)
442+
if hasattr(raw_mask, 'dtype') and raw_mask.dtype.kind in ('U', 'S'):
443+
raise ValueError(
444+
"Subpopulation mask contains string values. "
445+
"Provide a boolean or numeric (0/1) mask."
446+
)
435447
mask_arr = raw_mask.astype(bool)
436448

437449
if len(mask_arr) != len(data):

docs/methodology/REGISTRY.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,14 @@ variance from the distribution of replicate estimates.
20212021
- **Note:** JKn requires explicit `replicate_strata` (per-replicate stratum
20222022
assignment). Auto-derivation from weight patterns is not supported.
20232023
- **Note:** Invalid replicate solves (singular/degenerate) are dropped with
2024-
a warning. Variance is computed from valid replicates only.
2024+
a warning. Variance is computed from valid replicates only. Fewer than 2
2025+
valid replicates returns NaN variance.
2026+
- **Note:** SunAbraham rejects replicate-weight designs with
2027+
`NotImplementedError` because the weighted within-transformation must be
2028+
recomputed per replicate (not yet implemented).
2029+
- **Note:** CallawaySantAnna, ContinuousDiD, and EfficientDiD reject
2030+
replicate weights with `n_bootstrap > 0`. Replicate weights provide
2031+
analytical variance; bootstrap is a separate inference mechanism.
20252032

20262033
### DEFF Diagnostics (Phase 6)
20272034

tests/test_survey_phase6.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -744,7 +744,20 @@ def test_solve_logit_rank_deficient_positive_weight_subset_warn_mode(self):
744744
w[i] = 1.0
745745
X[i, 1] = 5.0
746746
with pytest.warns(UserWarning, match="rank-deficient"):
747-
solve_logit(X, y, weights=w, rank_deficient_action="warn")
747+
beta, probs = solve_logit(X, y, weights=w, rank_deficient_action="warn")
748+
# Key assertion: beta must have original p+1 length (intercept + 2 covariates)
749+
assert len(beta) == 3, (
750+
f"Expected beta length 3 (p+1), got {len(beta)}. "
751+
f"Column dropping broke the coefficient vector shape."
752+
)
753+
754+
def test_subpopulation_string_mask_rejected(self, basic_did_data):
755+
"""Subpopulation mask with string values should be rejected."""
756+
sd = SurveyDesign(weights="weight")
757+
mask = ["yes"] * len(basic_did_data)
758+
mask[0] = "no"
759+
with pytest.raises(ValueError, match="string"):
760+
sd.subpopulation(basic_did_data, mask)
748761

749762
def test_replicate_if_no_divide_by_zero_warning(self):
750763
"""compute_replicate_if_variance should not warn on zero weights."""

0 commit comments

Comments
 (0)