Skip to content

Commit a67d940

Browse files
igerberclaude
andcommitted
Fix survey HC1 meat formula and survey-weighted rank-deficiency issues from PR #218 review (round 4)
P0: Replace scores'scores (w²*e²) with correct X'diag(w*e²)X in no-structure survey vcov branch. P1: Handle NaN coefficients in survey vcov callers (LinearRegression.fit, MultiPeriodDiD.fit) by computing on kept columns and expanding with _expand_vcov_with_nan. P2: Fix oracle test and add fweight oracle + rank-deficiency tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b05d34b commit a67d940

4 files changed

Lines changed: 173 additions & 6 deletions

File tree

diff_diff/estimators.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from diff_diff.linalg import (
2222
LinearRegression,
23+
_expand_vcov_with_nan,
2324
compute_r_squared,
2425
compute_robust_vcov,
2526
solve_ols,
@@ -1056,7 +1057,18 @@ def fit( # type: ignore[override]
10561057
if _use_survey_vcov:
10571058
from diff_diff.survey import compute_survey_vcov
10581059

1059-
vcov = compute_survey_vcov(X, residuals, resolved_survey)
1060+
nan_mask = np.isnan(coefficients)
1061+
if np.any(nan_mask):
1062+
kept_cols = np.where(~nan_mask)[0]
1063+
if len(kept_cols) > 0:
1064+
vcov_reduced = compute_survey_vcov(
1065+
X[:, kept_cols], residuals, resolved_survey
1066+
)
1067+
vcov = _expand_vcov_with_nan(vcov_reduced, X.shape[1], kept_cols)
1068+
else:
1069+
vcov = np.full((X.shape[1], X.shape[1]), np.nan)
1070+
else:
1071+
vcov = compute_survey_vcov(X, residuals, resolved_survey)
10601072
r_squared = compute_r_squared(y, residuals)
10611073

10621074
# Degrees of freedom: survey df overrides standard df

diff_diff/linalg.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1625,7 +1625,18 @@ def fit(
16251625
if _use_survey_vcov:
16261626
from diff_diff.survey import compute_survey_vcov
16271627

1628-
vcov = compute_survey_vcov(X, residuals, self.survey_design)
1628+
nan_mask = np.isnan(coefficients)
1629+
if np.any(nan_mask):
1630+
kept_cols = np.where(~nan_mask)[0]
1631+
if len(kept_cols) > 0:
1632+
vcov_reduced = compute_survey_vcov(
1633+
X[:, kept_cols], residuals, self.survey_design
1634+
)
1635+
vcov = _expand_vcov_with_nan(vcov_reduced, X.shape[1], kept_cols)
1636+
else:
1637+
vcov = np.full((X.shape[1], X.shape[1]), np.nan)
1638+
else:
1639+
vcov = compute_survey_vcov(X, residuals, self.survey_design)
16291640

16301641
# Store fitted attributes
16311642
self.coefficients_ = coefficients

diff_diff/survey.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,11 +447,15 @@ def compute_survey_vcov(
447447

448448
if strata is None and psu is None:
449449
# No survey structure beyond weights — fall back to weighted HC1
450+
# HC1 meat = X' diag(w * e²) X, NOT scores'scores which gives w²*e²
450451
# For fweights, df uses sum(w) - k (effective sample size)
451452
n_eff = n
452453
if resolved.weight_type == "fweight":
453454
n_eff = int(np.sum(weights))
454-
meat = scores.T @ scores
455+
if resolved.weight_type == "aweight":
456+
meat = np.dot(X.T, X * (residuals**2)[:, np.newaxis])
457+
else:
458+
meat = np.dot(X.T, X * (weights * residuals**2)[:, np.newaxis])
455459
adjustment = n_eff / (n_eff - k)
456460
meat *= adjustment
457461
elif strata is None and psu is not None:

tests/test_survey.py

Lines changed: 143 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -512,12 +512,11 @@ def test_weights_only_oracle(self):
512512
)
513513
survey_vcov = compute_survey_vcov(X, resid, resolved)
514514

515-
# Hand-compute weighted HC1: (X'WX)^{-1} * (sum w_i^2 X_i X_i' e_i^2) * n/(n-k) * (X'WX)^{-1}
515+
# Correct weighted HC1: (X'WX)^{-1} * X' diag(w * e²) X * n/(n-k) * (X'WX)^{-1}
516516
k = X.shape[1]
517517
XtWX = X.T @ (X * weights[:, np.newaxis])
518518
XtWX_inv = np.linalg.inv(XtWX)
519-
scores = X * (weights * resid)[:, np.newaxis]
520-
meat = scores.T @ scores
519+
meat = np.dot(X.T, X * (weights * resid**2)[:, np.newaxis])
521520
meat *= n / (n - k)
522521
oracle_vcov = XtWX_inv @ meat @ XtWX_inv
523522

@@ -1462,6 +1461,147 @@ def test_linear_regression_weighted_rank_deficient_robust(self):
14621461
for i in kept:
14631462
assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
14641463

1464+
def test_fweight_survey_oracle(self):
1465+
"""fweight SurveyDesign: survey vcov matches expanded-data unweighted HC1."""
1466+
np.random.seed(55)
1467+
n = 30
1468+
X_base = np.column_stack([np.ones(n), np.random.randn(n)])
1469+
y_base = 2.0 + X_base[:, 1] * 1.5 + np.random.randn(n) * 0.3
1470+
freq = np.random.choice([1, 2, 3], n).astype(float)
1471+
1472+
# WLS with fweights via survey
1473+
coef_fw, resid_fw, _ = solve_ols(
1474+
X_base, y_base, weights=freq, weight_type="fweight"
1475+
)
1476+
resolved = ResolvedSurveyDesign(
1477+
weights=freq,
1478+
weight_type="fweight",
1479+
strata=None,
1480+
psu=None,
1481+
fpc=None,
1482+
n_strata=0,
1483+
n_psu=0,
1484+
lonely_psu="remove",
1485+
)
1486+
survey_vcov = compute_survey_vcov(X_base, resid_fw, resolved)
1487+
1488+
# Oracle: expand data and compute unweighted HC1
1489+
X_exp = np.repeat(X_base, freq.astype(int), axis=0)
1490+
y_exp = np.repeat(y_base, freq.astype(int))
1491+
coef_exp, resid_exp, _ = solve_ols(X_exp, y_exp)
1492+
n_exp = X_exp.shape[0]
1493+
k = X_exp.shape[1]
1494+
XtX = X_exp.T @ X_exp
1495+
XtX_inv = np.linalg.inv(XtX)
1496+
meat = np.dot(X_exp.T, X_exp * (resid_exp**2)[:, np.newaxis])
1497+
meat *= n_exp / (n_exp - k)
1498+
oracle_vcov = XtX_inv @ meat @ XtX_inv
1499+
1500+
np.testing.assert_allclose(survey_vcov, oracle_vcov, atol=1e-10)
1501+
1502+
def test_survey_rank_deficient_with_psu(self):
1503+
"""LinearRegression + survey design (PSU) + rank deficiency: no crash."""
1504+
np.random.seed(43)
1505+
n = 50
1506+
x1 = np.random.randn(n)
1507+
X = np.column_stack([np.ones(n), x1, x1]) # duplicate col
1508+
y = 2.0 + 1.5 * x1 + np.random.randn(n) * 0.3
1509+
pw = np.random.uniform(0.5, 3.0, size=n)
1510+
psu = np.arange(n) # each obs is its own PSU
1511+
1512+
resolved = ResolvedSurveyDesign(
1513+
weights=pw,
1514+
weight_type="pweight",
1515+
strata=None,
1516+
psu=psu,
1517+
fpc=None,
1518+
n_strata=0,
1519+
n_psu=n,
1520+
lonely_psu="remove",
1521+
)
1522+
1523+
model = LinearRegression(
1524+
survey_design=resolved,
1525+
include_intercept=False,
1526+
rank_deficient_action="warn",
1527+
)
1528+
1529+
with warnings.catch_warnings():
1530+
warnings.simplefilter("ignore", UserWarning)
1531+
model.fit(X, y)
1532+
1533+
coef = model.coefficients_
1534+
resid = model.residuals_
1535+
vcov = model.vcov_
1536+
1537+
# One dropped coefficient
1538+
assert np.sum(np.isnan(coef)) == 1
1539+
1540+
# Residuals all finite
1541+
assert np.all(np.isfinite(resid))
1542+
1543+
# Identified coefficients have positive, finite SEs
1544+
kept = np.where(~np.isnan(coef))[0]
1545+
for i in kept:
1546+
assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
1547+
1548+
# Dropped column has NaN vcov
1549+
dropped = np.where(np.isnan(coef))[0]
1550+
for i in dropped:
1551+
assert np.all(np.isnan(vcov[i, :]))
1552+
assert np.all(np.isnan(vcov[:, i]))
1553+
1554+
def test_survey_rank_deficient_weights_only(self):
1555+
"""Weights-only survey + rank deficiency: no crash, correct NaN pattern."""
1556+
np.random.seed(44)
1557+
n = 50
1558+
x1 = np.random.randn(n)
1559+
X = np.column_stack([np.ones(n), x1, x1]) # duplicate col
1560+
y = 2.0 + 1.5 * x1 + np.random.randn(n) * 0.3
1561+
pw = np.random.uniform(0.5, 3.0, size=n)
1562+
1563+
resolved = ResolvedSurveyDesign(
1564+
weights=pw,
1565+
weight_type="pweight",
1566+
strata=None,
1567+
psu=None,
1568+
fpc=None,
1569+
n_strata=0,
1570+
n_psu=0,
1571+
lonely_psu="remove",
1572+
)
1573+
1574+
model = LinearRegression(
1575+
survey_design=resolved,
1576+
include_intercept=False,
1577+
rank_deficient_action="warn",
1578+
)
1579+
1580+
with warnings.catch_warnings():
1581+
warnings.simplefilter("ignore", UserWarning)
1582+
model.fit(X, y)
1583+
1584+
coef = model.coefficients_
1585+
resid = model.residuals_
1586+
vcov = model.vcov_
1587+
1588+
# One dropped coefficient
1589+
assert np.sum(np.isnan(coef)) == 1
1590+
1591+
# Residuals all finite
1592+
assert np.all(np.isfinite(resid))
1593+
1594+
# Identified coefficients have positive, finite SEs
1595+
kept = np.where(~np.isnan(coef))[0]
1596+
for i in kept:
1597+
assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
1598+
1599+
# Dropped column has NaN vcov
1600+
dropped = np.where(np.isnan(coef))[0]
1601+
for i in dropped:
1602+
assert np.all(np.isnan(vcov[i, :]))
1603+
assert np.all(np.isnan(vcov[:, i]))
1604+
14651605
def test_linear_regression_weighted_rank_deficient_classical(self):
14661606
"""LinearRegression with weights + classical vcov + rank deficiency."""
14671607
np.random.seed(42)

0 commit comments

Comments
 (0)