Skip to content

Commit 21690d2

Browse files
igerberclaude
andcommitted
Fix weights-only TSL consistency and conflicting-weights gap from PR #218 review (round 6)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8443480 commit 21690d2

4 files changed

Lines changed: 163 additions & 32 deletions

File tree

diff_diff/linalg.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1414,6 +1414,15 @@ class LinearRegression:
14141414
- "warn": Issue warning and drop linearly dependent columns (default)
14151415
- "error": Raise ValueError
14161416
- "silent": Drop columns silently without warning
1417+
weights : array-like, optional
1418+
Observation weights. When survey_design is provided, weights are
1419+
automatically derived from it (explicit weights are overridden).
1420+
weight_type : str, default "pweight"
1421+
Weight type: "pweight", "fweight", or "aweight".
1422+
survey_design : ResolvedSurveyDesign, optional
1423+
Resolved survey design for Taylor Series Linearization variance
1424+
estimation. When provided, weights and weight_type are canonicalized
1425+
from this object.
14171426
14181427
Attributes
14191428
----------
@@ -1541,10 +1550,22 @@ def fit(
15411550

15421551
if isinstance(self.survey_design, ResolvedSurveyDesign):
15431552
_use_survey_vcov = self.survey_design.needs_survey_vcov
1544-
# Auto-derive weights from survey_design if not explicitly provided
1545-
if self.weights is None:
1546-
self.weights = self.survey_design.weights
1547-
self.weight_type = self.survey_design.weight_type
1553+
# Canonicalize weights from survey_design to ensure consistency
1554+
# between coefficient estimation and survey vcov computation
1555+
if (
1556+
self.weights is not None
1557+
and self.weights is not self.survey_design.weights
1558+
):
1559+
warnings.warn(
1560+
"Explicit weights= differ from survey_design.weights. "
1561+
"Using survey_design weights for both coefficient "
1562+
"estimation and variance computation to ensure "
1563+
"consistency.",
1564+
UserWarning,
1565+
stacklevel=2,
1566+
)
1567+
self.weights = self.survey_design.weights
1568+
self.weight_type = self.survey_design.weight_type
15481569

15491570
if self.robust or effective_cluster_ids is not None:
15501571
# Use solve_ols with robust/cluster SEs

diff_diff/survey.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -475,18 +475,13 @@ def compute_survey_vcov(
475475
psu = resolved.psu
476476

477477
if strata is None and psu is None:
478-
# No survey structure beyond weights — fall back to weighted HC1
479-
# HC1 meat = X' diag(w * e²) X, NOT scores'scores which gives w²*e²
480-
# For fweights, df uses sum(w) - k (effective sample size)
481-
n_eff = n
482-
if resolved.weight_type == "fweight":
483-
n_eff = int(np.sum(weights))
484-
if resolved.weight_type == "aweight":
485-
meat = np.dot(X.T, X * (residuals**2)[:, np.newaxis])
486-
else:
487-
meat = np.dot(X.T, X * (weights * residuals**2)[:, np.newaxis])
488-
adjustment = n_eff / (n_eff - k)
489-
meat *= adjustment
478+
# No survey structure beyond weights — use implicit per-observation PSUs
479+
# so the TSL construction is consistent across all branches.
480+
# Each observation is its own PSU; scores are already per-obs.
481+
psu_mean = scores.mean(axis=0, keepdims=True)
482+
centered = scores - psu_mean
483+
adjustment = n / (n - 1)
484+
meat = adjustment * (centered.T @ centered)
490485
elif strata is None and psu is not None:
491486
# No strata, but PSU present — single-stratum cluster-robust
492487
psu_scores = pd.DataFrame(scores).groupby(psu).sum().values

docs/methodology/REGISTRY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1858,6 +1858,10 @@ unequal selection probabilities).
18581858
- **Note:** For unstratified designs with a single PSU, all `lonely_psu` modes
18591859
produce NaN variance. The "adjust" mode cannot center against a global mean
18601860
when there is only one stratum (the single PSU is the entire sample).
1861+
- **Note:** Weights-only designs (no explicit PSU or strata) use implicit
1862+
per-observation PSUs for the TSL meat computation, consistent with the
1863+
stratified-no-PSU path. The adjustment factor is `n/(n-1)` (not HC1's
1864+
`n/(n-k)`).
18611865

18621866
### Weight Type Effects on Inference
18631867

tests/test_survey.py

Lines changed: 127 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ def test_no_strata_degeneracy(self):
490490
np.testing.assert_allclose(survey_vcov, oracle_vcov, atol=1e-12)
491491

492492
def test_weights_only_oracle(self):
493-
"""Weights-only design: survey vcov matches hand-computed weighted HC1."""
493+
"""Weights-only design: survey vcov uses implicit-PSU TSL."""
494494
np.random.seed(81)
495495
n = 60
496496
raw_weights = np.random.uniform(0.5, 3.0, n)
@@ -512,12 +512,17 @@ def test_weights_only_oracle(self):
512512
)
513513
survey_vcov = compute_survey_vcov(X, resid, resolved)
514514

515-
# Correct weighted HC1: (X'WX)^{-1} * X' diag(w * e²) X * n/(n-k) * (X'WX)^{-1}
515+
# TSL with implicit per-observation PSUs:
516+
# meat = sum_i (s_i - s_bar)(s_i - s_bar)' * n/(n-1)
517+
# where s_i = w_i * X_i * e_i
516518
k = X.shape[1]
517519
XtWX = X.T @ (X * weights[:, np.newaxis])
518520
XtWX_inv = np.linalg.inv(XtWX)
519-
meat = np.dot(X.T, X * (weights * resid**2)[:, np.newaxis])
520-
meat *= n / (n - k)
521+
scores = X * (weights * resid)[:, np.newaxis]
522+
scores_mean = scores.mean(axis=0, keepdims=True)
523+
centered = scores - scores_mean
524+
adjustment = n / (n - 1)
525+
meat = adjustment * (centered.T @ centered)
521526
oracle_vcov = XtWX_inv @ meat @ XtWX_inv
522527

523528
np.testing.assert_allclose(survey_vcov, oracle_vcov, atol=1e-12)
@@ -1462,7 +1467,7 @@ def test_linear_regression_weighted_rank_deficient_robust(self):
14621467
assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
14631468

14641469
def test_fweight_survey_oracle(self):
1465-
"""fweight SurveyDesign: survey vcov matches expanded-data unweighted HC1."""
1470+
"""fweight SurveyDesign: survey vcov uses implicit-PSU TSL."""
14661471
np.random.seed(55)
14671472
n = 30
14681473
X_base = np.column_stack([np.ones(n), np.random.randn(n)])
@@ -1485,17 +1490,17 @@ def test_fweight_survey_oracle(self):
14851490
)
14861491
survey_vcov = compute_survey_vcov(X_base, resid_fw, resolved)
14871492

1488-
# Oracle: expand data and compute unweighted HC1
1489-
X_exp = np.repeat(X_base, freq.astype(int), axis=0)
1490-
y_exp = np.repeat(y_base, freq.astype(int))
1491-
coef_exp, resid_exp, _ = solve_ols(X_exp, y_exp)
1492-
n_exp = X_exp.shape[0]
1493-
k = X_exp.shape[1]
1494-
XtX = X_exp.T @ X_exp
1495-
XtX_inv = np.linalg.inv(XtX)
1496-
meat = np.dot(X_exp.T, X_exp * (resid_exp**2)[:, np.newaxis])
1497-
meat *= n_exp / (n_exp - k)
1498-
oracle_vcov = XtX_inv @ meat @ XtX_inv
1493+
# Oracle: TSL with implicit per-observation PSUs
1494+
# scores = w_i * X_i * e_i, meat = n/(n-1) * (scores - mean)' (scores - mean)
1495+
k = X_base.shape[1]
1496+
XtWX = X_base.T @ (X_base * freq[:, np.newaxis])
1497+
XtWX_inv = np.linalg.inv(XtWX)
1498+
scores = X_base * (freq * resid_fw)[:, np.newaxis]
1499+
scores_mean = scores.mean(axis=0, keepdims=True)
1500+
centered = scores - scores_mean
1501+
adjustment = n / (n - 1)
1502+
meat = adjustment * (centered.T @ centered)
1503+
oracle_vcov = XtWX_inv @ meat @ XtWX_inv
14991504

15001505
np.testing.assert_allclose(survey_vcov, oracle_vcov, atol=1e-10)
15011506

@@ -1832,3 +1837,109 @@ def test_resolve_warns_single_psu_unstratified(self):
18321837
)
18331838
with pytest.warns(UserWarning, match=r"Only 1 PSU"):
18341839
design.resolve(df)
1840+
1841+
1842+
class TestRound6Fixes:
1843+
"""Tests for round-6 review fixes (PR #218)."""
1844+
1845+
def test_weights_only_matches_explicit_individual_psu(self):
1846+
"""Weights-only design produces identical vcov to explicit psu=arange(n)."""
1847+
np.random.seed(600)
1848+
n = 50
1849+
X = np.column_stack([np.ones(n), np.random.randn(n)])
1850+
y = 1.0 + X[:, 1] * 0.5 + np.random.randn(n) * 0.4
1851+
weights = np.random.uniform(0.5, 3.0, n)
1852+
weights = weights * (n / np.sum(weights))
1853+
1854+
coef, resid, _ = solve_ols(X, y, weights=weights, weight_type="pweight")
1855+
1856+
# Weights-only design (no PSU)
1857+
resolved_wo = ResolvedSurveyDesign(
1858+
weights=weights,
1859+
weight_type="pweight",
1860+
strata=None,
1861+
psu=None,
1862+
fpc=None,
1863+
n_strata=0,
1864+
n_psu=0,
1865+
lonely_psu="remove",
1866+
)
1867+
vcov_wo = compute_survey_vcov(X, resid, resolved_wo)
1868+
1869+
# Explicit individual-PSU design
1870+
resolved_psu = ResolvedSurveyDesign(
1871+
weights=weights,
1872+
weight_type="pweight",
1873+
strata=None,
1874+
psu=np.arange(n),
1875+
fpc=None,
1876+
n_strata=0,
1877+
n_psu=n,
1878+
lonely_psu="remove",
1879+
)
1880+
vcov_psu = compute_survey_vcov(X, resid, resolved_psu)
1881+
1882+
np.testing.assert_allclose(vcov_wo, vcov_psu, atol=1e-12)
1883+
1884+
def test_conflicting_weights_warns_and_uses_survey(self):
1885+
"""Explicit weights differing from survey_design triggers warning."""
1886+
np.random.seed(601)
1887+
n = 40
1888+
X = np.random.randn(n, 2)
1889+
y = X @ [1.0, 0.5] + np.random.randn(n) * 0.3
1890+
survey_weights = np.random.uniform(0.5, 3.0, n)
1891+
different_weights = np.random.uniform(1.0, 5.0, n)
1892+
1893+
resolved = ResolvedSurveyDesign(
1894+
weights=survey_weights,
1895+
weight_type="pweight",
1896+
strata=None,
1897+
psu=None,
1898+
fpc=None,
1899+
n_strata=0,
1900+
n_psu=0,
1901+
lonely_psu="remove",
1902+
)
1903+
1904+
# Fit with conflicting explicit weights — should warn
1905+
reg_conflict = LinearRegression(
1906+
weights=different_weights, survey_design=resolved
1907+
)
1908+
with pytest.warns(UserWarning, match="differ from survey_design"):
1909+
reg_conflict.fit(X, y)
1910+
1911+
# Fit with only survey_design — no explicit weights
1912+
reg_survey = LinearRegression(survey_design=resolved)
1913+
reg_survey.fit(X, y)
1914+
1915+
np.testing.assert_allclose(
1916+
reg_conflict.coefficients_, reg_survey.coefficients_, atol=1e-14
1917+
)
1918+
np.testing.assert_allclose(
1919+
reg_conflict.vcov_, reg_survey.vcov_, atol=1e-14
1920+
)
1921+
1922+
def test_matching_weights_no_warning(self):
1923+
"""Same array object passed as weights and in survey_design: no warning."""
1924+
np.random.seed(602)
1925+
n = 40
1926+
X = np.random.randn(n, 2)
1927+
y = X @ [1.0, 0.5] + np.random.randn(n) * 0.3
1928+
weights = np.random.uniform(0.5, 3.0, n)
1929+
1930+
resolved = ResolvedSurveyDesign(
1931+
weights=weights,
1932+
weight_type="pweight",
1933+
strata=None,
1934+
psu=None,
1935+
fpc=None,
1936+
n_strata=0,
1937+
n_psu=0,
1938+
lonely_psu="remove",
1939+
)
1940+
1941+
# Same array object — should NOT warn
1942+
reg = LinearRegression(weights=weights, survey_design=resolved)
1943+
with warnings.catch_warnings():
1944+
warnings.simplefilter("error")
1945+
reg.fit(X, y)

0 commit comments

Comments
 (0)