Fix survey HC1 meat formula and survey-weighted rank-deficiency issues from PR #218 review (round 4)

igerber · claude · igerber · commit a67d940606cb · 2026-03-20T15:56:47.000-04:00
P0: Replace scores'scores (w²*e²) with correct X'diag(w*e²)X in no-structure
survey vcov branch. P1: Handle NaN coefficients in survey vcov callers
(LinearRegression.fit, MultiPeriodDiD.fit) by computing on kept columns and
expanding with _expand_vcov_with_nan. P2: Fix oracle test and add fweight
oracle + rank-deficiency tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -20,6 +20,7 @@
 
 from diff_diff.linalg import (
     LinearRegression,
+    _expand_vcov_with_nan,
     compute_r_squared,
     compute_robust_vcov,
     solve_ols,
@@ -1056,7 +1057,18 @@ def fit(  # type: ignore[override]
         if _use_survey_vcov:
             from diff_diff.survey import compute_survey_vcov
 
-            vcov = compute_survey_vcov(X, residuals, resolved_survey)
+            nan_mask = np.isnan(coefficients)
+            if np.any(nan_mask):
+                kept_cols = np.where(~nan_mask)[0]
+                if len(kept_cols) > 0:
+                    vcov_reduced = compute_survey_vcov(
+                        X[:, kept_cols], residuals, resolved_survey
+                    )
+                    vcov = _expand_vcov_with_nan(vcov_reduced, X.shape[1], kept_cols)
+                else:
+                    vcov = np.full((X.shape[1], X.shape[1]), np.nan)
+            else:
+                vcov = compute_survey_vcov(X, residuals, resolved_survey)
         r_squared = compute_r_squared(y, residuals)
 
         # Degrees of freedom: survey df overrides standard df
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
@@ -1625,7 +1625,18 @@ def fit(
         if _use_survey_vcov:
             from diff_diff.survey import compute_survey_vcov
 
-            vcov = compute_survey_vcov(X, residuals, self.survey_design)
+            nan_mask = np.isnan(coefficients)
+            if np.any(nan_mask):
+                kept_cols = np.where(~nan_mask)[0]
+                if len(kept_cols) > 0:
+                    vcov_reduced = compute_survey_vcov(
+                        X[:, kept_cols], residuals, self.survey_design
+                    )
+                    vcov = _expand_vcov_with_nan(vcov_reduced, X.shape[1], kept_cols)
+                else:
+                    vcov = np.full((X.shape[1], X.shape[1]), np.nan)
+            else:
+                vcov = compute_survey_vcov(X, residuals, self.survey_design)
 
         # Store fitted attributes
         self.coefficients_ = coefficients
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -447,11 +447,15 @@ def compute_survey_vcov(
 
     if strata is None and psu is None:
         # No survey structure beyond weights — fall back to weighted HC1
+        # HC1 meat = X' diag(w * e²) X (aweight: X' diag(e²) X), NOT scores'scores (w² * e²)
         # For fweights, df uses sum(w) - k (effective sample size)
         n_eff = n
         if resolved.weight_type == "fweight":
             n_eff = int(np.sum(weights))
-        meat = scores.T @ scores
+        if resolved.weight_type == "aweight":
+            meat = np.dot(X.T, X * (residuals**2)[:, np.newaxis])
+        else:
+            meat = np.dot(X.T, X * (weights * residuals**2)[:, np.newaxis])
         adjustment = n_eff / (n_eff - k)
         meat *= adjustment
     elif strata is None and psu is not None:
diff --git a/tests/test_survey.py b/tests/test_survey.py
@@ -512,12 +512,11 @@ def test_weights_only_oracle(self):
         )
         survey_vcov = compute_survey_vcov(X, resid, resolved)
 
-        # Hand-compute weighted HC1: (X'WX)^{-1} * (sum w_i^2 X_i X_i' e_i^2) * n/(n-k) * (X'WX)^{-1}
+        # Correct weighted HC1: (X'WX)^{-1} * X' diag(w * e²) X * n/(n-k) * (X'WX)^{-1}
         k = X.shape[1]
         XtWX = X.T @ (X * weights[:, np.newaxis])
         XtWX_inv = np.linalg.inv(XtWX)
-        scores = X * (weights * resid)[:, np.newaxis]
-        meat = scores.T @ scores
+        meat = np.dot(X.T, X * (weights * resid**2)[:, np.newaxis])
         meat *= n / (n - k)
         oracle_vcov = XtWX_inv @ meat @ XtWX_inv
 
@@ -1462,6 +1461,147 @@ def test_linear_regression_weighted_rank_deficient_robust(self):
         for i in kept:
             assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
 
+    def test_fweight_survey_oracle(self):
+        """fweight SurveyDesign: survey vcov matches expanded-data unweighted HC1."""
+        np.random.seed(55)
+        n = 30
+        X_base = np.column_stack([np.ones(n), np.random.randn(n)])
+        y_base = 2.0 + X_base[:, 1] * 1.5 + np.random.randn(n) * 0.3
+        freq = np.random.choice([1, 2, 3], n).astype(float)
+
+        # WLS with fweights via survey
+        coef_fw, resid_fw, _ = solve_ols(
+            X_base, y_base, weights=freq, weight_type="fweight"
+        )
+        resolved = ResolvedSurveyDesign(
+            weights=freq,
+            weight_type="fweight",
+            strata=None,
+            psu=None,
+            fpc=None,
+            n_strata=0,
+            n_psu=0,
+            lonely_psu="remove",
+        )
+        survey_vcov = compute_survey_vcov(X_base, resid_fw, resolved)
+
+        # Oracle: expand data and compute unweighted HC1
+        X_exp = np.repeat(X_base, freq.astype(int), axis=0)
+        y_exp = np.repeat(y_base, freq.astype(int))
+        coef_exp, resid_exp, _ = solve_ols(X_exp, y_exp)
+        n_exp = X_exp.shape[0]
+        k = X_exp.shape[1]
+        XtX = X_exp.T @ X_exp
+        XtX_inv = np.linalg.inv(XtX)
+        meat = np.dot(X_exp.T, X_exp * (resid_exp**2)[:, np.newaxis])
+        meat *= n_exp / (n_exp - k)
+        oracle_vcov = XtX_inv @ meat @ XtX_inv
+
+        np.testing.assert_allclose(survey_vcov, oracle_vcov, atol=1e-10)
+
+    def test_survey_rank_deficient_with_psu(self):
+        """LinearRegression + survey design (PSU) + rank deficiency: no crash."""
+        np.random.seed(43)
+        n = 50
+        x1 = np.random.randn(n)
+        X = np.column_stack([np.ones(n), x1, x1])  # duplicate col
+        y = 2.0 + 1.5 * x1 + np.random.randn(n) * 0.3
+        pw = np.random.uniform(0.5, 3.0, size=n)
+        psu = np.arange(n)  # each obs is its own PSU
+
+        resolved = ResolvedSurveyDesign(
+            weights=pw,
+            weight_type="pweight",
+            strata=None,
+            psu=psu,
+            fpc=None,
+            n_strata=0,
+            n_psu=n,
+            lonely_psu="remove",
+        )
+
+        model = LinearRegression(
+            survey_design=resolved,
+            include_intercept=False,
+            rank_deficient_action="warn",
+        )
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            model.fit(X, y)
+
+        coef = model.coefficients_
+        resid = model.residuals_
+        vcov = model.vcov_
+
+        # One dropped coefficient
+        assert np.sum(np.isnan(coef)) == 1
+
+        # Residuals all finite
+        assert np.all(np.isfinite(resid))
+
+        # Identified coefficients have positive, finite SEs
+        kept = np.where(~np.isnan(coef))[0]
+        for i in kept:
+            assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
+
+        # Dropped column has NaN vcov
+        dropped = np.where(np.isnan(coef))[0]
+        for i in dropped:
+            assert np.all(np.isnan(vcov[i, :]))
+            assert np.all(np.isnan(vcov[:, i]))
+
+    def test_survey_rank_deficient_weights_only(self):
+        """Weights-only survey + rank deficiency: no crash, correct NaN pattern."""
+        np.random.seed(44)
+        n = 50
+        x1 = np.random.randn(n)
+        X = np.column_stack([np.ones(n), x1, x1])  # duplicate col
+        y = 2.0 + 1.5 * x1 + np.random.randn(n) * 0.3
+        pw = np.random.uniform(0.5, 3.0, size=n)
+
+        resolved = ResolvedSurveyDesign(
+            weights=pw,
+            weight_type="pweight",
+            strata=None,
+            psu=None,
+            fpc=None,
+            n_strata=0,
+            n_psu=0,
+            lonely_psu="remove",
+        )
+
+        model = LinearRegression(
+            survey_design=resolved,
+            include_intercept=False,
+            rank_deficient_action="warn",
+        )
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            model.fit(X, y)
+
+        coef = model.coefficients_
+        resid = model.residuals_
+        vcov = model.vcov_
+
+        # One dropped coefficient
+        assert np.sum(np.isnan(coef)) == 1
+
+        # Residuals all finite
+        assert np.all(np.isfinite(resid))
+
+        # Identified coefficients have positive, finite SEs
+        kept = np.where(~np.isnan(coef))[0]
+        for i in kept:
+            assert np.isfinite(vcov[i, i]) and vcov[i, i] > 0
+
+        # Dropped column has NaN vcov
+        dropped = np.where(np.isnan(coef))[0]
+        for i in dropped:
+            assert np.all(np.isnan(vcov[i, :]))
+            assert np.all(np.isnan(vcov[:, i]))
+
     def test_linear_regression_weighted_rank_deficient_classical(self):
         """LinearRegression with weights + classical vcov + rank deficiency."""
         np.random.seed(42)