Fix cluster-ignored-with-survey and weight validation gaps from PR #218 review (round 7)

igerber · claude · igerber · commit b4cd77065e4f · 2026-03-20T18:25:00.000-04:00
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -308,6 +308,14 @@ def fit(
             resolved_survey, cluster_ids, self.cluster
         )
 
+        # Inject cluster as effective PSU for survey variance estimation
+        if resolved_survey is not None and effective_cluster_ids is not None:
+            from diff_diff.survey import _inject_cluster_as_psu, compute_survey_metadata
+            resolved_survey = _inject_cluster_as_psu(resolved_survey, effective_cluster_ids)
+            if resolved_survey.psu is not None and survey_metadata is not None:
+                raw_w = data[survey_design.weights].values.astype(np.float64) if survey_design.weights else np.ones(len(data), dtype=np.float64)
+                survey_metadata = compute_survey_metadata(resolved_survey, raw_w)
+
         reg = LinearRegression(
             include_intercept=False,  # Intercept already in X
             robust=self.robust,
@@ -1036,6 +1044,14 @@ def fit(  # type: ignore[override]
             resolved_survey, cluster_ids, self.cluster
         )
 
+        # Inject cluster as effective PSU for survey variance estimation
+        if resolved_survey is not None and effective_cluster_ids is not None:
+            from diff_diff.survey import _inject_cluster_as_psu, compute_survey_metadata
+            resolved_survey = _inject_cluster_as_psu(resolved_survey, effective_cluster_ids)
+            if resolved_survey.psu is not None and survey_metadata is not None:
+                raw_w = data[survey_design.weights].values.astype(np.float64) if survey_design.weights else np.ones(len(data), dtype=np.float64)
+                survey_metadata = compute_survey_metadata(resolved_survey, raw_w)
+
         # Determine if survey vcov should be used
         _use_survey_vcov = resolved_survey is not None and resolved_survey.needs_survey_vcov
 
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
@@ -383,6 +383,34 @@ def solve_ols(
 ]: ...
 
 
+_VALID_WEIGHT_TYPES = {"pweight", "fweight", "aweight"}
+
+
+def _validate_weights(weights, weight_type, n):
+    """Validate weight_type and weights for solve_ols/LinearRegression.
+
+    Returns weights as a 1-D float64 array (None passes through). Raises ValueError for
+    an unknown weight_type or weights that are not 1-D, not length n, NaN, Inf, or negative.
+    """
+    if weight_type not in _VALID_WEIGHT_TYPES:
+        valid = sorted(_VALID_WEIGHT_TYPES)  # set order is not stable across runs
+        raise ValueError(f"weight_type must be one of {valid}, got '{weight_type}'")
+    if weights is None:
+        return None
+    weights = np.asarray(weights, dtype=np.float64)
+    if weights.ndim != 1:  # shape[0] alone fails on 0-d input and accepts (n, k)
+        raise ValueError(f"weights must be 1-dimensional, got shape {weights.shape}")
+    if weights.shape[0] != n:
+        raise ValueError(f"weights length ({weights.shape[0]}) must match X rows ({n})")
+    if np.any(np.isnan(weights)):
+        raise ValueError("Weights contain NaN values")
+    if np.any(np.isinf(weights)):
+        raise ValueError("Weights contain Inf values")
+    if np.any(weights < 0):  # zeros are deliberately allowed
+        raise ValueError("Weights must be non-negative")
+    return weights
+
+
 def solve_ols(
     X: np.ndarray,
     y: np.ndarray,
@@ -543,9 +571,7 @@ def solve_ols(
     _original_X = None
     _original_y = None
     if weights is not None:
-        weights = np.asarray(weights, dtype=np.float64)
-        if weights.shape[0] != n:
-            raise ValueError(f"weights length ({weights.shape[0]}) must match X rows ({n})")
+        weights = _validate_weights(weights, weight_type, n)
         _original_X = X
         _original_y = y
         sqrt_w = np.sqrt(weights)
@@ -1567,6 +1593,23 @@ def fit(
                 self.weights = self.survey_design.weights
                 self.weight_type = self.survey_design.weight_type
 
+        if self.weights is not None:
+            self.weights = _validate_weights(
+                self.weights, self.weight_type, X.shape[0]
+            )
+
+        # Inject cluster as PSU for survey variance when no PSU specified
+        if (
+            effective_cluster_ids is not None
+            and self.survey_design is not None
+            and _use_survey_vcov
+        ):
+            from diff_diff.survey import ResolvedSurveyDesign as _RSD, _inject_cluster_as_psu
+            if isinstance(self.survey_design, _RSD) and self.survey_design.psu is None:
+                self.survey_design = _inject_cluster_as_psu(
+                    self.survey_design, effective_cluster_ids
+                )
+
         if self.robust or effective_cluster_ids is not None:
             # Use solve_ols with robust/cluster SEs
             # When survey vcov will be used, skip standard vcov computation
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -15,7 +15,7 @@
 """
 
 import warnings
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from typing import Optional, Tuple
 
 import numpy as np
@@ -430,6 +430,25 @@ def _resolve_effective_cluster(resolved_survey, cluster_ids, cluster_name=None):
     return resolved_survey.psu
 
 
+def _inject_cluster_as_psu(resolved, cluster_ids):
+    """
+    Use cluster_ids as the effective PSU (for TSL variance) when the design has none.
+
+    Returns a copy of ``resolved`` with factorized cluster codes as ``psu`` and the
+    cluster count as ``n_psu``. Returns ``resolved`` itself (no mutation) when it is
+    None, when cluster_ids is None, or when a PSU is already set. Raises ValueError
+    if any cluster ID is missing.
+    """
+    if resolved is None or cluster_ids is None or resolved.psu is not None:
+        return resolved
+    # pd.factorize codes missing IDs as -1 and leaves them out of uniques, so n_psu
+    # would disagree with the PSU codes; reject them instead of mis-grouping rows.
+    codes, uniques = pd.factorize(cluster_ids)
+    if (codes == -1).any():
+        raise ValueError("cluster_ids contain missing values")
+    return replace(resolved, psu=codes, n_psu=len(uniques))
+
+
 def compute_survey_vcov(
     X: np.ndarray,
     residuals: np.ndarray,
diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py
@@ -175,6 +175,14 @@ def fit(  # type: ignore[override]
             resolved_survey, cluster_ids, self.cluster
         )
 
+        # Inject cluster as effective PSU for survey variance estimation
+        if resolved_survey is not None and effective_cluster_ids is not None:
+            from diff_diff.survey import _inject_cluster_as_psu, compute_survey_metadata
+            resolved_survey = _inject_cluster_as_psu(resolved_survey, effective_cluster_ids)
+            if resolved_survey.psu is not None and survey_metadata is not None:
+                raw_w = data[survey_design.weights].values.astype(np.float64) if survey_design.weights else np.ones(len(data), dtype=np.float64)
+                survey_metadata = compute_survey_metadata(resolved_survey, raw_w)
+
         # Pass rank_deficient_action to LinearRegression
         # If "error", let LinearRegression raise immediately
         # If "warn" or "silent", suppress generic warning and use TWFE's context-specific
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -1883,6 +1883,10 @@ unequal selection probabilities).
 - **Note:** When no explicit PSU is specified (weights-only or stratified-no-PSU
   designs), each observation is treated as its own PSU for df purposes. Survey df
   becomes `n_obs - n_strata` (or `n_obs - 1` when unstratified).
+- **Note:** When survey_design specifies no PSU (weights-only or
+  stratified-no-PSU designs) and `cluster=` is specified, the cluster IDs are
+  injected as effective PSUs for Taylor Series Linearization variance estimation,
+  matching the R `survey` package convention that clusters are the primary sampling units.
 
 ---
 
diff --git a/tests/test_survey.py b/tests/test_survey.py
@@ -1943,3 +1943,176 @@ def test_matching_weights_no_warning(self):
         with warnings.catch_warnings():
             warnings.simplefilter("error")
             reg.fit(X, y)
+
+
+class TestRound7Fixes:
+    """Round-7 review fixes (PR #218): cluster-as-PSU injection and weight validation."""
+
+    @staticmethod
+    def _make_cluster_data(seed=700):
+        """Build a 2-period DiD panel: 10 clusters x 5 units; clusters 5-9 get +3.0 in period 1."""
+        np.random.seed(seed)
+        n_clusters = 10
+        obs_per_cluster = 5
+        rows = []
+        for c in range(n_clusters):
+            is_treated = c >= 5
+            for i in range(obs_per_cluster):
+                for period in [0, 1]:
+                    y = 10.0 + c * 0.3 + np.random.randn() * 0.5
+                    if period == 1 and is_treated:
+                        y += 3.0
+                    rows.append({
+                        "unit": c * obs_per_cluster + i,
+                        "period": period,
+                        "treated": int(is_treated),
+                        "y": y,
+                        "cluster_id": c,
+                        "w": 1.0 + 0.2 * c,
+                    })
+        return pd.DataFrame(rows)
+
+    def test_cluster_injected_as_psu_did(self):
+        """DiD: cluster= with a weights-only design matches the SE of an explicit-PSU design."""
+        data = self._make_cluster_data()
+
+        # Fit with cluster= and weights-only survey (no PSU)
+        result_inject = DifferenceInDifferences(cluster="cluster_id").fit(
+            data, "y", "treated", "period",
+            survey_design=SurveyDesign(weights="w"),
+        )
+
+        # Fit with the same column declared as the PSU in the survey design
+        result_explicit = DifferenceInDifferences(cluster="cluster_id").fit(
+            data, "y", "treated", "period",
+            survey_design=SurveyDesign(weights="w", psu="cluster_id"),
+        )
+
+        np.testing.assert_allclose(result_inject.se, result_explicit.se, atol=1e-12)
+        assert result_inject.survey_metadata.n_psu == 10
+        assert result_inject.survey_metadata.df_survey == 9
+
+    def test_cluster_injected_as_psu_twfe(self):
+        """TWFE: cluster= with a weights-only design matches the SE of an explicit-PSU design."""
+        data = self._make_cluster_data()
+
+        result_inject = TwoWayFixedEffects(cluster="cluster_id").fit(
+            data, "y", "treated", "period", unit="unit",
+            survey_design=SurveyDesign(weights="w"),
+        )
+
+        result_explicit = TwoWayFixedEffects(cluster="cluster_id").fit(
+            data, "y", "treated", "period", unit="unit",
+            survey_design=SurveyDesign(weights="w", psu="cluster_id"),
+        )
+
+        np.testing.assert_allclose(result_inject.se, result_explicit.se, atol=1e-12)
+        assert result_inject.survey_metadata.n_psu == 10
+        assert result_inject.survey_metadata.df_survey == 9
+
+    def test_cluster_injected_as_psu_linear_regression(self):
+        """Standalone LinearRegression: vcov with cluster_ids and no PSU equals vcov with explicit PSU."""
+        np.random.seed(701)
+        n = 50
+        cluster_ids = np.repeat(np.arange(10), 5)
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = 1.0 + X[:, 1] * 0.5 + np.random.randn(n) * 0.4
+        weights = np.random.uniform(0.5, 3.0, n)
+
+        # No PSU in resolved design
+        resolved_no_psu = ResolvedSurveyDesign(
+            weights=weights, weight_type="pweight",
+            strata=None, psu=None, fpc=None,
+            n_strata=0, n_psu=0, lonely_psu="remove",
+        )
+        reg_inject = LinearRegression(
+            include_intercept=False, cluster_ids=cluster_ids,
+            survey_design=resolved_no_psu,
+        )
+        reg_inject.fit(X, y)
+
+        # Explicit PSU: the same cluster IDs, factorized by hand
+        codes, uniques = pd.factorize(cluster_ids)
+        resolved_psu = ResolvedSurveyDesign(
+            weights=weights, weight_type="pweight",
+            strata=None, psu=codes, fpc=None,
+            n_strata=0, n_psu=len(uniques), lonely_psu="remove",
+        )
+        reg_explicit = LinearRegression(
+            include_intercept=False, cluster_ids=cluster_ids,
+            survey_design=resolved_psu,
+        )
+        reg_explicit.fit(X, y)
+
+        np.testing.assert_allclose(reg_inject.vcov_, reg_explicit.vcov_, atol=1e-12)
+
+    def test_cluster_injection_no_effect_when_psu_present(self):
+        """When PSU is already present, _inject_cluster_as_psu returns the same object."""
+        from diff_diff.survey import _inject_cluster_as_psu
+
+        existing_psu = np.array([0, 0, 1, 1, 2, 2])
+        resolved = ResolvedSurveyDesign(
+            weights=np.ones(6), weight_type="pweight",
+            strata=None, psu=existing_psu, fpc=None,
+            n_strata=0, n_psu=3, lonely_psu="remove",
+        )
+        result = _inject_cluster_as_psu(resolved, np.array([10, 10, 20, 20, 30, 30]))
+        assert result is resolved  # Same object — no replacement
+
+    def test_invalid_weight_type_raises(self):
+        """Invalid weight_type raises ValueError in solve_ols and LinearRegression."""
+        n = 20
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = np.random.randn(n)
+        w = np.ones(n)
+
+        with pytest.raises(ValueError, match="weight_type must be one of"):
+            solve_ols(X, y, weights=w, weight_type="pwieght")
+
+        with pytest.raises(ValueError, match="weight_type must be one of"):
+            LinearRegression(weights=w, weight_type="bad").fit(X, y)
+
+    def test_nan_weights_raises(self):
+        """A single NaN weight makes solve_ols raise ValueError."""
+        n = 20
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = np.random.randn(n)
+        w = np.ones(n)
+        w[5] = np.nan
+
+        with pytest.raises(ValueError, match="NaN"):
+            solve_ols(X, y, weights=w)
+
+    def test_negative_weights_raises(self):
+        """A single negative weight makes solve_ols raise ValueError."""
+        n = 20
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = np.random.randn(n)
+        w = np.ones(n)
+        w[3] = -0.5
+
+        with pytest.raises(ValueError, match="non-negative"):
+            solve_ols(X, y, weights=w)
+
+    def test_inf_weights_raises(self):
+        """A single infinite weight makes solve_ols raise ValueError."""
+        n = 20
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = np.random.randn(n)
+        w = np.ones(n)
+        w[0] = np.inf
+
+        with pytest.raises(ValueError, match="Inf"):
+            solve_ols(X, y, weights=w)
+
+    def test_zero_weights_accepted(self):
+        """Zero weights are accepted (intentional divergence from SurveyDesign)."""
+        n = 20
+        X = np.column_stack([np.ones(n), np.random.randn(n)])
+        y = np.random.randn(n)
+        w = np.ones(n)
+        w[0] = 0.0
+
+        # Should NOT raise; only checks that a result is returned, not its values
+        coef, resid, vcov = solve_ols(X, y, weights=w)
+        assert coef is not None