Fix remaining review P1s (round 2)

igerber · claude · igerber · commit 31c74fa24c43 · 2026-04-06T21:26:26.000-04:00
- P1: Only inject cluster as PSU when user explicitly sets cluster=;
  weights-only surveys without cluster= now keep implicit per-obs PSUs,
  preserving documented df_survey = n_obs - 1 contract.
- P1: Add pweight-only guard in _resolve_survey_for_wooldridge() —
  fweight/aweight now raise ValueError matching other pweight-only
  estimators (ImputationDiD, TwoStageDiD).
- P1: Add zero-weight safeguards to solve_poisson(weights=...), mirroring
  solve_logit's positive-weight validation: a rank check on the effective
  (positive-weight) sample and a check that positive-weight observations
  outnumber the parameters. Also skip zero-weight ASF cells in the
  Poisson survey path.
- P2: Add regression tests for implicit PSU contract, fweight rejection,
  and zero-weight Poisson cell handling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
@@ -2429,6 +2429,46 @@ def solve_poisson(
         X = X[:, kept_cols]
 
     n, k = X.shape
+
+    # Validate effective weighted sample when weights have zeros
+    # (mirrors solve_logit's positive-weight safeguards)
+    if weights is not None and np.any(weights == 0):
+        pos_mask = weights > 0
+        n_pos = int(np.sum(pos_mask))
+        X_eff = X[pos_mask]
+        eff_rank_info = _detect_rank_deficiency(X_eff)
+        if len(eff_rank_info[1]) > 0:
+            n_dropped_eff = len(eff_rank_info[1])
+            if rank_deficient_action == "error":
+                raise ValueError(
+                    f"Effective (positive-weight) sample is rank-deficient: "
+                    f"{n_dropped_eff} linearly dependent column(s). "
+                    f"Cannot identify Poisson model on this subpopulation."
+                )
+            elif rank_deficient_action == "warn":
+                warnings.warn(
+                    f"Effective (positive-weight) sample is rank-deficient: "
+                    f"dropping {n_dropped_eff} column(s). Poisson estimates "
+                    f"may be unreliable on this subpopulation.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+            eff_dropped = set(int(d) for d in eff_rank_info[1])
+            eff_kept = np.array([i for i in range(k) if i not in eff_dropped])
+            X = X[:, eff_kept]
+            if len(dropped_cols) > 0:
+                kept_cols = kept_cols[eff_kept]
+            else:
+                kept_cols = eff_kept
+                dropped_cols = list(eff_dropped)
+            n, k = X.shape
+        if n_pos <= k:
+            raise ValueError(
+                f"Only {n_pos} positive-weight observation(s) for "
+                f"{k} parameters (after rank reduction). "
+                f"Cannot identify Poisson model."
+            )
+
     if init_beta is not None:
         beta = init_beta[kept_cols].copy() if len(dropped_cols) > 0 else init_beta.copy()
     else:
diff --git a/diff_diff/wooldridge.py b/diff_diff/wooldridge.py
@@ -89,6 +89,12 @@ def _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, cluster_n
             "WooldridgeDiD does not yet support replicate-weight variance. "
             "Use TSL (strata/PSU/FPC) instead."
         )
+    if resolved is not None and resolved.weight_type != "pweight":
+        raise ValueError(
+            f"WooldridgeDiD survey support requires weight_type='pweight', "
+            f"got '{resolved.weight_type}'. The survey variance math "
+            f"assumes probability weights (pweight)."
+        )
     if resolved is not None:
         effective_cluster = _resolve_effective_cluster(
             resolved, cluster_ids, cluster_name
@@ -623,9 +629,10 @@ def _fit_ols(
         cluster_col = self.cluster if self.cluster else unit
         cluster_ids = sample[cluster_col].values
 
-        # Resolve survey design, inject cluster as PSU when needed
+        # Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
+        survey_cluster_ids = cluster_ids if self.cluster else None
         resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
-            _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
+            _resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
         )
 
         # 4. Within-transform: absorb unit + time FE
@@ -822,9 +829,10 @@ def _fit_logit(
         cluster_col = self.cluster if self.cluster else unit
         cluster_ids = sample[cluster_col].values
 
-        # Resolve survey design, inject cluster as PSU when needed
+        # Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
+        survey_cluster_ids = cluster_ids if self.cluster else None
         resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
-            _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
+            _resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
         )
         _has_survey = resolved is not None
 
@@ -1054,9 +1062,10 @@ def _fit_poisson(
         cluster_col = self.cluster if self.cluster else unit
         cluster_ids = sample[cluster_col].values
 
-        # Resolve survey design, inject cluster as PSU when needed
+        # Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
+        survey_cluster_ids = cluster_ids if self.cluster else None
         resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
-            _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
+            _resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
         )
         _has_survey = resolved is not None
 
@@ -1148,6 +1157,9 @@ def _avg_ax0(a, cell_mask):
             # Use raw coefficients (before NaN->0 zeroing) to detect dropped cells.
             if np.isnan(beta_int_raw[idx]):
                 continue
+            # Skip cells where all survey weights are zero (non-estimable)
+            if survey_weights is not None and np.sum(survey_weights[cell_mask]) == 0:
+                continue
             delta = beta_int[idx]
             if np.isnan(delta):
                 continue
diff --git a/tests/test_wooldridge.py b/tests/test_wooldridge.py
@@ -1476,3 +1476,48 @@ def test_survey_gt_weights_are_counts(self, survey_panel):
                 f"gt_weights[{k}] = {w} (type {type(w).__name__}); "
                 f"expected int (cell count)"
             )
+
+    def test_weights_only_no_cluster_implicit_psu(self, survey_panel):
+        """Weights-only survey without cluster= keeps implicit per-obs PSUs."""
+        from diff_diff.survey import SurveyDesign
+        from diff_diff.wooldridge import _filter_sample
+        sd = SurveyDesign(weights="weight")
+        r = WooldridgeDiD().fit(
+            survey_panel, outcome="y", unit="unit", time="time",
+            cohort="cohort", survey_design=sd,
+        )
+        # n_psu should equal n_obs in the filtered sample (not n_units)
+        sample = _filter_sample(
+            survey_panel.copy().assign(cohort=lambda d: d["cohort"].fillna(0)),
+            "unit", "time", "cohort", "not_yet_treated", 0,
+        )
+        assert r.survey_metadata is not None
+        assert r.survey_metadata.n_psu == len(sample)
+
+    def test_fweight_rejected(self, survey_panel):
+        """fweight raises ValueError (pweight only)."""
+        from diff_diff.survey import SurveyDesign
+        # Use integer weights so fweight validation passes in resolve(),
+        # and the pweight guard in _resolve_survey_for_wooldridge fires.
+        df = survey_panel.copy()
+        df["int_weight"] = 1
+        sd = SurveyDesign(weights="int_weight", weight_type="fweight")
+        with pytest.raises(ValueError, match="weight_type='pweight'"):
+            WooldridgeDiD().fit(
+                df, outcome="y", unit="unit", time="time",
+                cohort="cohort", survey_design=sd,
+            )
+
+    def test_poisson_zero_weight_cell(self, survey_panel):
+        """Poisson survey fit handles zero-weight treated cells cleanly."""
+        from diff_diff.survey import SurveyDesign
+        df = survey_panel.copy()
+        # Zero out weights for one treated cohort so some cells have zero weight
+        df.loc[df["cohort"] == 3, "weight"] = 0.0
+        sd = SurveyDesign(weights="weight", strata="stratum", psu="unit")
+        r = WooldridgeDiD(method="poisson").fit(
+            df, outcome="y_count", unit="unit", time="time",
+            cohort="cohort", survey_design=sd,
+        )
+        assert np.isfinite(r.overall_att)
+        assert np.isfinite(r.overall_se)