Skip to content

Commit 31c74fa

Browse files
igerberclaude
andcommitted
Fix remaining review P1s (round 2)
- P1: Only inject cluster as PSU when user explicitly sets cluster=; weights-only surveys without cluster= now keep implicit per-obs PSUs, preserving documented df_survey = n_obs - 1 contract. - P1: Add pweight-only guard in _resolve_survey_for_wooldridge() — fweight/aweight now raise ValueError matching other pweight-only estimators (ImputationDiD, TwoStageDiD). - P1: Add zero-weight safeguards to solve_poisson(weights=...) mirroring solve_logit's positive-weight validation (rank check on effective sample, sample-size identification). Skip zero-weight ASF cells in Poisson survey path. - P2: Add regression tests for implicit PSU contract, fweight rejection, and zero-weight Poisson cell handling. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d9cdbf0 commit 31c74fa

3 files changed

Lines changed: 103 additions & 6 deletions

File tree

diff_diff/linalg.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2429,6 +2429,46 @@ def solve_poisson(
24292429
X = X[:, kept_cols]
24302430

24312431
n, k = X.shape
2432+
2433+
# Validate effective weighted sample when weights have zeros
2434+
# (mirrors solve_logit's positive-weight safeguards)
2435+
if weights is not None and np.any(weights == 0):
2436+
pos_mask = weights > 0
2437+
n_pos = int(np.sum(pos_mask))
2438+
X_eff = X[pos_mask]
2439+
eff_rank_info = _detect_rank_deficiency(X_eff)
2440+
if len(eff_rank_info[1]) > 0:
2441+
n_dropped_eff = len(eff_rank_info[1])
2442+
if rank_deficient_action == "error":
2443+
raise ValueError(
2444+
f"Effective (positive-weight) sample is rank-deficient: "
2445+
f"{n_dropped_eff} linearly dependent column(s). "
2446+
f"Cannot identify Poisson model on this subpopulation."
2447+
)
2448+
elif rank_deficient_action == "warn":
2449+
warnings.warn(
2450+
f"Effective (positive-weight) sample is rank-deficient: "
2451+
f"dropping {n_dropped_eff} column(s). Poisson estimates "
2452+
f"may be unreliable on this subpopulation.",
2453+
UserWarning,
2454+
stacklevel=2,
2455+
)
2456+
eff_dropped = set(int(d) for d in eff_rank_info[1])
2457+
eff_kept = np.array([i for i in range(k) if i not in eff_dropped])
2458+
X = X[:, eff_kept]
2459+
if len(dropped_cols) > 0:
2460+
kept_cols = kept_cols[eff_kept]
2461+
else:
2462+
kept_cols = eff_kept
2463+
dropped_cols = list(eff_dropped)
2464+
n, k = X.shape
2465+
if n_pos <= k:
2466+
raise ValueError(
2467+
f"Only {n_pos} positive-weight observation(s) for "
2468+
f"{k} parameters (after rank reduction). "
2469+
f"Cannot identify Poisson model."
2470+
)
2471+
24322472
if init_beta is not None:
24332473
beta = init_beta[kept_cols].copy() if len(dropped_cols) > 0 else init_beta.copy()
24342474
else:

diff_diff/wooldridge.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ def _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, cluster_n
8989
"WooldridgeDiD does not yet support replicate-weight variance. "
9090
"Use TSL (strata/PSU/FPC) instead."
9191
)
92+
if resolved is not None and resolved.weight_type != "pweight":
93+
raise ValueError(
94+
f"WooldridgeDiD survey support requires weight_type='pweight', "
95+
f"got '{resolved.weight_type}'. The survey variance math "
96+
f"assumes probability weights (pweight)."
97+
)
9298
if resolved is not None:
9399
effective_cluster = _resolve_effective_cluster(
94100
resolved, cluster_ids, cluster_name
@@ -623,9 +629,10 @@ def _fit_ols(
623629
cluster_col = self.cluster if self.cluster else unit
624630
cluster_ids = sample[cluster_col].values
625631

626-
# Resolve survey design, inject cluster as PSU when needed
632+
# Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
633+
survey_cluster_ids = cluster_ids if self.cluster else None
627634
resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
628-
_resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
635+
_resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
629636
)
630637

631638
# 4. Within-transform: absorb unit + time FE
@@ -822,9 +829,10 @@ def _fit_logit(
822829
cluster_col = self.cluster if self.cluster else unit
823830
cluster_ids = sample[cluster_col].values
824831

825-
# Resolve survey design, inject cluster as PSU when needed
832+
# Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
833+
survey_cluster_ids = cluster_ids if self.cluster else None
826834
resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
827-
_resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
835+
_resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
828836
)
829837
_has_survey = resolved is not None
830838

@@ -1054,9 +1062,10 @@ def _fit_poisson(
10541062
cluster_col = self.cluster if self.cluster else unit
10551063
cluster_ids = sample[cluster_col].values
10561064

1057-
# Resolve survey design, inject cluster as PSU when needed
1065+
# Resolve survey design, inject cluster as PSU only when user explicitly set cluster=
1066+
survey_cluster_ids = cluster_ids if self.cluster else None
10581067
resolved, survey_weights, survey_weight_type, survey_metadata, df_inf = (
1059-
_resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, self.cluster)
1068+
_resolve_survey_for_wooldridge(survey_design, sample, survey_cluster_ids, self.cluster)
10601069
)
10611070
_has_survey = resolved is not None
10621071

@@ -1148,6 +1157,9 @@ def _avg_ax0(a, cell_mask):
11481157
# Use raw coefficients (before NaN->0 zeroing) to detect dropped cells.
11491158
if np.isnan(beta_int_raw[idx]):
11501159
continue
1160+
# Skip cells where all survey weights are zero (non-estimable)
1161+
if survey_weights is not None and np.sum(survey_weights[cell_mask]) == 0:
1162+
continue
11511163
delta = beta_int[idx]
11521164
if np.isnan(delta):
11531165
continue

tests/test_wooldridge.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,3 +1476,48 @@ def test_survey_gt_weights_are_counts(self, survey_panel):
14761476
f"gt_weights[{k}] = {w} (type {type(w).__name__}); "
14771477
f"expected int (cell count)"
14781478
)
1479+
1480+
def test_weights_only_no_cluster_implicit_psu(self, survey_panel):
1481+
"""Weights-only survey without cluster= keeps implicit per-obs PSUs."""
1482+
from diff_diff.survey import SurveyDesign
1483+
from diff_diff.wooldridge import _filter_sample
1484+
sd = SurveyDesign(weights="weight")
1485+
r = WooldridgeDiD().fit(
1486+
survey_panel, outcome="y", unit="unit", time="time",
1487+
cohort="cohort", survey_design=sd,
1488+
)
1489+
# n_psu should equal n_obs in the filtered sample (not n_units)
1490+
sample = _filter_sample(
1491+
survey_panel.copy().assign(cohort=lambda d: d["cohort"].fillna(0)),
1492+
"unit", "time", "cohort", "not_yet_treated", 0,
1493+
)
1494+
assert r.survey_metadata is not None
1495+
assert r.survey_metadata.n_psu == len(sample)
1496+
1497+
def test_fweight_rejected(self, survey_panel):
1498+
"""fweight raises ValueError (pweight only)."""
1499+
from diff_diff.survey import SurveyDesign
1500+
# Use integer weights so fweight validation passes in resolve(),
1501+
# and the pweight guard in _resolve_survey_for_wooldridge fires.
1502+
df = survey_panel.copy()
1503+
df["int_weight"] = 1
1504+
sd = SurveyDesign(weights="int_weight", weight_type="fweight")
1505+
with pytest.raises(ValueError, match="weight_type='pweight'"):
1506+
WooldridgeDiD().fit(
1507+
df, outcome="y", unit="unit", time="time",
1508+
cohort="cohort", survey_design=sd,
1509+
)
1510+
1511+
def test_poisson_zero_weight_cell(self, survey_panel):
1512+
"""Poisson survey fit handles zero-weight treated cells cleanly."""
1513+
from diff_diff.survey import SurveyDesign
1514+
df = survey_panel.copy()
1515+
# Zero out weights for one treated cohort so some cells have zero weight
1516+
df.loc[df["cohort"] == 3, "weight"] = 0.0
1517+
sd = SurveyDesign(weights="weight", strata="stratum", psu="unit")
1518+
r = WooldridgeDiD(method="poisson").fit(
1519+
df, outcome="y_count", unit="unit", time="time",
1520+
cohort="cohort", survey_design=sd,
1521+
)
1522+
assert np.isfinite(r.overall_att)
1523+
assert np.isfinite(r.overall_se)

0 commit comments

Comments
 (0)