Skip to content

Commit 8a0ba95

Browse files
igerber and claude committed
Fix cluster-as-PSU nesting across strata and defer FPC validation for late PSU injection from PR #218 review (round 16)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d12bab2 commit 8a0ba95

2 files changed

Lines changed: 57 additions & 8 deletions

File tree

diff_diff/survey.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -197,12 +197,10 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
197197
f"({n_psu_h}) in stratum {h}. FPC must be >= n_PSU."
198198
)
199199
else:
200-
n_h = np.sum(mask_h)
201-
if fpc_h < n_h:
202-
raise ValueError(
203-
f"FPC ({fpc_h}) is less than the number of observations "
204-
f"({n_h}) in stratum {h}. FPC must be >= n_obs."
205-
)
200+
# No PSU declared yet — clusters may be injected later
201+
# as effective PSUs, so skip per-obs FPC validation here.
202+
# FPC will be applied at the PSU level in compute_survey_vcov.
203+
pass
206204
elif psu_arr is not None:
207205
# No strata: require FPC is a single constant value
208206
if len(np.unique(fpc_arr)) > 1:
@@ -459,8 +457,15 @@ def _inject_cluster_as_psu(resolved, cluster_ids):
459457
"when used as effective PSUs for survey variance estimation."
460458
)
461459

462-
# Factorize cluster_ids for consistent integer encoding
463-
codes, uniques = pd.factorize(cluster_ids)
460+
# When strata are present, make cluster IDs unique within strata
461+
# (same nesting logic as SurveyDesign.resolve() with nest=True)
462+
if resolved.strata is not None:
463+
combined = np.array(
464+
[f"{s}_{c}" for s, c in zip(resolved.strata, cluster_ids)]
465+
)
466+
codes, uniques = pd.factorize(combined)
467+
else:
468+
codes, uniques = pd.factorize(cluster_ids)
464469
n_clusters = len(uniques)
465470

466471
return replace(resolved, psu=codes, n_psu=n_clusters)

tests/test_survey.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2872,3 +2872,47 @@ def test_cluster_as_psu_with_na_rejected(self):
28722872
cluster_ids = np.array([0, 1, np.nan, 0])
28732873
with pytest.raises(ValueError, match="Cluster IDs contain missing"):
28742874
_inject_cluster_as_psu(resolved, cluster_ids)
2875+
2876+
2877+
class TestRound16Fixes:
2878+
"""Tests for PR #218 review round 16: cluster-as-PSU nesting and FPC."""
2879+
2880+
def test_injected_cluster_nested_in_strata(self):
2881+
"""Injected cluster IDs with repeated labels across strata get unique codes."""
2882+
from diff_diff.survey import _inject_cluster_as_psu
2883+
2884+
# 2 strata, cluster "1" appears in both → should produce 4 unique PSUs
2885+
strata = np.array([0, 0, 0, 0, 1, 1, 1, 1])
2886+
resolved = ResolvedSurveyDesign(
2887+
weights=np.ones(8),
2888+
weight_type="pweight",
2889+
strata=strata,
2890+
psu=None,
2891+
fpc=None,
2892+
n_strata=2,
2893+
n_psu=0,
2894+
lonely_psu="remove",
2895+
)
2896+
cluster_ids = np.array([1, 1, 2, 2, 1, 1, 2, 2]) # labels repeat across strata
2897+
result = _inject_cluster_as_psu(resolved, cluster_ids)
2898+
# Should produce 4 unique PSUs (2 per stratum), not 2
2899+
assert result.n_psu == 4
2900+
# df_survey = n_psu - n_strata = 4 - 2 = 2
2901+
assert result.df_survey == 2
2902+
2903+
def test_fpc_with_strata_no_psu_accepted(self):
2904+
"""FPC + strata (no PSU) is accepted — clusters may be injected later."""
2905+
df = pd.DataFrame(
2906+
{
2907+
"y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
2908+
"w": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
2909+
"strat": [0, 0, 0, 1, 1, 1],
2910+
"pop": [10.0, 10.0, 10.0, 20.0, 20.0, 20.0],
2911+
}
2912+
)
2913+
sd = SurveyDesign(
2914+
weights="w", weight_type="pweight", strata="strat", fpc="pop"
2915+
)
2916+
# Should not raise — FPC validation defers when no PSU declared
2917+
resolved = sd.resolve(df)
2918+
assert resolved.fpc is not None

0 commit comments

Comments
 (0)