Skip to content

Commit e57b748

Browse files
igerberclaude
andcommitted
Move FPC >= n_PSU validation to compute_survey_vcov for effective PSU structure from PR #218 review (round 17)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8a0ba95 commit e57b748

2 files changed

Lines changed: 64 additions & 22 deletions

File tree

diff_diff/survey.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,10 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
169169
if np.any(np.isnan(fpc_arr)) or np.any(~np.isfinite(fpc_arr)):
170170
raise ValueError("FPC values must be finite and non-NaN")
171171

172-
# FPC requires survey structure (psu or strata)
173-
if self.psu is None and self.strata is None:
174-
raise ValueError(
175-
"FPC requires either psu or strata to be specified. "
176-
"FPC alone without survey structure is not supported."
177-
)
178-
179-
# Validate FPC >= n_h per stratum
172+
# Validate FPC structure (constant within strata, positive).
173+
# FPC >= n_PSU validation is deferred to compute_survey_vcov()
174+
# where the final effective PSU structure is known (after
175+
# cluster-as-PSU injection and implicit per-obs PSU fallback).
180176
if strata_arr is not None:
181177
for h in np.unique(strata_arr):
182178
mask_h = strata_arr == h
@@ -188,27 +184,23 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
188184
f"Stratum {h} has values: {np.unique(fpc_vals)}"
189185
)
190186
fpc_h = fpc_vals[0]
191-
# Validate FPC >= number of sampled PSUs (not obs count)
187+
# Validate FPC >= n_PSU when explicit PSU is declared
192188
if psu_arr is not None:
193189
n_psu_h = len(np.unique(psu_arr[mask_h]))
194190
if fpc_h < n_psu_h:
195191
raise ValueError(
196192
f"FPC ({fpc_h}) is less than the number of PSUs "
197193
f"({n_psu_h}) in stratum {h}. FPC must be >= n_PSU."
198194
)
199-
else:
200-
# No PSU declared yet — clusters may be injected later
201-
# as effective PSUs, so skip per-obs FPC validation here.
202-
# FPC will be applied at the PSU level in compute_survey_vcov.
203-
pass
204-
elif psu_arr is not None:
195+
else:
205196
# No strata: require FPC is a single constant value
206197
if len(np.unique(fpc_arr)) > 1:
207198
raise ValueError(
208199
"FPC values must be constant when no strata are specified. "
209200
f"Found {len(np.unique(fpc_arr))} distinct values."
210201
)
211-
if fpc_arr[0] < n_psu:
202+
# Validate FPC >= n_PSU when explicit PSU is declared
203+
if psu_arr is not None and fpc_arr[0] < n_psu:
212204
raise ValueError(
213205
f"FPC ({fpc_arr[0]}) is less than the number of PSUs "
214206
f"({n_psu}). FPC must be >= number of PSUs."
@@ -544,6 +536,11 @@ def compute_survey_vcov(
544536
f_h = 0.0 # No FPC
545537
if resolved.fpc is not None:
546538
N_h = resolved.fpc[0]
539+
if N_h < n_psu:
540+
raise ValueError(
541+
f"FPC ({N_h}) is less than the number of effective PSUs "
542+
f"({n_psu}). FPC must be >= n_PSU."
543+
)
547544
f_h = n_psu / N_h
548545
if f_h >= 1.0:
549546
legitimate_zero_count += 1
@@ -599,6 +596,11 @@ def compute_survey_vcov(
599596
f_h = 0.0
600597
if resolved.fpc is not None:
601598
N_h = resolved.fpc[mask_h][0]
599+
if N_h < n_psu_h:
600+
raise ValueError(
601+
f"FPC ({N_h}) is less than the number of effective PSUs "
602+
f"({n_psu_h}) in stratum. FPC must be >= n_PSU."
603+
)
602604
f_h = n_psu_h / N_h
603605
if f_h >= 1.0:
604606
legitimate_zero_count += 1

tests/test_survey.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,8 +1232,8 @@ def test_fpc_nonconstant_within_stratum(self):
12321232
with pytest.raises(ValueError, match="constant within each stratum"):
12331233
sd.resolve(df)
12341234

1235-
def test_fpc_only_without_psu_raises(self):
1236-
"""FPC without psu or strata raises ValueError (P1)."""
1235+
def test_fpc_only_without_psu_resolves(self):
1236+
"""FPC without psu or strata resolves — validated later against effective PSUs."""
12371237
n = 10
12381238
df = pd.DataFrame(
12391239
{
@@ -1243,8 +1243,8 @@ def test_fpc_only_without_psu_raises(self):
12431243
}
12441244
)
12451245
sd = SurveyDesign(weights="w", fpc="fpc")
1246-
with pytest.raises(ValueError, match="FPC requires either psu or strata"):
1247-
sd.resolve(df)
1246+
resolved = sd.resolve(df)
1247+
assert resolved.fpc is not None
12481248

12491249
def test_weighted_within_transform_matches_explicit_wls(self):
12501250
"""Weighted within_transform + OLS matches explicit WLS with dummies (P0-2).
@@ -2901,7 +2901,7 @@ def test_injected_cluster_nested_in_strata(self):
29012901
assert result.df_survey == 2
29022902

29032903
def test_fpc_with_strata_no_psu_accepted(self):
2904-
"""FPC + strata (no PSU) is accepted — clusters may be injected later."""
2904+
"""FPC + strata (no PSU) resolves — FPC validated later against effective PSUs."""
29052905
df = pd.DataFrame(
29062906
{
29072907
"y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
@@ -2913,6 +2913,46 @@ def test_fpc_with_strata_no_psu_accepted(self):
29132913
sd = SurveyDesign(
29142914
weights="w", weight_type="pweight", strata="strat", fpc="pop"
29152915
)
2916-
# Should not raise — FPC validation defers when no PSU declared
2916+
# Should not raise at resolve time — FPC >= n_PSU validated at vcov time
29172917
resolved = sd.resolve(df)
29182918
assert resolved.fpc is not None
2919+
2920+
def test_fpc_alone_no_strata_no_psu_accepted(self):
2921+
"""FPC alone (no PSU/strata) resolves — clusters may be injected later."""
2922+
df = pd.DataFrame(
2923+
{
2924+
"y": [1.0, 2.0, 3.0, 4.0],
2925+
"w": [1.0, 1.0, 1.0, 1.0],
2926+
"pop": [100.0, 100.0, 100.0, 100.0],
2927+
}
2928+
)
2929+
sd = SurveyDesign(weights="w", weight_type="pweight", fpc="pop")
2930+
resolved = sd.resolve(df)
2931+
assert resolved.fpc is not None
2932+
2933+
def test_fpc_lt_effective_npsu_rejected_at_vcov(self):
2934+
"""FPC < effective n_PSU is rejected at compute_survey_vcov time."""
2935+
np.random.seed(42)
2936+
n = 12
2937+
strata = np.repeat([0, 1], 6)
2938+
psu = np.tile(np.arange(3), 4) # 3 PSUs per stratum
2939+
2940+
X = np.column_stack([np.ones(n), np.random.randn(n)])
2941+
residuals = np.random.randn(n)
2942+
weights = np.ones(n)
2943+
2944+
# FPC = 2 per stratum, but we have 3 PSUs → invalid
2945+
fpc = np.array([2.0] * n)
2946+
2947+
resolved = ResolvedSurveyDesign(
2948+
weights=weights,
2949+
weight_type="pweight",
2950+
strata=strata,
2951+
psu=psu,
2952+
fpc=fpc,
2953+
n_strata=2,
2954+
n_psu=6,
2955+
lonely_psu="remove",
2956+
)
2957+
with pytest.raises(ValueError, match="FPC.*less than.*effective PSUs"):
2958+
compute_survey_vcov(X, residuals, resolved=resolved)

0 commit comments

Comments
 (0)