Skip to content

Commit 4ba002f

Browse files
igerberclaude
andcommitted
Use QR-rank with R-compatible tolerance for replicate df_survey
Replace np.linalg.matrix_rank (SVD-based) with QR decomposition using tol=1e-5 matching R's survey::degf() which uses qr(..., tol=1e-5)$rank. Return exact rank-1 (no max(...,1) floor). When rank <= 1, df_survey returns None yielding NaN inference. Remove max(...,1) clamping in TripleDifference survey df path. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 06cf61b commit 4ba002f

3 files changed

Lines changed: 23 additions & 13 deletions

File tree

diff_diff/survey.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -574,22 +574,28 @@ def uses_replicate_variance(self) -> bool:
574574
def df_survey(self) -> Optional[int]:
575575
"""Survey degrees of freedom.
576576
577-
For replicate designs: numerical rank of centered replicate weight
578-
matrix, matching R's ``survey::degf()``. For TSL: n_PSU - n_strata.
577+
For replicate designs: QR-rank of the analysis-weight matrix minus 1,
578+
matching R's ``survey::degf()`` which uses ``qr(..., tol=1e-5)$rank``.
579+
Returns ``None`` when rank <= 1 (insufficient for t-based inference).
580+
For TSL: n_PSU - n_strata.
579581
"""
580582
if self.uses_replicate_variance:
581583
if self.replicate_weights is None or self.n_replicates < 2:
582584
return None
583-
# Rank-based df from analysis-weight matrix, matching
584-
# R's survey::degf() which uses weights(design, "analysis").
585+
# QR-rank of analysis-weight matrix, matching R's survey::degf()
586+
# which uses qr(weights(design, "analysis"), tol=1e-5)$rank.
585587
# For combined_weights=True, replicate cols ARE analysis weights.
586588
# For combined_weights=False, analysis weights = rep * full-sample.
587589
if self.combined_weights:
588590
analysis_weights = self.replicate_weights
589591
else:
590592
analysis_weights = self.replicate_weights * self.weights[:, np.newaxis]
591-
rank = int(np.linalg.matrix_rank(analysis_weights))
592-
return max(rank - 1, 1) if rank > 1 else None
593+
# Use QR decomposition with R-compatible tolerance (1e-5)
594+
Q, R_mat = np.linalg.qr(analysis_weights, mode='reduced')
595+
tol = 1e-5
596+
rank = int(np.sum(np.abs(np.diag(R_mat)) > tol * np.abs(np.diag(R_mat)).max()))
597+
df = rank - 1
598+
return df if df > 0 else None
593599
if self.psu is not None and self.n_psu > 0:
594600
if self.strata is not None and self.n_strata > 0:
595601
return self.n_psu - self.n_strata

diff_diff/triple_diff.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,13 +581,16 @@ def fit(
581581
# Compute inference
582582
# When survey design is active, use survey df (n_PSU - n_strata)
583583
if survey_metadata is not None and survey_metadata.df_survey is not None:
584-
df = max(survey_metadata.df_survey, 1)
584+
df = survey_metadata.df_survey
585585
# Override with effective replicate df only when replicates were dropped
586586
if (hasattr(self, '_replicate_n_valid') and self._replicate_n_valid is not None
587587
and resolved_survey is not None
588588
and self._replicate_n_valid < resolved_survey.n_replicates):
589-
df = max(self._replicate_n_valid - 1, 1)
589+
df = self._replicate_n_valid - 1
590590
survey_metadata.df_survey = self._replicate_n_valid - 1
591+
# df <= 0 means insufficient rank for t-based inference
592+
if df is not None and df <= 0:
593+
df = None
591594
else:
592595
df = n_obs - 8 # Approximate df (8 cell means)
593596
if covariates:

docs/methodology/REGISTRY.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2011,11 +2011,12 @@ variance from the distribution of replicate estimates.
20112011
contrasts are formed via weight-ratio rescaling:
20122012
`theta_r = sum((w_r/w_full) * psi)` when `combined_weights=True`,
20132013
`theta_r = sum(w_r * psi)` when `combined_weights=False`.
2014-
- **Survey df**: Numerical rank of the analysis-weight matrix minus 1,
2015-
matching R's `survey::degf()`. For `combined_weights=True` (default),
2016-
analysis weights are the raw replicate columns. For `combined_weights=False`,
2017-
analysis weights are `replicate_weights * full_sample_weights`.
2018-
Replaces `n_PSU - n_strata`.
2014+
- **Survey df**: QR-rank of the analysis-weight matrix minus 1,
2015+
matching R's `survey::degf()` which uses `qr(..., tol=1e-5)$rank`.
2016+
For `combined_weights=True` (default), analysis weights are the raw
2017+
replicate columns. For `combined_weights=False`, analysis weights are
2018+
`replicate_weights * full_sample_weights`. Returns `None` (undefined)
2019+
when rank <= 1, yielding NaN inference. Replaces `n_PSU - n_strata`.
20192020
- **Mutual exclusion**: Replicate weights cannot be combined with
20202021
strata/psu/fpc (the replicates encode design structure implicitly)
20212022
- **Design parameters** (matching R `svrepdesign()`):

0 commit comments

Comments
 (0)