Use QR-rank with R-compatible tolerance for replicate df_survey

igerber · claude · igerber · commit 4ba002f242b0 · 2026-03-28T10:31:01.000-04:00
Replace np.linalg.matrix_rank (SVD-based) with QR decomposition
using tol=1e-5 matching R's survey::degf() which uses
qr(..., tol=1e-5)$rank. Return exactly rank - 1 (no max(..., 1) floor).
When rank <= 1, df_survey returns None, yielding NaN inference.

Remove max(...,1) clamping in TripleDifference survey df path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -574,22 +574,28 @@ def uses_replicate_variance(self) -> bool:
     def df_survey(self) -> Optional[int]:
         """Survey degrees of freedom.
 
-        For replicate designs: numerical rank of centered replicate weight
-        matrix, matching R's ``survey::degf()``. For TSL: n_PSU - n_strata.
+        For replicate designs: QR-rank of the analysis-weight matrix minus 1,
+        matching R's ``survey::degf()`` which uses ``qr(..., tol=1e-5)$rank``.
+        Returns ``None`` when rank <= 1 (insufficient for t-based inference).
+        For TSL: n_PSU - n_strata.
         """
         if self.uses_replicate_variance:
             if self.replicate_weights is None or self.n_replicates < 2:
                 return None
-            # Rank-based df from analysis-weight matrix, matching
-            # R's survey::degf() which uses weights(design, "analysis").
+            # QR-rank of analysis-weight matrix, matching R's survey::degf()
+            # which uses qr(weights(design, "analysis"), tol=1e-5)$rank.
             # For combined_weights=True, replicate cols ARE analysis weights.
             # For combined_weights=False, analysis weights = rep * full-sample.
             if self.combined_weights:
                 analysis_weights = self.replicate_weights
             else:
                 analysis_weights = self.replicate_weights * self.weights[:, np.newaxis]
-            rank = int(np.linalg.matrix_rank(analysis_weights))
-            return max(rank - 1, 1) if rank > 1 else None
+            # Use QR decomposition with R-compatible tolerance (1e-5)
+            Q, R_mat = np.linalg.qr(analysis_weights, mode='reduced')
+            tol = 1e-5
+            rank = int(np.sum(np.abs(np.diag(R_mat)) > tol * np.abs(np.diag(R_mat)).max()))
+            df = rank - 1
+            return df if df > 0 else None
         if self.psu is not None and self.n_psu > 0:
             if self.strata is not None and self.n_strata > 0:
                 return self.n_psu - self.n_strata
diff --git a/diff_diff/triple_diff.py b/diff_diff/triple_diff.py
@@ -581,13 +581,16 @@ def fit(
         # Compute inference
         # When survey design is active, use survey df (n_PSU - n_strata)
         if survey_metadata is not None and survey_metadata.df_survey is not None:
-            df = max(survey_metadata.df_survey, 1)
+            df = survey_metadata.df_survey
             # Override with effective replicate df only when replicates were dropped
             if (hasattr(self, '_replicate_n_valid') and self._replicate_n_valid is not None
                     and resolved_survey is not None
                     and self._replicate_n_valid < resolved_survey.n_replicates):
-                df = max(self._replicate_n_valid - 1, 1)
+                df = self._replicate_n_valid - 1
                 survey_metadata.df_survey = self._replicate_n_valid - 1
+            # df <= 0 means insufficient rank for t-based inference
+            if df is not None and df <= 0:
+                df = None
         else:
             df = n_obs - 8  # Approximate df (8 cell means)
             if covariates:
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -2011,11 +2011,12 @@ variance from the distribution of replicate estimates.
   contrasts are formed via weight-ratio rescaling:
   `theta_r = sum((w_r/w_full) * psi)` when `combined_weights=True`,
   `theta_r = sum(w_r * psi)` when `combined_weights=False`.
-- **Survey df**: Numerical rank of the analysis-weight matrix minus 1,
-  matching R's `survey::degf()`. For `combined_weights=True` (default),
-  analysis weights are the raw replicate columns. For `combined_weights=False`,
-  analysis weights are `replicate_weights * full_sample_weights`.
-  Replaces `n_PSU - n_strata`.
+- **Survey df**: QR-rank of the analysis-weight matrix minus 1,
+  matching R's `survey::degf()` which uses `qr(..., tol=1e-5)$rank`.
+  For `combined_weights=True` (default), analysis weights are the raw
+  replicate columns. For `combined_weights=False`, analysis weights are
+  `replicate_weights * full_sample_weights`. Returns `None` (undefined)
+  when rank <= 1, yielding NaN inference. Replaces `n_PSU - n_strata`.
 - **Mutual exclusion**: Replicate weights cannot be combined with
   strata/psu/fpc (the replicates encode design structure implicitly)
 - **Design parameters** (matching R `svrepdesign()`):