Fix replicate df_survey analysis weights and mse=False zero-rscales centering

igerber · claude · igerber · commit b285dc753833 · 2026-03-28T09:12:16.000-04:00
Use analysis-weight matrix (rep * full-sample weights when
combined_weights=False) for rank-based df computation, matching R's
survey::degf(). When mse=False and replicate_rscales has zero entries,
exclude zero-scaled replicates from the centering mean, matching R's
svrVar() convention.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -574,9 +574,15 @@ def df_survey(self) -> Optional[int]:
         if self.uses_replicate_variance:
             if self.replicate_weights is None or self.n_replicates < 2:
                 return None
-            # Rank-based df from replicate weight matrix, matching
-            # R's survey::degf() for svrepdesign objects
-            rank = int(np.linalg.matrix_rank(self.replicate_weights))
+            # Rank-based df from analysis-weight matrix, matching
+            # R's survey::degf() which uses weights(design, "analysis").
+            # For combined_weights=True, replicate cols ARE analysis weights.
+            # For combined_weights=False, analysis weights = rep * full-sample.
+            if self.combined_weights:
+                analysis_weights = self.replicate_weights
+            else:
+                analysis_weights = self.replicate_weights * self.weights[:, np.newaxis]
+            rank = int(np.linalg.matrix_rank(analysis_weights))
             return max(rank - 1, 1) if rank > 1 else None
         if self.psu is not None and self.n_psu > 0:
             if self.strata is not None and self.n_strata > 0:
@@ -1375,10 +1381,19 @@ def compute_replicate_vcov(
 
     # Compute variance by method
     # Support mse=False: center on replicate mean instead of full-sample estimate
+    # When rscales present and mse=False, center only over rscales > 0
+    # (R's svrVar convention — zero-scaled replicates should not shift center)
     if resolved.mse:
         center = c
     else:
-        center = np.mean(coef_valid, axis=0)
+        if resolved.replicate_rscales is not None:
+            pos_scale = resolved.replicate_rscales[valid] > 0
+            if np.any(pos_scale):
+                center = np.mean(coef_valid[pos_scale], axis=0)
+            else:
+                center = np.mean(coef_valid, axis=0)
+        else:
+            center = np.mean(coef_valid, axis=0)
     diffs = coef_valid - center[np.newaxis, :]
 
     # Use custom scale/rscales if provided, else default method factor
@@ -1489,10 +1504,19 @@ def compute_replicate_if_variance(
         return np.nan, n_valid
 
     # Support mse=False: center on replicate mean
+    # When rscales present and mse=False, center only over rscales > 0
+    # (R's svrVar convention — zero-scaled replicates should not shift center)
     if resolved.mse:
         center = theta_full
     else:
-        center = float(np.mean(theta_reps[valid]))
+        if resolved.replicate_rscales is not None:
+            pos_scale = resolved.replicate_rscales[valid] > 0
+            if np.any(pos_scale):
+                center = float(np.mean(theta_reps[valid][pos_scale]))
+            else:
+                center = float(np.mean(theta_reps[valid]))
+        else:
+            center = float(np.mean(theta_reps[valid]))
     diffs = theta_reps[valid] - center
 
     # Custom scale/rscales
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -2011,8 +2011,11 @@ variance from the distribution of replicate estimates.
   contrasts are formed via weight-ratio rescaling:
   `theta_r = sum((w_r/w_full) * psi)` when `combined_weights=True`,
   `theta_r = sum(w_r * psi)` when `combined_weights=False`.
-- **Survey df**: Numerical rank of replicate weight matrix minus 1,
-  matching R's `survey::degf()`. Replaces `n_PSU - n_strata`.
+- **Survey df**: Numerical rank of the analysis-weight matrix minus 1,
+  matching R's `survey::degf()`. For `combined_weights=True` (default),
+  analysis weights are the raw replicate columns. For `combined_weights=False`,
+  analysis weights are `replicate_weights * full_sample_weights`.
+  Replaces `n_PSU - n_strata`.
 - **Mutual exclusion**: Replicate weights cannot be combined with
   strata/psu/fpc (the replicates encode design structure implicitly)
 - **Design parameters** (matching R `svrepdesign()`):
@@ -2023,7 +2026,9 @@ variance from the distribution of replicate estimates.
   - `replicate_rscales`: per-replicate scaling factors (vector of length R)
   - `mse` (default False, matching R's `survey::svrepdesign()`): if True,
     center variance on full-sample estimate; if False, center on mean of
-    replicate estimates.
+    replicate estimates. When `replicate_rscales` contains zero entries
+    and `mse=False`, centering excludes zero-scaled replicates, matching
+    R's `survey::svrVar()` convention.
 - **Note:** Replicate columns are NOT normalized — raw values are preserved
   to maintain correct weight ratios in the IF path.
 - **Note:** JKn requires explicit `replicate_strata` (per-replicate stratum
diff --git a/tests/test_survey_phase6.py b/tests/test_survey_phase6.py
@@ -870,6 +870,157 @@ def test_replicate_if_no_divide_by_zero_warning(self):
             assert np.isfinite(v)
 
 
+class TestReplicateEdgeCases:
+    """Regression tests for analysis-weight df and rscales centering."""
+
+    def test_df_survey_combined_weights_false(self):
+        """df_survey uses analysis-weight rank when combined_weights=False."""
+        from diff_diff.survey import ResolvedSurveyDesign
+
+        np.random.seed(42)
+        n = 50
+        R = 5
+        weights = 1.0 + np.random.exponential(0.5, n)
+        # Perturbation factors (not full weights)
+        rep_factors = np.random.uniform(0.8, 1.2, (n, R))
+
+        resolved = ResolvedSurveyDesign(
+            weights=weights, weight_type="pweight",
+            strata=None, psu=None, fpc=None,
+            n_strata=0, n_psu=0, lonely_psu="remove",
+            replicate_weights=rep_factors,
+            replicate_method="BRR", n_replicates=R,
+            combined_weights=False,
+        )
+        # df should match rank of analysis weights (rep * full-sample)
+        analysis_weights = rep_factors * weights[:, np.newaxis]
+        expected_rank = int(np.linalg.matrix_rank(analysis_weights))
+        expected_df = max(expected_rank - 1, 1)
+        assert resolved.df_survey == expected_df
+
+        # Zero all but 3 full-sample weights so the analysis-weight rank (3)
+        # drops strictly below the raw perturbation-factor rank (R = 5).
+        weights_with_zeros = weights.copy()
+        weights_with_zeros[3:] = 0.0  # subpopulation-zeroed
+        resolved2 = ResolvedSurveyDesign(
+            weights=weights_with_zeros, weight_type="pweight",
+            strata=None, psu=None, fpc=None,
+            n_strata=0, n_psu=0, lonely_psu="remove",
+            replicate_weights=rep_factors,
+            replicate_method="BRR", n_replicates=R,
+            combined_weights=False,
+        )
+        raw_rank = int(np.linalg.matrix_rank(rep_factors))
+        analysis_rank = int(np.linalg.matrix_rank(
+            rep_factors * weights_with_zeros[:, np.newaxis]
+        ))
+        # Strict inequality: a df based on the raw rank would fail below
+        assert analysis_rank < raw_rank
+        assert resolved2.df_survey == max(analysis_rank - 1, 1)
+
+    def test_rscales_zero_centering_vcov(self):
+        """mse=False with zero rscales: center only on rscales > 0 replicates."""
+        from diff_diff.survey import compute_replicate_vcov, ResolvedSurveyDesign
+        from diff_diff.linalg import solve_ols
+
+        np.random.seed(42)
+        n = 100
+        R = 6
+        x = np.random.randn(n)
+        y = 1.0 + 2.0 * x + np.random.randn(n) * 0.5
+        X = np.column_stack([np.ones(n), x])
+        w = np.ones(n)
+
+        # Build JK1-style replicates
+        cluster_size = n // R
+        rep_arr = np.ones((n, R))
+        for r in range(R):
+            start = r * cluster_size
+            end = min((r + 1) * cluster_size, n)
+            # JK1: replicate r drops block r and upweights every other row.
+            # Only column r is written; other replicate columns stay intact.
+            rep_arr[:, r] = np.where(
+                (np.arange(n) >= start) & (np.arange(n) < end), 0.0,
+                R / (R - 1)
+            )
+
+        # rscales with one zero entry
+        rscales = np.array([1.0, 1.0, 0.0, 1.0, 1.0, 1.0])
+
+        coef, _, _ = solve_ols(X, y, weights=w)
+
+        resolved = ResolvedSurveyDesign(
+            weights=w, weight_type="pweight",
+            strata=None, psu=None, fpc=None,
+            n_strata=0, n_psu=0, lonely_psu="remove",
+            replicate_weights=rep_arr,
+            replicate_method="BRR", n_replicates=R,
+            replicate_rscales=rscales, mse=False,
+        )
+        vcov, _nv = compute_replicate_vcov(X, y, coef, resolved)
+
+        # Manual computation: center only on replicates with rscales > 0
+        coef_reps = []
+        for r in range(R):
+            c_r, _, _ = solve_ols(X, y, weights=rep_arr[:, r])
+            coef_reps.append(c_r)
+        coef_reps = np.array(coef_reps)
+        pos_mask = rscales > 0
+        center = np.mean(coef_reps[pos_mask], axis=0)
+        diffs = coef_reps - center[np.newaxis, :]
+        V_manual = np.zeros((2, 2))
+        for r in range(R):
+            V_manual += rscales[r] * np.outer(diffs[r], diffs[r])
+
+        assert np.allclose(np.diag(vcov), np.diag(V_manual), rtol=1e-10)
+
+    def test_rscales_zero_centering_if(self):
+        """mse=False with zero rscales: IF path centers only on rscales > 0."""
+        from diff_diff.survey import compute_replicate_if_variance, ResolvedSurveyDesign
+
+        np.random.seed(42)
+        n = 50
+        R = 5
+        psi = np.random.randn(n) * 0.1
+        w = np.ones(n)
+
+        # Build simple replicates
+        rep_arr = np.ones((n, R))
+        for r in range(R):
+            start = r * (n // R)
+            end = min((r + 1) * (n // R), n)
+            # Column r: zero inside the deleted block, R/(R-1) elsewhere
+            rep_arr[:, r] = np.where(
+                (np.arange(n) >= start) & (np.arange(n) < end), 0.0,
+                R / (R - 1)
+            )
+
+        rscales = np.array([1.0, 0.0, 1.0, 1.0, 1.0])
+
+        resolved = ResolvedSurveyDesign(
+            weights=w, weight_type="pweight",
+            strata=None, psu=None, fpc=None,
+            n_strata=0, n_psu=0, lonely_psu="remove",
+            replicate_weights=rep_arr,
+            replicate_method="BRR", n_replicates=R,
+            replicate_rscales=rscales, mse=False,
+        )
+        var, _nv = compute_replicate_if_variance(psi, resolved)
+
+        # Manual: theta_r = sum((w_r/w) * psi), center on rscales > 0 only
+        theta_full = float(np.sum(psi))
+        theta_reps = np.array([
+            float(np.sum(np.divide(rep_arr[:, r], w, out=np.zeros(n), where=w > 0) * psi))
+            for r in range(R)
+        ])
+        pos_mask = rscales > 0
+        center = float(np.mean(theta_reps[pos_mask]))
+        diffs = theta_reps - center
+        var_manual = float(np.sum(rscales * diffs**2))
+
+        assert var == pytest.approx(var_manual, rel=1e-10)
+
+
 # =============================================================================
 # Estimator-Level Replicate Weight Tests
 # =============================================================================