Address P1/P2 review findings: survey-weighted WIF, bootstrap denominator, zero-weight guard

igerber · claude · igerber · commit 66ab529283e6 · 2026-04-04T10:55:05.000-04:00
- Make _compute_wif_contribution() survey-aware: use w_i * 1{G_i=g} - pg_k
  formula when unit_weights present, matching staggered_aggregation.py
- Use explicit sum(unit_level_weights) denominator in bootstrap perturbation
  when survey design is active
- Guard zero-weight cohorts: skip in fit loop, early return in
  compute_generated_outcomes_cov when pi_g <= 0
- Add regression tests: analytical SE differs from unweighted, bootstrap SE
  in ballpark of analytical, zero-weight cohort handled gracefully
- Update tutorial notebook: remove stale note about covariates+survey

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -571,6 +571,7 @@ def fit(
             # Use the resolved survey's weights (already normalized per weight_type)
             # subset to unit level via _unit_first_panel_row (aligned to all_units)
             unit_level_weights = self._unit_resolved_survey.weights
+        self._unit_level_weights = unit_level_weights
 
         cohort_fractions: Dict[float, float] = {}
         if unit_level_weights is not None:
@@ -674,6 +675,15 @@ def fit(
             else:
                 effective_p1_col = period_1_col
 
+            # Guard: skip cohorts with zero survey weight (all units zero-weighted)
+            if cohort_fractions[g] <= 0:
+                warnings.warn(
+                    f"Cohort {g} has zero survey weight; skipping.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                continue
+
             # Estimate all (g, t) cells including pre-treatment. Under PT-Post,
             # pre-treatment cells serve as placebo/pre-trend diagnostics, matching
             # the CallawaySantAnna implementation. Users filter to t >= g for
@@ -976,6 +986,7 @@ def fit(
                 cluster_indices=unit_cluster_indices,
                 n_clusters=n_clusters,
                 resolved_survey=self._unit_resolved_survey,
+                unit_level_weights=self._unit_level_weights,
             )
             # Update estimates with bootstrap inference
             overall_se = bootstrap_results.overall_att_se
@@ -1137,6 +1148,7 @@ def _compute_wif_contribution(
         unit_cohorts: np.ndarray,
         cohort_fractions: Dict[float, float],
         n_units: int,
+        unit_weights: Optional[np.ndarray] = None,
     ) -> np.ndarray:
         """Compute weight influence function correction (O(1) scale, matching EIF).
 
@@ -1156,6 +1168,9 @@ def _compute_wif_contribution(
             ``{cohort: n_cohort / n}`` for each cohort.
         n_units : int
             Total number of units.
+        unit_weights : ndarray, shape (n_units,), optional
+            Survey weights at the unit level.  When provided, uses the
+            survey-weighted WIF formula: IF_i(p_g) = (w_i * 1{G_i=g} - pg_k).
 
         Returns
         -------
@@ -1169,10 +1184,19 @@ def _compute_wif_contribution(
             return np.zeros(n_units)
 
         indicator = (unit_cohorts[:, None] == groups_for_keepers[None, :]).astype(float)
-        indicator_sum = np.sum(indicator - pg_keepers, axis=1)
+
+        if unit_weights is not None:
+            # Survey-weighted WIF (matches staggered_aggregation.py:392-401):
+            # IF_i(p_g) = (w_i * 1{G_i=g} - pg_k), NOT (1{G_i=g} - pg_k)
+            weighted_indicator = indicator * unit_weights[:, None]
+            indicator_diff = weighted_indicator - pg_keepers
+            indicator_sum = np.sum(indicator_diff, axis=1)
+        else:
+            indicator_diff = indicator - pg_keepers
+            indicator_sum = np.sum(indicator_diff, axis=1)
 
         with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
-            if1 = (indicator - pg_keepers) / sum_pg
+            if1 = indicator_diff / sum_pg
             if2 = np.outer(indicator_sum, pg_keepers) / sum_pg**2
             wif_matrix = if1 - if2
             wif_contrib = wif_matrix @ effects
@@ -1229,7 +1253,8 @@ def _aggregate_overall(
 
         # WIF correction: accounts for uncertainty in cohort-size weights
         wif = self._compute_wif_contribution(
-            keepers, effects, unit_cohorts, cohort_fractions, n_units
+            keepers, effects, unit_cohorts, cohort_fractions, n_units,
+            unit_weights=self._unit_level_weights,
         )
         agg_eif_total = agg_eif + wif  # both O(1) scale
 
@@ -1325,7 +1350,8 @@ def _aggregate_event_study(
                 es_keepers = [(g, t) for (g, t) in gt_pairs]
                 es_effects = effs
                 wif = self._compute_wif_contribution(
-                    es_keepers, es_effects, unit_cohorts, cohort_fractions, n_units
+                    es_keepers, es_effects, unit_cohorts, cohort_fractions, n_units,
+                    unit_weights=self._unit_level_weights,
                 )
                 agg_eif = agg_eif + wif
 
diff --git a/diff_diff/efficient_did_bootstrap.py b/diff_diff/efficient_did_bootstrap.py
@@ -63,6 +63,7 @@ def _run_multiplier_bootstrap(
         cluster_indices: Optional[np.ndarray] = None,
         n_clusters: Optional[int] = None,
         resolved_survey: object = None,
+        unit_level_weights: Optional[np.ndarray] = None,
     ) -> EDiDBootstrapResults:
         """Run multiplier bootstrap on stored EIF values.
 
@@ -136,11 +137,19 @@ def _run_multiplier_bootstrap(
         original_atts = np.array([group_time_effects[gt]["effect"] for gt in gt_pairs])
 
         # Perturbed ATTs: (n_bootstrap, n_gt)
+        # Under survey design, normalize by sum(survey_weights) instead of n_units
+        # (pweights are normalized to mean=1, so numerically equivalent, but explicit
+        # for robustness against future weight types)
+        denom = (
+            float(np.sum(unit_level_weights))
+            if unit_level_weights is not None
+            else float(n_units)
+        )
         bootstrap_atts = np.zeros((self.n_bootstrap, n_gt))
         for j, gt in enumerate(gt_pairs):
             eif_gt = eif_by_gt[gt]  # shape (n_units,)
             with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
-                perturbation = (all_weights @ eif_gt) / n_units
+                perturbation = (all_weights @ eif_gt) / denom
             bootstrap_atts[:, j] = original_atts[j] + perturbation
 
         # Post-treatment mask — also exclude NaN effects
diff --git a/diff_diff/efficient_did_covariates.py b/diff_diff/efficient_did_covariates.py
@@ -488,6 +488,10 @@ def compute_generated_outcomes_cov(
     g_mask = cohort_masks[target_g]
     pi_g = cohort_fractions[target_g]
 
+    # Guard: zero survey weight for the target cohort → no DR estimation possible
+    if pi_g <= 0:
+        return np.zeros((n_units, H))
+
     gen_out = np.zeros((n_units, H))
 
     for j, (gp, tpre) in enumerate(valid_pairs):
diff --git a/docs/tutorials/16_survey_did.ipynb b/docs/tutorials/16_survey_did.ipynb
@@ -527,7 +527,42 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "## 9. Which Estimators Support Survey Design?\n\n`diff-diff` supports survey design across all estimators, though the level of support varies:\n\n| Estimator | Weights | Strata/PSU/FPC (TSL) | Replicate Weights | Survey-Aware Bootstrap |\n|-----------|---------|---------------------|-------------------|------------------------|\n| **DifferenceInDifferences** | Full | Full | -- | -- |\n| **TwoWayFixedEffects** | Full | Full | -- | -- |\n| **MultiPeriodDiD** | Full | Full | -- | -- |\n| **CallawaySantAnna** | pweight only | Full | Full | Multiplier at PSU |\n| **TripleDifference** | pweight only | Full | Full (analytical) | -- |\n| **StaggeredTripleDifference** | pweight only | Full | Full | Multiplier at PSU |\n| **SunAbraham** | Full | Full | -- | Rao-Wu rescaled |\n| **StackedDiD** | pweight only | Full (pweight only) | -- | -- |\n| **ImputationDiD** | pweight only | Partial (no FPC) | -- | Multiplier at PSU |\n| **TwoStageDiD** | pweight only | Partial (no FPC) | -- | Multiplier at PSU |\n| **ContinuousDiD** | Full | Full | Full (analytical) | Multiplier at PSU |\n| **EfficientDiD** | Full | Full | Full (analytical) | Multiplier at PSU |\n| **SyntheticDiD** | pweight only | -- | -- | Rao-Wu rescaled |\n| **TROP** | pweight only | -- | -- | Rao-Wu rescaled |\n| **BaconDecomposition** | Diagnostic | Diagnostic | -- | -- |\n\n**Legend:**\n- **Full**: All weight types (pweight/fweight/aweight) + strata/PSU/FPC + Taylor Series Linearization variance\n- **Full (pweight only)**: Full TSL support with strata/PSU/FPC, but only accepts `pweight` weight type (`fweight`/`aweight` rejected because Q-weight composition changes their semantics)\n- **Partial (no FPC)**: Weights + strata (for df) + PSU (for clustering); FPC raises `NotImplementedError`\n- **pweight only** (Weights column): Only `pweight` accepted; `fweight`/`aweight` raise an error\n- **pweight only** (TSL column): Sampling weights for point estimates; no strata/PSU/FPC design elements\n- **Diagnostic**: Weighted descriptive statistics only (no inference)\n- **--**: Not supported\n\n**Note:** `EfficientDiD` does not support `covariates` and `survey_design` simultaneously (the DR nuisance path does not yet thread survey weights). Use `covariates=None` with survey designs.\n\nFor full details, see `docs/survey-roadmap.md`."
+   "source": [
+    "## 9. Which Estimators Support Survey Design?\n",
+    "\n",
+    "`diff-diff` supports survey design across all estimators, though the level of support varies:\n",
+    "\n",
+    "| Estimator | Weights | Strata/PSU/FPC (TSL) | Replicate Weights | Survey-Aware Bootstrap |\n",
+    "|-----------|---------|---------------------|-------------------|------------------------|\n",
+    "| **DifferenceInDifferences** | Full | Full | -- | -- |\n",
+    "| **TwoWayFixedEffects** | Full | Full | -- | -- |\n",
+    "| **MultiPeriodDiD** | Full | Full | -- | -- |\n",
+    "| **CallawaySantAnna** | pweight only | Full | Full | Multiplier at PSU |\n",
+    "| **TripleDifference** | pweight only | Full | Full (analytical) | -- |\n",
+    "| **StaggeredTripleDifference** | pweight only | Full | Full | Multiplier at PSU |\n",
+    "| **SunAbraham** | Full | Full | -- | Rao-Wu rescaled |\n",
+    "| **StackedDiD** | pweight only | Full (pweight only) | -- | -- |\n",
+    "| **ImputationDiD** | pweight only | Partial (no FPC) | -- | Multiplier at PSU |\n",
+    "| **TwoStageDiD** | pweight only | Partial (no FPC) | -- | Multiplier at PSU |\n",
+    "| **ContinuousDiD** | Full | Full | Full (analytical) | Multiplier at PSU |\n",
+    "| **EfficientDiD** | Full | Full | Full (analytical) | Multiplier at PSU |\n",
+    "| **SyntheticDiD** | pweight only | -- | -- | Rao-Wu rescaled |\n",
+    "| **TROP** | pweight only | -- | -- | Rao-Wu rescaled |\n",
+    "| **BaconDecomposition** | Diagnostic | Diagnostic | -- | -- |\n",
+    "\n",
+    "**Legend:**\n",
+    "- **Full**: All weight types (pweight/fweight/aweight) + strata/PSU/FPC + Taylor Series Linearization variance\n",
+    "- **Full (pweight only)**: Full TSL support with strata/PSU/FPC, but only accepts `pweight` weight type (`fweight`/`aweight` rejected because Q-weight composition changes their semantics)\n",
+    "- **Partial (no FPC)**: Weights + strata (for df) + PSU (for clustering); FPC raises `NotImplementedError`\n",
+    "- **pweight only** (Weights column): Only `pweight` accepted; `fweight`/`aweight` raise an error\n",
+    "- **pweight only** (TSL column): Sampling weights for point estimates; no strata/PSU/FPC design elements\n",
+    "- **Diagnostic**: Weighted descriptive statistics only (no inference)\n",
+    "- **--**: Not supported\n",
+    "\n",
+    "**Note:** `EfficientDiD` supports `covariates` and `survey_design` simultaneously. The doubly-robust (DR) path threads survey weights through WLS outcome regression, weighted sieve propensity ratios, and survey-weighted kernel smoothing.\n",
+    "\n",
+    "For full details, see `docs/survey-roadmap.md`."
+   ]
   },
   {
    "cell_type": "markdown",
diff --git a/tests/test_survey_phase3.py b/tests/test_survey_phase3.py
@@ -1044,6 +1044,66 @@ def test_bootstrap_covariates_survey(self, cov_survey_data):
         assert np.isfinite(result.overall_se)
         assert result.overall_se > 0
 
+    def test_analytical_se_differs_from_unweighted(self, cov_survey_data):
+        """Survey analytical SE should differ from unweighted SE."""
+        from diff_diff import EfficientDiD
+
+        sd = SurveyDesign(weights="weight")
+        result_survey = EfficientDiD(n_bootstrap=0).fit(
+            cov_survey_data,
+            "outcome", "unit", "time", "first_treat",
+            covariates=["x1"],
+            survey_design=sd,
+        )
+        result_nosurv = EfficientDiD(n_bootstrap=0).fit(
+            cov_survey_data,
+            "outcome", "unit", "time", "first_treat",
+            covariates=["x1"],
+        )
+        # Non-uniform weights (1.0 + 0.3*stratum) should produce different SEs
+        assert result_survey.overall_se != result_nosurv.overall_se
+        assert np.isfinite(result_survey.overall_se)
+        assert result_survey.overall_se > 0
+
+    def test_bootstrap_se_in_ballpark_of_analytical(self, cov_survey_data):
+        """Bootstrap SE should be in same ballpark as analytical SE."""
+        from diff_diff import EfficientDiD
+
+        sd = SurveyDesign(weights="weight")
+        result_analytical = EfficientDiD(n_bootstrap=0).fit(
+            cov_survey_data,
+            "outcome", "unit", "time", "first_treat",
+            covariates=["x1"],
+            survey_design=sd,
+        )
+        result_boot = EfficientDiD(n_bootstrap=199, seed=42).fit(
+            cov_survey_data,
+            "outcome", "unit", "time", "first_treat",
+            covariates=["x1"],
+            survey_design=sd,
+        )
+        ratio = result_boot.overall_se / result_analytical.overall_se
+        assert 0.3 < ratio < 3.0, (
+            f"Bootstrap/analytical SE ratio {ratio:.2f} outside [0.3, 3.0]"
+        )
+
+    def test_zero_weight_cohort_skipped(self, cov_survey_data):
+        """Near-zero-weight cohort should be handled gracefully (finite ATT and SE)."""
+        from diff_diff import EfficientDiD
+
+        # Set early cohort (first_treat=4) weights to near-zero
+        cov_survey_data = cov_survey_data.copy()
+        cov_survey_data.loc[cov_survey_data["first_treat"] == 4, "weight"] = 1e-15
+        sd = SurveyDesign(weights="weight")
+        result = EfficientDiD(n_bootstrap=0).fit(
+            cov_survey_data,
+            "outcome", "unit", "time", "first_treat",
+            covariates=["x1"],
+            survey_design=sd,
+        )
+        assert np.isfinite(result.overall_att)
+        assert np.isfinite(result.overall_se)
+
 
 # =============================================================================
 # Scale Invariance (applies to all estimators)