Fix 4 P1 issues from PR #226 review (round 2)

igerber · claude · igerber · commit eb5ee64f9324 · 2026-03-21T20:41:03.000-04:00
- ContinuousDiD: rescale IFs by n_units before compute_survey_vcov to
  avoid double-counting 1/n bread; use unit-level df_survey
- EfficientDiD: align unit_first_panel_row to sorted all_units order;
  build unit-level ResolvedSurveyDesign once in fit(); use unit-level df
- SunAbraham: thread survey weights into _compute_iw_effects and
  _compute_overall_att for survey-weighted cohort aggregation
- StackedDiD: pass survey df to safe_inference for event-study and
  overall ATT p-values/CIs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/continuous_did.py b/diff_diff/continuous_did.py
@@ -509,8 +509,8 @@ def fit(
                 overall_att_se = analytic["overall_att_se"]
                 overall_acrt_se = analytic["overall_acrt_se"]
 
-                # Survey df for t-distribution inference
-                _survey_df = survey_metadata.df_survey if survey_metadata is not None else None
+                # Survey df for t-distribution inference (unit-level, not panel-level)
+                _survey_df = analytic.get("df_survey")
 
                 overall_att_t, overall_att_p, overall_att_ci = safe_inference(
                     overall_att, overall_att_se, self.alpha, df=_survey_df
@@ -626,7 +626,9 @@ def fit(
                         # Compute SE: survey-aware TSL or standard sqrt(sum(IF^2))
                         if unit_resolved_es is not None:
                             X_ones_es = np.ones((n_units, 1))
-                            vcov_es = compute_survey_vcov(X_ones_es, if_es, unit_resolved_es)
+                            # Rescale IFs from 1/n convention to score scale for TSL
+                            if_es_tsl = if_es * n_units
+                            vcov_es = compute_survey_vcov(X_ones_es, if_es_tsl, unit_resolved_es)
                             es_se = float(np.sqrt(np.abs(vcov_es[0, 0])))
                         else:
                             es_se = float(np.sqrt(np.sum(if_es**2)))
@@ -1162,22 +1164,33 @@ def _compute_analytical_se(
 
             X_ones = np.ones((n_units, 1))
 
+            # Rescale IFs from 1/n convention to score scale for TSL sandwich.
+            # The per-unit IFs contain internal 1/n_t, 1/n_c scaling (for the
+            # unweighted SE = sqrt(sum(IF^2)) convention). compute_survey_vcov
+            # applies its own (X'WX)^{-1} ≈ 1/n bread, which would double-count.
+            # Multiplying by n_units undoes the internal scaling so TSL gives
+            # the correct variance.
+            if_att_glob_tsl = if_att_glob * n_units
+            if_acrt_glob_tsl = if_acrt_glob * n_units
+            if_att_d_tsl = if_att_d * n_units
+            if_acrt_d_tsl = if_acrt_d * n_units
+
             # Overall ATT SE via compute_survey_vcov
-            vcov_att = compute_survey_vcov(X_ones, if_att_glob, unit_resolved)
+            vcov_att = compute_survey_vcov(X_ones, if_att_glob_tsl, unit_resolved)
             overall_att_se = float(np.sqrt(np.abs(vcov_att[0, 0])))
 
             # Overall ACRT SE via compute_survey_vcov
-            vcov_acrt = compute_survey_vcov(X_ones, if_acrt_glob, unit_resolved)
+            vcov_acrt = compute_survey_vcov(X_ones, if_acrt_glob_tsl, unit_resolved)
             overall_acrt_se = float(np.sqrt(np.abs(vcov_acrt[0, 0])))
 
             # Per-grid-point SEs for dose-response curves
             att_d_se = np.zeros(n_grid)
             acrt_d_se = np.zeros(n_grid)
             for d_idx in range(n_grid):
-                vcov_d = compute_survey_vcov(X_ones, if_att_d[:, d_idx], unit_resolved)
+                vcov_d = compute_survey_vcov(X_ones, if_att_d_tsl[:, d_idx], unit_resolved)
                 att_d_se[d_idx] = float(np.sqrt(np.abs(vcov_d[0, 0])))
 
-                vcov_d = compute_survey_vcov(X_ones, if_acrt_d[:, d_idx], unit_resolved)
+                vcov_d = compute_survey_vcov(X_ones, if_acrt_d_tsl[:, d_idx], unit_resolved)
                 acrt_d_se[d_idx] = float(np.sqrt(np.abs(vcov_d[0, 0])))
         else:
             # SE = sqrt(sum(IF_i^2)), matching CallawaySantAnna's convention
@@ -1188,11 +1201,15 @@ def _compute_analytical_se(
             att_d_se = np.sqrt(np.sum(if_att_d**2, axis=0))
             acrt_d_se = np.sqrt(np.sum(if_acrt_d**2, axis=0))
 
+        # Return unit-level survey df when available (for t-distribution inference)
+        unit_df_survey = unit_resolved.df_survey if resolved_survey is not None else None
+
         return {
             "overall_att_se": overall_att_se,
             "overall_acrt_se": overall_acrt_se,
             "att_d_se": att_d_se,
             "acrt_d_se": acrt_d_se,
+            "df_survey": unit_df_survey,
         }
 
     def _run_bootstrap(
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -133,7 +133,7 @@ def __init__(
         self.kernel_bandwidth = kernel_bandwidth
         self.is_fitted_ = False
         self.results_: Optional[EfficientDiDResults] = None
-        self._survey_se_ctx: Optional[tuple] = None
+        self._unit_resolved_survey = None
         self._validate_params()
 
     def _validate_params(self) -> None:
@@ -361,9 +361,45 @@ def fit(
         all_units = sorted(df[unit].unique())
         n_units = len(all_units)
 
-        # Build unit-to-first-panel-row index (for unit-level survey collapse)
-        _first_rows = df.groupby(unit).cumcount() == 0
-        self._unit_first_panel_row = np.where(_first_rows)[0]
+        # Build unit-to-first-panel-row index aligned to all_units (sorted)
+        # order.  The previous approach (groupby cumcount == 0) yielded
+        # first-appearance order which can differ from sorted order when the
+        # input DataFrame is not pre-sorted by unit.
+        first_pos: Dict[Any, int] = {}
+        for i, u in enumerate(df[unit].values):
+            if u not in first_pos:
+                first_pos[u] = i
+        self._unit_first_panel_row = np.array([first_pos[u] for u in all_units])
+
+        # Build unit-level ResolvedSurveyDesign once (avoids repeated
+        # construction in _compute_survey_eif_se and ensures consistent
+        # unit-level df for safe_inference t-distribution).
+        if resolved_survey is not None:
+            from diff_diff.survey import ResolvedSurveyDesign
+
+            row_idx = self._unit_first_panel_row
+            unit_weights_s = resolved_survey.weights[row_idx]
+            unit_strata = (
+                resolved_survey.strata[row_idx] if resolved_survey.strata is not None else None
+            )
+            unit_psu = resolved_survey.psu[row_idx] if resolved_survey.psu is not None else None
+            unit_fpc = resolved_survey.fpc[row_idx] if resolved_survey.fpc is not None else None
+            n_strata_u = len(np.unique(unit_strata)) if unit_strata is not None else 0
+            n_psu_u = len(np.unique(unit_psu)) if unit_psu is not None else 0
+            self._unit_resolved_survey = ResolvedSurveyDesign(
+                weights=unit_weights_s,
+                weight_type=resolved_survey.weight_type,
+                strata=unit_strata,
+                psu=unit_psu,
+                fpc=unit_fpc,
+                n_strata=n_strata_u,
+                n_psu=n_psu_u,
+                lonely_psu=resolved_survey.lonely_psu,
+            )
+            # Use unit-level df (not panel-level) for t-distribution
+            self._survey_df = self._unit_resolved_survey.df_survey
+        else:
+            self._unit_resolved_survey = None
 
         period_to_col = {p: i for i, p in enumerate(time_periods)}
         period_1 = time_periods[0]
@@ -686,11 +722,8 @@ def fit(
 
                 # Analytical SE = sqrt(mean(EIF^2) / n)  [paper p.21]
                 # With survey: use TSL variance via compute_survey_vcov
-                if resolved_survey is not None:
-                    se_gt = self._compute_survey_eif_se(
-                        eif_vals,
-                        resolved_survey,
-                    )
+                if self._unit_resolved_survey is not None:
+                    se_gt = self._compute_survey_eif_se(eif_vals)
                 else:
                     se_gt = float(np.sqrt(np.mean(eif_vals**2) / n_units))
 
@@ -714,12 +747,6 @@ def fit(
                 "Check data has sufficient observations."
             )
 
-        # ----- Store survey context for aggregation SE helpers -----
-        # Temporarily store survey context for use in aggregation helpers.
-        # This avoids threading survey args through the deeply nested
-        # aggregation methods that are also used by the bootstrap mixin.
-        self._survey_se_ctx = resolved_survey if resolved_survey is not None else None
-
         # ----- Aggregation -----
         overall_att, overall_se = self._aggregate_overall(
             group_time_effects, eif_by_gt, n_units, cohort_fractions, unit_cohorts
@@ -752,9 +779,6 @@ def fit(
                 unit_cohorts=unit_cohorts,
             )
 
-        # Clean up temporary survey context
-        self._survey_se_ctx = None
-
         # ----- Bootstrap -----
         bootstrap_results = None
         if self.n_bootstrap > 0 and eif_by_gt:
@@ -855,63 +879,27 @@ def fit(
 
     # -- Survey SE helpers ----------------------------------------------------
 
-    def _compute_survey_eif_se(
-        self,
-        eif_vals: np.ndarray,
-        resolved_survey: Any,
-    ) -> float:
+    def _compute_survey_eif_se(self, eif_vals: np.ndarray) -> float:
         """Compute SE from EIF scores using Taylor Series Linearization.
 
-        The EIF is at unit level (shape n_units).  We collapse the
-        panel-level resolved survey to unit level using the first-panel-row
-        index and pass unit-level arrays to ``compute_survey_vcov``.
-        This avoids the previous bug where expanding EIF to panel rows
-        created one implicit PSU per period-copy, deflating SEs for
-        weights-only and stratified-no-PSU survey designs.
+        Uses the pre-built unit-level ``_unit_resolved_survey`` constructed
+        once in ``fit()``, ensuring consistent unit-level arrays and
+        avoiding repeated subsetting of panel-level survey data.
         """
-        from diff_diff.survey import ResolvedSurveyDesign, compute_survey_vcov
+        from diff_diff.survey import compute_survey_vcov
 
-        row_idx = self._unit_first_panel_row
-        n_units = len(eif_vals)
-
-        # Subset survey arrays to unit level
-        unit_weights = resolved_survey.weights[row_idx]
-        unit_strata = (
-            resolved_survey.strata[row_idx] if resolved_survey.strata is not None else None
-        )
-        unit_psu = resolved_survey.psu[row_idx] if resolved_survey.psu is not None else None
-        unit_fpc = resolved_survey.fpc[row_idx] if resolved_survey.fpc is not None else None
-
-        # Count unique strata/PSU in the unit-level subset
-        n_strata_unit = len(np.unique(unit_strata)) if unit_strata is not None else 0
-        n_psu_unit = len(np.unique(unit_psu)) if unit_psu is not None else 0
-
-        unit_resolved = ResolvedSurveyDesign(
-            weights=unit_weights,
-            weight_type=resolved_survey.weight_type,
-            strata=unit_strata,
-            psu=unit_psu,
-            fpc=unit_fpc,
-            n_strata=n_strata_unit,
-            n_psu=n_psu_unit,
-            lonely_psu=resolved_survey.lonely_psu,
-        )
-
-        X_ones = np.ones((n_units, 1))
-        vcov = compute_survey_vcov(X_ones, eif_vals, unit_resolved)
+        X_ones = np.ones((len(eif_vals), 1))
+        vcov = compute_survey_vcov(X_ones, eif_vals, self._unit_resolved_survey)
         return float(np.sqrt(np.abs(vcov[0, 0])))
 
     def _eif_se(self, eif_vals: np.ndarray, n_units: int) -> float:
         """Compute SE from aggregated EIF scores.
 
-        Dispatches to survey TSL when ``_survey_se_ctx`` is set (during
-        fit), otherwise uses the standard analytical formula.
+        Dispatches to survey TSL when ``_unit_resolved_survey`` is set
+        (during fit), otherwise uses the standard analytical formula.
         """
-        if self._survey_se_ctx is not None:
-            return self._compute_survey_eif_se(
-                eif_vals,
-                self._survey_se_ctx,
-            )
+        if self._unit_resolved_survey is not None:
+            return self._compute_survey_eif_se(eif_vals)
         return float(np.sqrt(np.mean(eif_vals**2) / n_units))
 
     # -- Aggregation helpers --------------------------------------------------
diff --git a/diff_diff/stacked_did.py b/diff_diff/stacked_did.py
@@ -459,7 +459,14 @@ def fit(
                 idx = interaction_indices[h]
                 effect = float(coef[idx])
                 se = float(np.sqrt(max(vcov[idx, idx], 0.0)))
-                t_stat, p_value, conf_int = safe_inference(effect, se, alpha=self.alpha)
+                _survey_df = (
+                    max(survey_metadata.df_survey, 1)
+                    if survey_metadata is not None and survey_metadata.df_survey is not None
+                    else None
+                )
+                t_stat, p_value, conf_int = safe_inference(
+                    effect, se, alpha=self.alpha, df=_survey_df
+                )
                 n_obs_h = int(np.sum((et_vals == h) & (d_vals == 1)))
                 event_study_effects[h] = {
                     "effect": effect,
@@ -489,7 +496,14 @@ def fit(
             overall_att = np.nan
             overall_se = np.nan
 
-        overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha)
+        _survey_df_overall = (
+            max(survey_metadata.df_survey, 1)
+            if survey_metadata is not None and survey_metadata.df_survey is not None
+            else None
+        )
+        overall_t, overall_p, overall_ci = safe_inference(
+            overall_att, overall_se, alpha=self.alpha, df=_survey_df_overall
+        )
 
         # ---- Construct results ----
         self.results_ = StackedDiDResults(
diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py
@@ -625,6 +625,15 @@ def fit(
             resolved_survey=resolved_survey,
         )
 
+        # Resolve survey weight column name for cohort aggregation
+        survey_weight_col = (
+            survey_design.weights
+            if survey_design is not None
+            and hasattr(survey_design, "weights")
+            and survey_design.weights
+            else None
+        )
+
         # Compute interaction-weighted event study effects
         event_study_effects, cohort_weights = self._compute_iw_effects(
             df,
@@ -636,6 +645,7 @@ def fit(
             cohort_ses,
             vcov_cohort,
             coef_index_map,
+            survey_weight_col=survey_weight_col,
         )
 
         # Compute overall ATT (average of post-treatment effects)
@@ -647,6 +657,7 @@ def fit(
             cohort_weights,
             vcov_cohort,
             coef_index_map,
+            survey_weight_col=survey_weight_col,
         )
 
         overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha)
@@ -869,6 +880,7 @@ def _compute_iw_effects(
         cohort_ses: Dict[Tuple[Any, int], float],
         vcov_cohort: np.ndarray,
         coef_index_map: Dict[Tuple[Any, int], int],
+        survey_weight_col: Optional[str] = None,
     ) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[Any, float]]]:
         """
         Compute interaction-weighted event study effects.
@@ -878,6 +890,10 @@ def _compute_iw_effects(
         where w_{g,e} = n_{g,e} / Σ_g n_{g,e} is the share of observations from cohort g
         at event-time e among all treated observations at that event-time.
 
+        When survey weights are provided, n_{g,e} is the survey-weighted mass
+        (sum of weights) rather than raw observation counts, so the estimand
+        reflects the survey-weighted cohort composition.
+
         Returns
         -------
         event_study_effects : dict
@@ -888,8 +904,15 @@ def _compute_iw_effects(
         event_study_effects: Dict[int, Dict[str, Any]] = {}
         cohort_weights: Dict[int, Dict[Any, float]] = {}
 
-        # Pre-compute per-event-time observation counts: n_{g,e}
-        event_time_counts = df[df[first_treat] > 0].groupby([first_treat, "_rel_time"]).size()
+        # Pre-compute per-event-time observation mass: n_{g,e}
+        # With survey weights, use weighted sum; otherwise raw counts.
+        treated_mask = df[first_treat] > 0
+        if survey_weight_col is not None and survey_weight_col in df.columns:
+            event_time_counts = (
+                df[treated_mask].groupby([first_treat, "_rel_time"])[survey_weight_col].sum()
+            )
+        else:
+            event_time_counts = df[treated_mask].groupby([first_treat, "_rel_time"]).size()
 
         for e in rel_periods:
             # Get cohorts that have observations at this relative time
@@ -951,23 +974,31 @@ def _compute_overall_att(
         cohort_weights: Dict[int, Dict[Any, float]],
         vcov_cohort: np.ndarray,
         coef_index_map: Dict[Tuple[Any, int], int],
+        survey_weight_col: Optional[str] = None,
     ) -> Tuple[float, float]:
         """
         Compute overall ATT as weighted average of post-treatment effects.
 
+        When survey weights are provided, the per-period weights use
+        survey-weighted mass rather than raw observation counts.
+
         Returns (att, se) tuple.
         """
         post_effects = [(e, eff) for e, eff in event_study_effects.items() if e >= 0]
 
         if not post_effects:
             return np.nan, np.nan
 
-        # Weight by number of treated observations at each relative time
+        # Weight by (survey-weighted) mass of treated observations at each relative time
         post_weights = []
         post_estimates = []
 
         for e, eff in post_effects:
-            n_at_e = len(df[(df["_rel_time"] == e) & (df[first_treat] > 0)])
+            mask = (df["_rel_time"] == e) & (df[first_treat] > 0)
+            if survey_weight_col is not None and survey_weight_col in df.columns:
+                n_at_e = df.loc[mask, survey_weight_col].sum()
+            else:
+                n_at_e = len(df[mask])
             post_weights.append(max(n_at_e, 1))
             post_estimates.append(eff["effect"])
 
diff --git a/tests/test_sun_abraham.py b/tests/test_sun_abraham.py
@@ -1291,7 +1291,8 @@ def test_variance_fallback_warning(self):
 
         def patched_compute_overall_att(df, first_treat, event_study_effects,
                                         cohort_effects, cohort_weights,
-                                        vcov_cohort, coef_index_map):
+                                        vcov_cohort, coef_index_map,
+                                        survey_weight_col=None):
             # Pass an empty coef_index_map to trigger the fallback
             return original_method(
                 df, first_treat, event_study_effects,