Fix EfficientDiD fweight consistency and BaconDecomposition index safety from PR #226 review (round 5)

igerber · claude · igerber · commit 20f37e126aa1 · 2026-03-21T20:41:37.000-04:00
- EfficientDiD: use the resolved survey weights directly for unit-level
  estimation (Omega*, EIF, cohort fractions) instead of a separately
  renormalized raw weight column, ensuring fweight/aweight consistency with TSL
- BaconDecomposition: store survey weights as a DataFrame column for safe
  label-based subsetting in _recompute_exact_weights, preventing
  out-of-bounds errors on non-default DataFrame indexes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/bacon.py b/diff_diff/bacon.py
@@ -690,8 +690,11 @@ def _recompute_exact_weights(
         within-group variance of the treatment indicator.
         """
         n_total_obs = len(df)
-        w = weights if weights is not None else np.ones(n_total_obs)
-        w_total = np.sum(w)
+        w_arr = weights if weights is not None else np.ones(n_total_obs)
+        # Store weights as a column for safe label-based subsetting
+        df = df.copy()
+        df["_sw"] = w_arr
+        w_total = np.sum(w_arr)
         n_total_units = df[unit].nunique()
 
         for comp in comparisons:
@@ -742,7 +745,7 @@ def _recompute_exact_weights(
                 continue
 
             # Weighted observation counts for the 2x2 sample
-            w_22 = w[df_22.index]
+            w_22 = df_22["_sw"].values
             w_22_sum = np.sum(w_22)
 
             # Sample share of this comparison (weighted)
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -429,17 +429,10 @@ def fit(
         # by taking the first observation per unit (balanced panel, so
         # weights should be constant within unit).
         unit_level_weights: Optional[np.ndarray] = None
-        if survey_weights is not None:
-            # survey_weights is obs-level from _resolve_survey_for_fit
-            # Build a unit-level weight vector aligned with all_units ordering
-            w_col = survey_design.weights if survey_design.weights else None
-            if w_col is not None:
-                w_series = df.groupby(unit)[w_col].first()
-            else:
-                w_series = pd.Series(1.0, index=df[unit].unique())
-            # Normalize unit-level weights (sum = n_units)
-            raw_unit_w = w_series.reindex(all_units).values.astype(float)
-            unit_level_weights = raw_unit_w * (n_units / np.sum(raw_unit_w))
+        if resolved_survey is not None:
+            # Use the resolved survey's weights (already normalized per weight_type)
+            # subset to unit level via _unit_first_panel_row (aligned to all_units)
+            unit_level_weights = self._unit_resolved_survey.weights
 
         cohort_fractions: Dict[float, float] = {}
         if unit_level_weights is not None: