Fix CI review R5: survey TWFE math consistency + zero-weight row filter

igerber · claude · igerber · commit aeca1d8e5a6f · 2026-04-16T18:22:41.000-04:00
- P1 #1: _compute_twfe_diagnostic now uses cell_weight (w_gt when available,
  else n_gt) for the FE regressions, the normalization denominator, the
  contribution weights, and the Corollary 1 observation shares. On
  survey-backed inputs the outputs now match the observation-level pweighted
  TWFE estimand; the non-survey path is byte-identical.
- P1 #2: Zero-weight rows are dropped before the groupby in
  _validate_and_aggregate_to_cells when weights are provided, so that
  d_min/d_max/n_gt reflect the effective sample. This prevents zero-weight
  subpopulation rows from tripping the fuzzy-DiD guard or inflating
  downstream n_gt counts.
- P2: Two new regression tests in test_survey_dcdh.py:
  TestSurveyTWFEOracle.test_survey_twfe_matches_obs_level_pweighted_ols
  verifies that beta_fe matches an observation-level pweighted OLS under
  survey (it would fail if n_gt were still used), and
  TestZeroWeightSubpopulation.test_mixed_zero_weight_row_excluded_from_validation
  verifies that an injected zero-weight row with the opposite treatment
  value does not trip the within-cell constancy check.

All 256 targeted tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -210,6 +210,18 @@ def _validate_and_aggregate_to_cells(
 
     # 5. Cell aggregation (compute min/max for within-cell check)
     if weights is not None:
+        # Zero-weight rows are out-of-sample (e.g., via
+        # SurveyDesign.subpopulation()). Pre-filter them before the
+        # groupby so that d_min/d_max/n_gt reflect the effective sample
+        # and a zero-weight obs cannot trip the within-cell treatment-
+        # constancy check or inflate downstream n_gt counts.
+        weights_arr = np.asarray(weights, dtype=np.float64)
+        pos_mask = weights_arr > 0
+        if not pos_mask.all():
+            df = df.loc[pos_mask].reset_index(drop=True)
+            weights_arr = weights_arr[pos_mask]
+        weights = weights_arr
+
         # Survey-weighted cell aggregation.
         # y_gt = sum(w_i * y_i) / sum(w_i) within each (g, t) cell.
         # Treatment is constant within cells (checked below), so weighted
@@ -4828,7 +4840,16 @@ def _compute_twfe_diagnostic(
     """
     X, _ = _build_group_time_design(cell, group_col, time_col)
     d_arr = cell["d_gt"].to_numpy().astype(float)
-    n_arr = cell["n_gt"].to_numpy().astype(float)
+    # Cell weight for Theorem 1: under survey_design, survey-weighted
+    # cell totals (w_gt) replace raw cell counts (n_gt) so the FE
+    # regressions, normalization denominator, and Corollary 1 shares
+    # match the observation-level pweighted TWFE estimand. Without
+    # survey_design (w_gt column absent), fall back to n_gt — the
+    # non-survey path is unchanged.
+    if "w_gt" in cell.columns:
+        cell_weight = cell["w_gt"].to_numpy().astype(float)
+    else:
+        cell_weight = cell["n_gt"].to_numpy().astype(float)
     y_arr = cell["y_gt"].to_numpy().astype(float)
 
     # Step 1-2: regress d on FE
@@ -4837,13 +4858,13 @@ def _compute_twfe_diagnostic(
         d_arr,
         return_vcov=False,
         rank_deficient_action=rank_deficient_action,
-        weights=n_arr,
+        weights=cell_weight,
     )
     eps = residuals_d
 
     # Step 3: per-cell weights — normalize by sum over treated cells
     treated_mask = d_arr == 1
-    denom = float((n_arr[treated_mask] * eps[treated_mask]).sum())
+    denom = float((cell_weight[treated_mask] * eps[treated_mask]).sum())
     if denom == 0:
         # Cannot normalize: the design has zero treated mass after FE absorption.
         # Warn so the user knows the diagnostic returned NaN values rather than
@@ -4866,12 +4887,14 @@ def _compute_twfe_diagnostic(
             sigma_fe=float("nan"),
             beta_fe=float("nan"),
         )
-    w_gt = (n_arr * eps) / denom
+    contribution_weights = (cell_weight * eps) / denom
 
     weights_df = cell[[group_col, time_col]].copy()
-    weights_df["weight"] = w_gt
+    weights_df["weight"] = contribution_weights
 
-    fraction_negative = float((w_gt[treated_mask] < 0).sum() / treated_mask.sum())
+    fraction_negative = float(
+        (contribution_weights[treated_mask] < 0).sum() / treated_mask.sum()
+    )
 
     # Step 5: plain TWFE regression of y on (FE + d_gt)
     X_with_d = np.column_stack([X, d_arr.reshape(-1, 1)])
@@ -4880,7 +4903,7 @@ def _compute_twfe_diagnostic(
         y_arr,
         return_vcov=False,
         rank_deficient_action=rank_deficient_action,
-        weights=n_arr,
+        weights=cell_weight,
     )
     beta_fe = float(coef_fe[-1])
 
@@ -4897,12 +4920,14 @@ def _compute_twfe_diagnostic(
     #   sigma(w) = sqrt(sum_treated(s * (w_paper - 1)^2))
     #   sigma_fe = |beta_fe| / sigma(w)
     #
-    # where s_{g,t} = N_{g,t} / N_1 are observation shares.
+    # where s_{g,t} = N_{g,t} / N_1 are observation shares (under
+    # survey_design, cell_weight is w_gt so shares are effective-
+    # weight shares; non-survey path is byte-identical).
     eps_treated = eps[treated_mask]
-    n_treated_arr = n_arr[treated_mask]
-    n1 = float(n_treated_arr.sum())  # total treated observations
-    if n1 > 0:
-        shares = n_treated_arr / n1  # s_{g,t} = N_{g,t} / N_1
+    w_treated_arr = cell_weight[treated_mask]
+    w1 = float(w_treated_arr.sum())  # total treated weight (N_1 or W_1)
+    if w1 > 0:
+        shares = w_treated_arr / w1  # s_{g,t} = w_{g,t} / w_1
         denom_paper = float((shares * eps_treated).sum())
         if abs(denom_paper) > 0:
             w_paper = eps_treated / denom_paper  # paper's w_{g,t}
diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py
@@ -725,3 +725,97 @@ def test_twfe_helper_rejects_non_pweight(self, base_data):
                 time="period", treatment="treatment",
                 survey_design=sd,
             )
+
+
+# ── Test: TWFE diagnostic oracle under survey ───────────────────────
+
+
+class TestSurveyTWFEOracle:
+    """Oracle test: under survey, ``beta_fe`` from the TWFE helper must equal
+    an observation-level pweighted TWFE fit (i.e. w_gt is used, not n_gt)."""
+
+    def test_survey_twfe_matches_obs_level_pweighted_ols(self, data_with_survey):
+        from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights
+        from diff_diff.linalg import solve_ols
+
+        helper = twowayfeweights(
+            data_with_survey,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+            survey_design=SurveyDesign(weights="pw"),
+        )
+        assert np.isfinite(helper.beta_fe)
+
+        # Observation-level design: intercept, group and period dummies (first
+        # sorted level dropped as reference), treatment indicator last.
+        obs = data_with_survey.copy()
+        group_levels = sorted(obs["group"].unique())
+        period_levels = sorted(obs["period"].unique())
+        group_code = obs["group"].map(
+            {level: k for k, level in enumerate(group_levels)}
+        ).to_numpy()
+        period_code = obs["period"].map(
+            {level: k for k, level in enumerate(period_levels)}
+        ).to_numpy()
+        # Dummy column j flags level j + 1; reference-level rows stay all zero.
+        group_dummies = (
+            group_code[:, None] == np.arange(1, len(group_levels))
+        ).astype(float)
+        period_dummies = (
+            period_code[:, None] == np.arange(1, len(period_levels))
+        ).astype(float)
+        treat_col = obs["treatment"].to_numpy().astype(float).reshape(-1, 1)
+        X_obs = np.hstack(
+            [np.ones((len(obs), 1)), group_dummies, period_dummies, treat_col]
+        )
+        y_obs = obs["outcome"].to_numpy().astype(float)
+        w_obs = obs["pw"].to_numpy().astype(float)
+
+        beta_hat, _, _ = solve_ols(
+            X_obs, y_obs, weights=w_obs, weight_type="pweight", return_vcov=False
+        )
+        beta_oracle = float(beta_hat[-1])
+        # Cell-level WLS on w_gt must reproduce this observation-level WLS; the
+        # fixture has one observation per cell.
+        assert helper.beta_fe == pytest.approx(beta_oracle, rel=1e-6), (
+            f"helper.beta_fe={helper.beta_fe} oracle={beta_oracle} "
+            f"— TWFE diagnostic must use w_gt under survey"
+        )
+
+
+# ── Test: Zero-weight subpopulation exclusion ──────────────────────
+
+
+class TestZeroWeightSubpopulation:
+    """Zero-weight rows must not trip fuzzy-DiD guard or inflate counts."""
+
+    def test_mixed_zero_weight_row_excluded_from_validation(self, base_data):
+        """A cell with a positive-weight treated obs and a zero-weight
+        obs with a different treatment value must fit cleanly — the
+        zero-weight row is out-of-sample (SurveyDesign.subpopulation())."""
+        df_ = base_data.copy()
+        df_["pw"] = 1.0
+        # Pick a treated (g, t) cell. Add a zero-weight row in the same
+        # cell with the opposite treatment value. Unweighted d_min != d_max
+        # would trip the fuzzy-DiD guard; pre-filtering zero-weight rows
+        # must bypass it.
+        treated_mask = df_["treatment"] == 1
+        if not treated_mask.any():
+            pytest.skip("no treated row in fixture")
+        # iloc[[0]] keeps a one-row DataFrame, so column dtypes survive concat.
+        injected = df_.loc[treated_mask].iloc[[0]].copy()
+        injected["treatment"] = 0  # flipped relative to the treated cell
+        injected["pw"] = 0.0  # zero weight => out-of-sample
+        df_ = pd.concat([df_, injected], ignore_index=True)
+        sd = SurveyDesign(weights="pw")
+
+        # Must succeed (not raise fuzzy-DiD ValueError)
+        result = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_,
+            outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            survey_design=sd,
+        )
+        assert np.isfinite(result.overall_att)