Fix CI review R12: preserve full survey design under auto-inject

igerber · claude · igerber · commit 657b62b7dd09 · 2026-04-17T17:47:24.000-04:00
R11's auto-inject of psu=group filtered `data` to positive-weight rows
before re-resolving the effective SurveyDesign. That silently shrank
`df_survey` on SurveyDesign.subpopulation() inputs without an explicit
PSU — violating the documented subpopulation contract that keeps the
full design intact so t critical values, p-values, CIs, and HonestDiD
bounds match full-design expectations.

Replace the pre-filter with a synthesized PSU column built on a
private copy of `data`:
- Valid group values flow through unchanged as the per-row PSU label.
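- Null group values, as flagged by `isna()` (the edge case on
  zero-weight rows that motivated the R11 filter), are replaced with a
  single shared dummy label so the PSU resolver accepts them. Note that
  the mask is computed from `isna()` alone and is not restricted to
  zero-weight rows.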
- Zero-weight rows contribute psi_i = 0 to the variance, but remain in
  the resolved design so n_psu / n_strata / df_survey reflect the
  full sample — matching the library's subpopulation contract.

Add TestSurveyWithinGroupValidation.test_subpopulation_preserves_full_design_df_survey,
which zero-weights an entire group (mimicking SurveyDesign.subpopulation)
and asserts that the auto-inject df_survey equals the explicit
psu='group' df_survey — the full-design reference.

All 271 targeted tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -672,25 +672,39 @@ def fit(
         ):
             from diff_diff.survey import SurveyDesign as _SurveyDesign
 
-            # Pre-filter zero-weight rows so NaN / invalid group IDs on
-            # excluded subpopulation rows don't block PSU resolution
-            # (group becomes the PSU column after auto-inject). Updates
-            # local bindings only; caller's DataFrame is untouched.
-            pos_mask_sv = np.asarray(survey_weights) > 0
-            if not pos_mask_sv.all():
-                data = data.loc[pos_mask_sv].reset_index(drop=True)
+            # Build a synthesized PSU column on a private copy of data
+            # so the caller's DataFrame is untouched. Valid group values
+            # flow through as their own PSU label; NaN/invalid group
+            # values on zero-weight rows (SurveyDesign.subpopulation()
+            # excluded rows) are replaced with a single shared dummy
+            # label so the PSU resolver accepts them. Zero-weight rows
+            # contribute psi_i = 0 to the variance; keeping them in the
+            # resolved design preserves the full-design df_survey
+            # contract (n_psu / n_strata reflect the full sample, not
+            # the positive-weight subset).
+            psu_col_name = "__dcdh_eff_psu__"
+            synth_data = data.copy()
+            synth_psu = synth_data[group].copy()
+            try:
+                invalid_mask = synth_psu.isna().to_numpy()
+            except (AttributeError, TypeError):
+                invalid_mask = np.zeros(len(synth_psu), dtype=bool)
+            if invalid_mask.any():
+                synth_psu = synth_psu.astype(object)
+                synth_psu.loc[invalid_mask] = "__dcdh_excluded_null_psu__"
+            synth_data[psu_col_name] = synth_psu
 
             eff_design = _SurveyDesign(
                 weights=survey_design.weights,
                 strata=survey_design.strata,
-                psu=group,
+                psu=psu_col_name,
                 fpc=getattr(survey_design, "fpc", None),
                 weight_type=getattr(survey_design, "weight_type", "pweight"),
                 nest=getattr(survey_design, "nest", False),
                 lonely_psu=getattr(survey_design, "lonely_psu", "remove"),
             )
             resolved_survey, survey_weights, _, survey_metadata = (
-                _resolve_survey_for_fit(eff_design, data, "analytical")
+                _resolve_survey_for_fit(eff_design, synth_data, "analytical")
             )
 
         if resolved_survey is not None:
diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py
@@ -1117,6 +1117,46 @@ def test_auto_inject_psu_matches_explicit_group_psu(self, base_data):
             == r_explicit.survey_metadata.df_survey
         )
 
+    def test_subpopulation_preserves_full_design_df_survey(self, base_data):
+        """Under dCDH auto-inject, zero-weighting an entire group must not
+        shrink df_survey below what the full-design PSU count would give.
+
+        Mirrors SurveyDesign.subpopulation() semantics where excluded
+        rows keep their weights at zero but remain in the design so
+        that t critical values, p-values, CIs, and HonestDiD bounds
+        reflect the full sampling structure."""
+        df_ = base_data.copy()
+        df_["pw"] = 1.0
+        # Mimic subpopulation() by zero-weighting one entire group
+        excluded_group = df_["group"].unique()[0]
+        df_.loc[df_["group"] == excluded_group, "pw"] = 0.0
+
+        sd = SurveyDesign(weights="pw")
+        r_subpop = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_, outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            survey_design=sd,
+        )
+        # Reference: explicit psu='group' preserves the full-design
+        # PSU count because the resolver sees all groups (even those
+        # entirely zero-weighted). The auto-inject path must match this.
+        r_explicit = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_, outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            survey_design=SurveyDesign(weights="pw", psu="group"),
+        )
+        assert r_subpop.survey_metadata is not None
+        assert r_explicit.survey_metadata is not None
+        assert (
+            r_subpop.survey_metadata.df_survey
+            == r_explicit.survey_metadata.df_survey
+        ), (
+            f"Auto-inject df_survey={r_subpop.survey_metadata.df_survey} "
+            f"must match explicit psu='group' df_survey="
+            f"{r_explicit.survey_metadata.df_survey} "
+            f"(full-design subpopulation contract)."
+        )
+
     def test_off_horizon_row_duplication_does_not_change_se(self, base_data):
         """Under auto-injected psu=group, duplicating an observation
         within a group (cell mean unchanged because the duplicate matches