Add fit-ready weight column mapping NaN precision to 0.0

igerber · claude · igerber · commit f04fe0d18cf4 · 2026-04-07T11:48:56.000-04:00
The returned SurveyDesign now points at a {outcome}_weight column
where NaN/Inf precision values are mapped to 0.0, so downstream
fit() never rejects missing weights. The diagnostic *_precision
column is preserved as-is. Add a stage2-handoff test with a
single-observation cell (NaN precision → zero weight → fit succeeds).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/prep.py b/diff_diff/prep.py
@@ -1453,8 +1453,10 @@ def aggregate_survey(
     panel_df : pd.DataFrame
         Aggregated panel with columns: grouping variables,
         ``{outcome}_mean``, ``{outcome}_se``, ``{outcome}_n``,
-        ``{outcome}_precision``, ``{covariate}_mean``, ``cell_n``,
-        ``cell_n_eff``, ``srs_fallback``.
+        ``{outcome}_precision``, ``{outcome}_weight``,
+        ``{covariate}_mean``, ``cell_n``, ``cell_n_eff``,
+        ``srs_fallback``. ``_weight`` (first outcome only) is a fit-ready
+        copy of ``_precision`` with NaN/Inf mapped to 0.0.
     second_stage_design : SurveyDesign
         Pre-configured for second-stage estimation with
         ``weight_type="aweight"``, precision weights from the first
@@ -1637,9 +1639,18 @@ def aggregate_survey(
     panel_df = panel_df.sort_values(by_cols).reset_index(drop=True)
 
     # --- Construct second-stage SurveyDesign ---
+    # Create a fit-ready weight column: NaN/Inf precision → 0.0 so downstream
+    # resolve() doesn't reject missing weights. Diagnostic *_precision is kept.
     first_outcome = outcome_cols[0]
+    weight_col = f"{first_outcome}_weight"
+    panel_df[weight_col] = np.where(
+        np.isfinite(panel_df[f"{first_outcome}_precision"]),
+        panel_df[f"{first_outcome}_precision"],
+        0.0,
+    )
+
     second_stage_design = SurveyDesign(
-        weights=f"{first_outcome}_precision",
+        weights=weight_col,
         weight_type="aweight",
         psu=by_cols[0],
     )
diff --git a/tests/test_prep.py b/tests/test_prep.py
@@ -2057,7 +2057,7 @@ def test_multiple_outcomes(self, micro_data, design):
         assert "y2_mean" in panel.columns
         assert "y_precision" in panel.columns
         assert "y2_precision" in panel.columns
-        assert stage2.weights == "y_precision"
+        assert stage2.weights == "y_weight"
 
     def test_covariates_mean_only(self, micro_data, design):
         """Covariates get mean column only, no SE/precision."""
@@ -2081,7 +2081,7 @@ def test_returned_survey_design(self, micro_data, design):
             survey_design=design,
         )
         assert stage2.weight_type == "aweight"
-        assert stage2.weights == "y_precision"
+        assert stage2.weights == "y_weight"
         assert stage2.psu == "state"
 
     def test_srs_fallback(self):
@@ -2452,6 +2452,59 @@ def test_error_all_missing_grouping_keys(self, design):
                 survey_design=design_simple,
             )
 
+    def test_stage2_handoff_with_nonfinite_cells(self):
+        """stage2 SurveyDesign works even when some cells have NaN precision."""
+        from diff_diff import DifferenceInDifferences
+
+        rng = np.random.RandomState(99)
+        rows = []
+        for state in range(4):
+            treated = 1 if state < 2 else 0
+            for period in [0, 1]:
+                te = 3.0 if (treated and period == 1) else 0.0
+                n_cell = 30
+                for _ in range(n_cell):
+                    rows.append(
+                        {
+                            "state": state,
+                            "period": period,
+                            "wt": rng.uniform(0.5, 2.0),
+                            "outcome": rng.normal(10 + te, 2),
+                            "treated": treated,
+                        }
+                    )
+        micro = pd.DataFrame(rows)
+        # Make one cell have only 1 observation → NaN SE → NaN precision
+        mask = (micro["state"] == 0) & (micro["period"] == 0)
+        micro = micro.drop(micro[mask].index[1:])  # keep only 1 row
+
+        design = SurveyDesign(weights="wt")
+        panel, stage2 = aggregate_survey(
+            micro,
+            by=["state", "period"],
+            outcomes="outcome",
+            covariates="treated",
+            survey_design=design,
+        )
+
+        # The single-observation cell should have weight=0 (not NaN)
+        cell_00 = panel[(panel["state"] == 0) & (panel["period"] == 0)]
+        assert np.isnan(cell_00["outcome_precision"].iloc[0])  # diagnostic
+        assert cell_00["outcome_weight"].iloc[0] == 0.0  # fit-ready
+
+        # stage2 should work with fit() despite NaN-precision cells
+        panel["treated_bin"] = (panel["treated_mean"] > 0.5).astype(int)
+        did = DifferenceInDifferences()
+        result = did.fit(
+            panel,
+            outcome="outcome_mean",
+            treatment="treated_bin",
+            time="period",
+            survey_design=stage2,
+        )
+        assert np.isfinite(result.att)
+        assert np.isfinite(result.se)
+
     def test_zero_weight_rows_excluded_from_n_valid(self):
         """Zero-weight rows should not count as valid observations."""
         rng = np.random.RandomState(66)