Exclude zero-weight rows from valid observation count

igerber · claude · igerber · commit 703a7fe74b5c · 2026-04-07T11:36:12.000-04:00
Define validity as non-NaN AND positive weight so zero-weight padding
rows don't inflate {outcome}_n or bypass n_valid < 2 / min_n guards.
Add regression test for cell with 1 real + 9 zero-weight observations.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/prep.py b/diff_diff/prep.py
@@ -1351,7 +1351,8 @@ def _cell_mean_variance(
     """
     y_cell = y_full[cell_mask]
     w_cell = full_resolved.weights[cell_mask]
-    valid = ~np.isnan(y_cell)
+    # Valid = non-missing AND positive weight (zero-weight rows are padding)
+    valid = ~np.isnan(y_cell) & (w_cell > 0)
     n_valid = int(np.sum(valid))
 
     if n_valid == 0:
diff --git a/tests/test_prep.py b/tests/test_prep.py
@@ -2452,6 +2452,36 @@ def test_error_all_missing_grouping_keys(self, design):
                 survey_design=design_simple,
             )
 
+    def test_zero_weight_rows_excluded_from_n_valid(self):
+        """Zero-weight rows should not count as valid observations."""
+        rng = np.random.RandomState(66)
+        # Cell A: 1 positive-weight obs + 9 zero-weight padding
+        # With only 1 effective observation, SE should be NaN
+        data = pd.DataFrame(
+            {
+                "geo": ["A"] * 10 + ["B"] * 10,
+                "time": np.ones(20, dtype=int),
+                "wt": np.concatenate(
+                    [
+                        np.array([1.0] + [0.0] * 9),  # A: 1 real, 9 padding
+                        np.ones(10),  # B: all real
+                    ]
+                ),
+                "y": rng.normal(10, 2, 20),
+            }
+        )
+        design = SurveyDesign(weights="wt")
+        panel, _ = aggregate_survey(data, by=["geo", "time"], outcomes="y", survey_design=design)
+        cell_a = panel[panel["geo"] == "A"]
+        # Only 1 positive-weight obs → n_valid=1, SE=NaN
+        assert cell_a["y_n"].iloc[0] == 1
+        assert np.isnan(cell_a["y_se"].iloc[0])
+
+        cell_b = panel[panel["geo"] == "B"]
+        # 10 positive-weight obs → normal SE
+        assert cell_b["y_n"].iloc[0] == 10
+        assert cell_b["y_se"].iloc[0] > 0
+
     def test_duplicate_index(self):
         """Duplicate DataFrame indices do not break aggregation."""
         rng = np.random.RandomState(77)