Skip to content

Commit 703a7fe

Browse files
igerber and claude committed
Exclude zero-weight rows from valid observation count
Define validity as non-NaN AND positive weight so zero-weight padding rows don't inflate {outcome}_n or bypass n_valid < 2 / min_n guards. Add regression test for cell with 1 real + 9 zero-weight observations. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4788166 commit 703a7fe

2 files changed

Lines changed: 32 additions & 1 deletion

File tree

diff_diff/prep.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,8 @@ def _cell_mean_variance(
13511351
"""
13521352
y_cell = y_full[cell_mask]
13531353
w_cell = full_resolved.weights[cell_mask]
1354-
valid = ~np.isnan(y_cell)
1354+
# Valid = non-missing AND positive weight (zero-weight rows are padding)
1355+
valid = ~np.isnan(y_cell) & (w_cell > 0)
13551356
n_valid = int(np.sum(valid))
13561357

13571358
if n_valid == 0:

tests/test_prep.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2452,6 +2452,36 @@ def test_error_all_missing_grouping_keys(self, design):
24522452
survey_design=design_simple,
24532453
)
24542454

2455+
def test_zero_weight_rows_excluded_from_n_valid(self):
2456+
"""Zero-weight rows should not count as valid observations."""
2457+
rng = np.random.RandomState(66)
2458+
# Cell A: 1 positive-weight obs + 9 zero-weight padding
2459+
# With only 1 effective observation, SE should be NaN
2460+
data = pd.DataFrame(
2461+
{
2462+
"geo": ["A"] * 10 + ["B"] * 10,
2463+
"time": np.ones(20, dtype=int),
2464+
"wt": np.concatenate(
2465+
[
2466+
np.array([1.0] + [0.0] * 9), # A: 1 real, 9 padding
2467+
np.ones(10), # B: all real
2468+
]
2469+
),
2470+
"y": rng.normal(10, 2, 20),
2471+
}
2472+
)
2473+
design = SurveyDesign(weights="wt")
2474+
panel, _ = aggregate_survey(data, by=["geo", "time"], outcomes="y", survey_design=design)
2475+
cell_a = panel[panel["geo"] == "A"]
2476+
# Only 1 positive-weight obs → n_valid=1, SE=NaN
2477+
assert cell_a["y_n"].iloc[0] == 1
2478+
assert np.isnan(cell_a["y_se"].iloc[0])
2479+
2480+
cell_b = panel[panel["geo"] == "B"]
2481+
# 10 positive-weight obs → normal SE
2482+
assert cell_b["y_n"].iloc[0] == 10
2483+
assert cell_b["y_se"].iloc[0] > 0
2484+
24552485
def test_duplicate_index(self):
24562486
"""Duplicate DataFrame indices do not break aggregation."""
24572487
rng = np.random.RandomState(77)

0 commit comments

Comments
 (0)