Skip to content

Commit 7c6b160

Browse files
igerber and claude
committed
Address AI review round 3: ICC covariate variance, DGP REGISTRY notes
- Include covariate variance (0.2725) in ICC calibration when add_covariates=True
- Document informative_sampling structural Y(0) ranking in docstring and REGISTRY.md
- Add cross-section default-weights test and ICC + covariates regression test

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5c3798a commit 7c6b160

3 files changed

Lines changed: 56 additions & 1 deletion

File tree

diff_diff/prep_dgp.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1271,6 +1271,8 @@ def generate_survey_did_data(
12711271
repeated cross-sections, ranking is refreshed each period. Within
12721272
each stratum, rank-based weights are scaled to preserve the
12731273
stratum's baseline weight level from ``weight_variation``.
1274+
Ranking is based on structural Y(0) (unit FE + PSU effects),
1275+
excluding covariates from ``add_covariates``.
12741276
heterogeneous_te_by_strata : bool, default=False
12751277
If True, treatment effect varies by stratum:
12761278
``TE_h = TE * (1 + 0.5 * (h - mean) / std)``. Creates a gap
@@ -1399,8 +1401,11 @@ def generate_survey_did_data(
13991401

14001402
# --- ICC -> psu_re_sd resolution ---
14011403
if icc is not None:
1404+
# Include covariate variance: Var(0.5*x1) + Var(0.3*x2)
1405+
# where x1 ~ N(0,1), x2 ~ Bernoulli(0.5)
1406+
cov_var = (0.25 + 0.09 * 0.25) if add_covariates else 0.0
14021407
psu_re_sd = np.sqrt(
1403-
icc * (unit_fe_sd**2 + noise_sd**2)
1408+
icc * (unit_fe_sd**2 + noise_sd**2 + cov_var)
14041409
/ ((1 - icc) * (1 + psu_period_factor**2))
14051410
)
14061411

docs/methodology/REGISTRY.md

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2515,6 +2515,17 @@ The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al.
25152515
covariates). Paper's Step 8 is "Keep learning." The mandatory with/without covariate
25162516
comparison is a diff-diff convention.
25172517

2518+
### Survey DGP (`generate_survey_did_data`)
2519+
2520+
- **Note:** The `icc` parameter calibrates `psu_re_sd` using the variance
2521+
decomposition `Var(Y) = sigma²_psu * (1 + psu_period_factor²) + sigma²_unit +
2522+
sigma²_noise + sigma²_cov`. When `add_covariates=True`, the covariate variance
2523+
`Var(0.5*x1) + Var(0.3*x2) = 0.2725` is included in the calibration.
2524+
- **Note:** Defensive enhancement: `informative_sampling` ranks units on structural
2525+
Y(0) (unit FE + PSU effects + time trend), excluding covariate contributions from
2526+
`add_covariates`. This models selection on structural characteristics (geography,
2527+
demographics) rather than residual variation, matching real survey sampling frames.
2528+
25182529
---
25192530

25202531
# Version History

tests/test_prep.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1639,6 +1639,45 @@ def test_informative_sampling_cross_section(self):
16391639
corr = np.corrcoef(p1["weight"], p1["outcome"])[0, 1]
16401640
assert corr > 0.1
16411641

1642+
def test_informative_sampling_cross_section_default_weights(self):
1643+
"""Cross-section informative sampling with default weight_variation."""
1644+
from diff_diff.prep_dgp import generate_survey_did_data
1645+
1646+
df = generate_survey_did_data(
1647+
n_units=1000,
1648+
informative_sampling=True,
1649+
panel=False,
1650+
seed=42,
1651+
)
1652+
p1 = df[df["period"] == 1]
1653+
for s in range(5):
1654+
expected_mean = 1.0 + 1.0 * (s / 4)
1655+
stratum_weights = p1.loc[p1["stratum"] == s, "weight"]
1656+
assert abs(stratum_weights.mean() - expected_mean) < 0.15
1657+
assert stratum_weights.std() > 0.01
1658+
1659+
def test_icc_with_covariates(self):
1660+
"""ICC calibration should account for covariate variance."""
1661+
from diff_diff.prep_dgp import generate_survey_did_data
1662+
1663+
target_icc = 0.3
1664+
df = generate_survey_did_data(
1665+
n_units=1000, icc=target_icc, add_covariates=True, seed=42
1666+
)
1667+
# ANOVA-based ICC on period 1
1668+
p1 = df[df["period"] == 1]
1669+
groups = p1.groupby("psu")["outcome"]
1670+
grand_mean = p1["outcome"].mean()
1671+
n_total = len(p1)
1672+
n_groups = groups.ngroups
1673+
n_bar = n_total / n_groups
1674+
ssb = (groups.size() * (groups.mean() - grand_mean) ** 2).sum()
1675+
msb = ssb / (n_groups - 1)
1676+
ssw = groups.apply(lambda x: ((x - x.mean()) ** 2).sum()).sum()
1677+
msw = ssw / (n_total - n_groups)
1678+
realized_icc = (msb - msw) / (msb + (n_bar - 1) * msw)
1679+
assert abs(realized_icc - target_icc) / target_icc < 0.50
1680+
16421681
def test_heterogeneous_te_by_strata(self):
16431682
"""Unweighted mean TE should differ from population ATT."""
16441683
from diff_diff.prep_dgp import generate_survey_did_data

0 commit comments

Comments
 (0)