Skip to content

Commit 7d381de

Browse files
igerber and claude committed
Address AI review feedback on survey tutorial
- Fix estimator support table: ImputationDiD and TwoStageDiD now show "Partial (no FPC)" instead of "Full" for strata/PSU/FPC support, matching the code, which explicitly rejects FPC (P1)
- Fix repeated cross-section example: add a panel=False parameter to generate_survey_did_data() that draws fresh respondent effects each period instead of relabeling panel unit IDs (P1)
- Add JK1 minimum-PSU guard: raise ValueError when n_psu < 2 to prevent division by zero in replicate weight generation (P2)
- Clear stale notebook outputs committed from the wrong environment (P2)
- Add top-level import test and JK1 boundary test (P2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b9eed71 commit 7d381de

3 files changed

Lines changed: 152 additions & 104 deletions

File tree

diff_diff/prep_dgp.py

Lines changed: 49 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1146,15 +1146,17 @@ def generate_survey_did_data(
11461146
noise_sd: float = 0.5,
11471147
include_replicate_weights: bool = False,
11481148
add_covariates: bool = False,
1149+
panel: bool = True,
11491150
seed: Optional[int] = None,
11501151
) -> pd.DataFrame:
11511152
"""
11521153
Generate synthetic staggered DiD data with survey structure.
11531154
1154-
Creates a balanced panel with stratified multi-stage sampling design
1155-
(strata, PSUs, FPC, sampling weights) and known treatment effects.
1156-
The survey structure introduces intra-cluster correlation via PSU
1157-
random effects, making design-based SEs larger than naive SEs.
1155+
Creates a balanced panel (or repeated cross-section) with stratified
1156+
multi-stage sampling design (strata, PSUs, FPC, sampling weights) and
1157+
known treatment effects. The survey structure introduces intra-cluster
1158+
correlation via PSU random effects, making design-based SEs larger
1159+
than naive SEs.
11581160
11591161
Modeled on ACS/BRFSS-style stratified household surveys: strata
11601162
represent geographic region types, PSUs are census tracts sampled
@@ -1163,7 +1165,7 @@ def generate_survey_did_data(
11631165
Parameters
11641166
----------
11651167
n_units : int, default=200
1166-
Number of units (respondents).
1168+
Number of units (respondents) per period.
11671169
n_periods : int, default=8
11681170
Number of time periods (1-indexed).
11691171
cohort_periods : list of int, optional
@@ -1196,8 +1198,14 @@ def generate_survey_did_data(
11961198
Standard deviation of idiosyncratic noise.
11971199
include_replicate_weights : bool, default=False
11981200
If True, add JK1 (delete-one-PSU) replicate weight columns.
1201+
Requires at least 2 PSUs.
11991202
add_covariates : bool, default=False
12001203
If True, add covariates x1 (continuous) and x2 (binary).
1204+
panel : bool, default=True
1205+
If True, generate panel data (same respondents across periods).
1206+
If False, generate repeated cross-sections with fresh respondent
1207+
effects and unique unit IDs each period (for use with
1208+
CallawaySantAnna(panel=False)).
12011209
seed : int, optional
12021210
Random seed for reproducibility.
12031211
@@ -1254,24 +1262,47 @@ def generate_survey_did_data(
12541262
unit_cohort[ci : ci + n_g] = g
12551263
ci += n_g
12561264

1265+
# --- JK1 guard ---
1266+
if include_replicate_weights and n_psu_total < 2:
1267+
raise ValueError(
1268+
"JK1 replicate weights require at least 2 PSUs, "
1269+
f"got {n_psu_total}."
1270+
)
1271+
12571272
# --- Random effects ---
12581273
psu_re = rng.normal(0, psu_re_sd, size=n_psu_total)
12591274
# PSU-period shocks: intra-cluster correlation that survives first-
12601275
# differencing in DiD. Without these, the time-invariant PSU RE
12611276
# cancels in the treatment-vs-control time-difference and the
12621277
# cluster-robust / survey SE would be *smaller* than naive OLS SE.
12631278
psu_period_re = rng.normal(0, psu_re_sd * 0.5, size=(n_psu_total, n_periods))
1264-
unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
1265-
1266-
# Covariates (unit-level, time-invariant)
1267-
x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
1268-
x2 = rng.choice([0, 1], size=n_units) if add_covariates else None
12691279

1270-
# --- Generate panel ---
1280+
# --- Generate panel or repeated cross-sections ---
12711281
records = []
1272-
for i in range(n_units):
1273-
g_i = unit_cohort[i]
1274-
for t in range(1, n_periods + 1):
1282+
for t in range(1, n_periods + 1):
1283+
# For repeated cross-sections, draw fresh respondent effects each period
1284+
unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
1285+
if panel and t > 1:
1286+
pass # reuse unit_fe from first period (set below)
1287+
if panel and t == 1:
1288+
_panel_unit_fe = unit_fe # save for reuse
1289+
if panel and t > 1:
1290+
unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
1291+
1292+
x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
1293+
if panel and t > 1 and add_covariates:
1294+
x1 = _panel_x1 # type: ignore[possibly-undefined]
1295+
elif panel and t == 1 and add_covariates:
1296+
_panel_x1 = x1
1297+
1298+
x2 = rng.choice([0, 1], size=n_units) if add_covariates else None
1299+
if panel and t > 1 and add_covariates:
1300+
x2 = _panel_x2 # type: ignore[possibly-undefined]
1301+
elif panel and t == 1 and add_covariates:
1302+
_panel_x2 = x2
1303+
1304+
for i in range(n_units):
1305+
g_i = unit_cohort[i]
12751306
# Outcome: unit FE + PSU RE + PSU-period shock + time trend
12761307
y = unit_fe[i] + psu_re[unit_psu[i]] + psu_period_re[unit_psu[i], t - 1] + 0.5 * t
12771308

@@ -1288,8 +1319,11 @@ def generate_survey_did_data(
12881319

12891320
y += rng.normal(0, noise_sd)
12901321

1322+
# In cross-section mode, each period gets unique unit IDs
1323+
uid = i if panel else (t - 1) * n_units + i
1324+
12911325
row = {
1292-
"unit": i,
1326+
"unit": uid,
12931327
"period": t,
12941328
"outcome": y,
12951329
"first_treat": g_i,

0 commit comments

Comments
 (0)