@@ -1146,15 +1146,17 @@ def generate_survey_did_data(
11461146 noise_sd : float = 0.5 ,
11471147 include_replicate_weights : bool = False ,
11481148 add_covariates : bool = False ,
1149+ panel : bool = True ,
11491150 seed : Optional [int ] = None ,
11501151) -> pd .DataFrame :
11511152 """
11521153 Generate synthetic staggered DiD data with survey structure.
11531154
1154- Creates a balanced panel with stratified multi-stage sampling design
1155- (strata, PSUs, FPC, sampling weights) and known treatment effects.
1156- The survey structure introduces intra-cluster correlation via PSU
1157- random effects, making design-based SEs larger than naive SEs.
1155+ Creates a balanced panel (or repeated cross-section) with stratified
1156+ multi-stage sampling design (strata, PSUs, FPC, sampling weights) and
1157+ known treatment effects. The survey structure introduces intra-cluster
1158+ correlation via PSU random effects, making design-based SEs larger
1159+ than naive SEs.
11581160
11591161 Modeled on ACS/BRFSS-style stratified household surveys: strata
11601162 represent geographic region types, PSUs are census tracts sampled
@@ -1163,7 +1165,7 @@ def generate_survey_did_data(
11631165 Parameters
11641166 ----------
11651167 n_units : int, default=200
1166- Number of units (respondents).
1168+ Number of units (respondents) per period.
11671169 n_periods : int, default=8
11681170 Number of time periods (1-indexed).
11691171 cohort_periods : list of int, optional
@@ -1196,8 +1198,14 @@ def generate_survey_did_data(
11961198 Standard deviation of idiosyncratic noise.
11971199 include_replicate_weights : bool, default=False
11981200 If True, add JK1 (delete-one-PSU) replicate weight columns.
1201+ Requires at least 2 PSUs.
11991202 add_covariates : bool, default=False
12001203 If True, add covariates x1 (continuous) and x2 (binary).
1204+ panel : bool, default=True
1205+ If True, generate panel data (same respondents across periods).
1206+ If False, generate repeated cross-sections with fresh respondent
1207+ effects and unique unit IDs each period (for use with
1208+ CallawaySantAnna(panel=False)).
12011209 seed : int, optional
12021210 Random seed for reproducibility.
12031211
@@ -1254,24 +1262,47 @@ def generate_survey_did_data(
12541262 unit_cohort [ci : ci + n_g ] = g
12551263 ci += n_g
12561264
1265+ # --- JK1 guard ---
1266+ if include_replicate_weights and n_psu_total < 2 :
1267+ raise ValueError (
1268+ "JK1 replicate weights require at least 2 PSUs, "
1269+ f"got { n_psu_total } ."
1270+ )
1271+
12571272 # --- Random effects ---
12581273 psu_re = rng .normal (0 , psu_re_sd , size = n_psu_total )
12591274 # PSU-period shocks: intra-cluster correlation that survives first-
12601275 # differencing in DiD. Without these, the time-invariant PSU RE
12611276 # cancels in the treatment-vs-control time-difference and the
12621277 # cluster-robust / survey SE would be *smaller* than naive OLS SE.
12631278 psu_period_re = rng .normal (0 , psu_re_sd * 0.5 , size = (n_psu_total , n_periods ))
1264- unit_fe = rng .normal (0 , unit_fe_sd , size = n_units )
1265-
1266- # Covariates (unit-level, time-invariant)
1267- x1 = rng .normal (0 , 1 , size = n_units ) if add_covariates else None
1268- x2 = rng .choice ([0 , 1 ], size = n_units ) if add_covariates else None
12691279
1270- # --- Generate panel ---
1280+ # --- Generate panel or repeated cross-sections ---
12711281 records = []
1272- for i in range (n_units ):
1273- g_i = unit_cohort [i ]
1274- for t in range (1 , n_periods + 1 ):
1282+ for t in range (1 , n_periods + 1 ):
1283+ # Draw respondent effects every period; panel mode keeps only the period-1 draw (reused below)
1284+ unit_fe = rng .normal (0 , unit_fe_sd , size = n_units )
1285+ if panel and t > 1 :
1286+ pass # reuse unit_fe from first period (set below)
1287+ if panel and t == 1 :
1288+ _panel_unit_fe = unit_fe # save for reuse
1289+ if panel and t > 1 :
1290+ unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
1291+
1292+ x1 = rng .normal (0 , 1 , size = n_units ) if add_covariates else None
1293+ if panel and t > 1 and add_covariates :
1294+ x1 = _panel_x1 # type: ignore[possibly-undefined]
1295+ elif panel and t == 1 and add_covariates :
1296+ _panel_x1 = x1
1297+
1298+ x2 = rng .choice ([0 , 1 ], size = n_units ) if add_covariates else None
1299+ if panel and t > 1 and add_covariates :
1300+ x2 = _panel_x2 # type: ignore[possibly-undefined]
1301+ elif panel and t == 1 and add_covariates :
1302+ _panel_x2 = x2
1303+
1304+ for i in range (n_units ):
1305+ g_i = unit_cohort [i ]
12751306 # Outcome: unit FE + PSU RE + PSU-period shock + time trend
12761307 y = unit_fe [i ] + psu_re [unit_psu [i ]] + psu_period_re [unit_psu [i ], t - 1 ] + 0.5 * t
12771308
@@ -1288,8 +1319,11 @@ def generate_survey_did_data(
12881319
12891320 y += rng .normal (0 , noise_sd )
12901321
1322+ # In cross-section mode, each period gets unique unit IDs
1323+ uid = i if panel else (t - 1 ) * n_units + i
1324+
12911325 row = {
1292- "unit" : i ,
1326+ "unit" : uid ,
12931327 "period" : t ,
12941328 "outcome" : y ,
12951329 "first_treat" : g_i ,
0 commit comments