@@ -1127,3 +1127,288 @@ def generate_staggered_ddd_data(
11271127 records .append (row )
11281128
11291129 return pd .DataFrame (records )
1130+
1131+
1132+ def generate_survey_did_data (
1133+ n_units : int = 200 ,
1134+ n_periods : int = 8 ,
1135+ cohort_periods : Optional [List [int ]] = None ,
1136+ never_treated_frac : float = 0.3 ,
1137+ treatment_effect : float = 2.0 ,
1138+ dynamic_effects : bool = False ,
1139+ effect_growth : float = 0.3 ,
1140+ n_strata : int = 5 ,
1141+ psu_per_stratum : int = 8 ,
1142+ fpc_per_stratum : float = 200.0 ,
1143+ weight_variation : str = "moderate" ,
1144+ psu_re_sd : float = 2.0 ,
1145+ unit_fe_sd : float = 1.0 ,
1146+ noise_sd : float = 0.5 ,
1147+ include_replicate_weights : bool = False ,
1148+ add_covariates : bool = False ,
1149+ panel : bool = True ,
1150+ seed : Optional [int ] = None ,
1151+ ) -> pd .DataFrame :
1152+ """
1153+ Generate synthetic staggered DiD data with survey structure.
1154+
1155+ Creates a balanced panel (or repeated cross-section) with stratified
1156+ multi-stage sampling design (strata, PSUs, FPC, sampling weights) and
1157+ known treatment effects. The survey structure introduces intra-cluster
1158+ correlation via PSU random effects, making design-based SEs larger
1159+ than naive SEs.
1160+
1161+ Modeled on ACS/BRFSS-style stratified household surveys: strata
1162+ represent geographic region types, PSUs are census tracts sampled
1163+ within each stratum, and weights are inverse selection probabilities.
1164+
1165+ Parameters
1166+ ----------
1167+ n_units : int, default=200
1168+ Number of units (respondents) per period.
1169+ n_periods : int, default=8
1170+ Number of time periods (1-indexed).
1171+ cohort_periods : list of int, optional
1172+ Treatment cohort periods (1-indexed, each must be >= 2 for at least
1173+ one pre-treatment period). Default derived from n_periods; [3, 5]
1174+ when n_periods >= 8. Requires n_periods >= 4 when not specified.
1175+ never_treated_frac : float, default=0.3
1176+ Fraction of units that are never treated.
1177+ treatment_effect : float, default=2.0
1178+ True ATT for treated units.
1179+ dynamic_effects : bool, default=False
1180+ If True, effects grow over time since treatment.
1181+ effect_growth : float, default=0.3
1182+ Per-period effect growth rate when dynamic_effects=True.
1183+ n_strata : int, default=5
1184+ Number of geographic strata.
1185+ psu_per_stratum : int, default=8
1186+ Number of PSUs (census tracts) per stratum.
1187+ fpc_per_stratum : float, default=200.0
1188+ Finite population correction (total tracts per stratum).
1189+ weight_variation : str, default="moderate"
1190+ Controls sampling weight dispersion across strata.
1191+ "none": all weights equal (1.0).
1192+ "moderate": weights range ~1.0-2.0 across strata.
1193+ "high": weights range ~1.0-4.0 across strata.
1194+ psu_re_sd : float, default=2.0
1195+ Standard deviation of PSU random effects. Controls intra-cluster
1196+ correlation and drives DEFF > 1.
1197+ unit_fe_sd : float, default=1.0
1198+ Standard deviation of unit fixed effects.
1199+ noise_sd : float, default=0.5
1200+ Standard deviation of idiosyncratic noise.
1201+ include_replicate_weights : bool, default=False
1202+ If True, add JK1 (delete-one-PSU) replicate weight columns.
1203+ Requires at least 2 PSUs.
1204+ add_covariates : bool, default=False
1205+ If True, add covariates x1 (continuous) and x2 (binary).
1206+ panel : bool, default=True
1207+ If True, generate panel data (same respondents across periods).
1208+ If False, generate repeated cross-sections with fresh respondent
1209+ effects and unique unit IDs each period (for use with
1210+ CallawaySantAnna(panel=False)).
1211+ seed : int, optional
1212+ Random seed for reproducibility.
1213+
1214+ Returns
1215+ -------
1216+ pd.DataFrame
1217+ Columns: unit, period, outcome, first_treat, treated, true_effect,
1218+ stratum, psu, fpc, weight. Also rep_0..rep_K if
1219+ include_replicate_weights=True, and x1, x2 if add_covariates=True.
1220+ """
1221+ rng = np .random .default_rng (seed )
1222+
1223+ # --- Upfront parameter validation ---
1224+ if n_units < 1 :
1225+ raise ValueError (f"n_units must be positive, got { n_units } " )
1226+ if n_periods < 1 :
1227+ raise ValueError (f"n_periods must be positive, got { n_periods } " )
1228+ if n_strata < 1 :
1229+ raise ValueError (f"n_strata must be positive, got { n_strata } " )
1230+ if psu_per_stratum < 1 :
1231+ raise ValueError (f"psu_per_stratum must be positive, got { psu_per_stratum } " )
1232+ if not 0.0 <= never_treated_frac <= 1.0 :
1233+ raise ValueError (
1234+ f"never_treated_frac must be between 0 and 1, got { never_treated_frac } "
1235+ )
1236+ if fpc_per_stratum < psu_per_stratum :
1237+ raise ValueError (
1238+ f"fpc_per_stratum ({ fpc_per_stratum } ) must be >= psu_per_stratum "
1239+ f"({ psu_per_stratum } )"
1240+ )
1241+
1242+ if cohort_periods is None :
1243+ # Derive defaults from n_periods. Cohorts need g >= 2 (at least one
1244+ # pre-period for estimability with CallawaySantAnna).
1245+ if n_periods >= 8 :
1246+ cohort_periods = [3 , 5 ]
1247+ elif n_periods >= 4 :
1248+ cohort_periods = [max (2 , n_periods // 3 ), max (3 , 2 * n_periods // 3 )]
1249+ else :
1250+ raise ValueError (
1251+ f"n_periods={ n_periods } is too small for default cohort_periods "
1252+ f"(need n_periods >= 4 for at least one cohort with a pre-period). "
1253+ f"Pass cohort_periods explicitly for small panels."
1254+ )
1255+ # Coerce array-like to list (handles np.array inputs)
1256+ cohort_periods = list (cohort_periods )
1257+ if not cohort_periods :
1258+ raise ValueError ("cohort_periods must be a non-empty list of integers" )
1259+ for cp in cohort_periods :
1260+ if isinstance (cp , bool ) or not isinstance (cp , (int , np .integer )):
1261+ raise ValueError (
1262+ f"cohort_periods must contain integers, got { cp !r} "
1263+ )
1264+ if cp < 2 or cp > n_periods :
1265+ raise ValueError (
1266+ f"Cohort period { cp } must be between 2 and { n_periods } "
1267+ f"(g >= 2 ensures at least one pre-treatment period)"
1268+ )
1269+
1270+ valid_wv = ("none" , "moderate" , "high" )
1271+ if weight_variation not in valid_wv :
1272+ raise ValueError (
1273+ f"weight_variation must be one of { valid_wv } , got { weight_variation !r} "
1274+ )
1275+
1276+ # --- Survey structure: assign units to strata and PSUs ---
1277+ n_psu_total = n_strata * psu_per_stratum
1278+ units_per_stratum = n_units // n_strata
1279+ remainder = n_units % n_strata
1280+
1281+ unit_stratum = np .empty (n_units , dtype = int )
1282+ unit_psu = np .empty (n_units , dtype = int )
1283+ idx = 0
1284+ for s in range (n_strata ):
1285+ # Distribute remainder units across first strata
1286+ n_s = units_per_stratum + (1 if s < remainder else 0 )
1287+ unit_stratum [idx : idx + n_s ] = s
1288+
1289+ # Assign PSUs within this stratum
1290+ psu_start = s * psu_per_stratum
1291+ for j in range (n_s ):
1292+ unit_psu [idx + j ] = psu_start + (j % psu_per_stratum )
1293+ idx += n_s
1294+
1295+ # Sampling weights: vary by stratum (inverse selection probability)
1296+ scale_map = {"none" : 0.0 , "moderate" : 1.0 , "high" : 3.0 }
1297+ scale = scale_map .get (weight_variation , 1.0 )
1298+ denom = max (n_strata - 1 , 1 )
1299+ unit_weight = 1.0 + scale * (unit_stratum / denom )
1300+
1301+ # --- Treatment assignment (cohort structure) ---
1302+ n_never = int (n_units * never_treated_frac )
1303+ n_treated_total = n_units - n_never
1304+ n_per_cohort = n_treated_total // len (cohort_periods )
1305+
1306+ unit_cohort = np .zeros (n_units , dtype = int )
1307+ ci = n_never
1308+ for i , g in enumerate (cohort_periods ):
1309+ n_g = (
1310+ n_per_cohort
1311+ if i < len (cohort_periods ) - 1
1312+ else n_treated_total - ci + n_never
1313+ )
1314+ unit_cohort [ci : ci + n_g ] = g
1315+ ci += n_g
1316+
1317+ # --- JK1 early guard (configured count; populated count checked after build) ---
1318+ if include_replicate_weights and n_psu_total < 2 :
1319+ raise ValueError (
1320+ "JK1 replicate weights require at least 2 PSUs, "
1321+ f"got { n_psu_total } ."
1322+ )
1323+
1324+ # --- Random effects ---
1325+ psu_re = rng .normal (0 , psu_re_sd , size = n_psu_total )
1326+ # PSU-period shocks: intra-cluster correlation that survives first-
1327+ # differencing in DiD. Without these, the time-invariant PSU RE
1328+ # cancels in the treatment-vs-control time-difference and the
1329+ # cluster-robust / survey SE would be *smaller* than naive OLS SE.
1330+ psu_period_re = rng .normal (0 , psu_re_sd * 0.5 , size = (n_psu_total , n_periods ))
1331+
1332+ # --- Generate panel or repeated cross-sections ---
1333+ records = []
1334+ for t in range (1 , n_periods + 1 ):
1335+ # For repeated cross-sections, draw fresh respondent effects each period
1336+ unit_fe = rng .normal (0 , unit_fe_sd , size = n_units )
1337+ if panel and t > 1 :
1338+ pass # reuse unit_fe from first period (set below)
1339+ if panel and t == 1 :
1340+ _panel_unit_fe = unit_fe # save for reuse
1341+ if panel and t > 1 :
1342+ unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
1343+
1344+ x1 = rng .normal (0 , 1 , size = n_units ) if add_covariates else None
1345+ if panel and t > 1 and add_covariates :
1346+ x1 = _panel_x1 # type: ignore[possibly-undefined]
1347+ elif panel and t == 1 and add_covariates :
1348+ _panel_x1 = x1
1349+
1350+ x2 = rng .choice ([0 , 1 ], size = n_units ) if add_covariates else None
1351+ if panel and t > 1 and add_covariates :
1352+ x2 = _panel_x2 # type: ignore[possibly-undefined]
1353+ elif panel and t == 1 and add_covariates :
1354+ _panel_x2 = x2
1355+
1356+ for i in range (n_units ):
1357+ g_i = unit_cohort [i ]
1358+ # Outcome: unit FE + PSU RE + PSU-period shock + time trend
1359+ y = unit_fe [i ] + psu_re [unit_psu [i ]] + psu_period_re [unit_psu [i ], t - 1 ] + 0.5 * t
1360+
1361+ if add_covariates :
1362+ y += 0.5 * x1 [i ] + 0.3 * x2 [i ]
1363+
1364+ treated = int (g_i > 0 and t >= g_i )
1365+ true_eff = 0.0
1366+ if treated :
1367+ true_eff = treatment_effect
1368+ if dynamic_effects :
1369+ true_eff *= 1 + effect_growth * (t - g_i )
1370+ y += true_eff
1371+
1372+ y += rng .normal (0 , noise_sd )
1373+
1374+ # In cross-section mode, each period gets unique unit IDs
1375+ uid = i if panel else (t - 1 ) * n_units + i
1376+
1377+ row = {
1378+ "unit" : uid ,
1379+ "period" : t ,
1380+ "outcome" : y ,
1381+ "first_treat" : g_i ,
1382+ "treated" : treated ,
1383+ "true_effect" : true_eff ,
1384+ "stratum" : int (unit_stratum [i ]),
1385+ "psu" : int (unit_psu [i ]),
1386+ "fpc" : fpc_per_stratum ,
1387+ "weight" : float (unit_weight [i ]),
1388+ }
1389+ if add_covariates :
1390+ row ["x1" ] = x1 [i ]
1391+ row ["x2" ] = x2 [i ]
1392+ records .append (row )
1393+
1394+ df = pd .DataFrame (records )
1395+
1396+ # --- Replicate weights (JK1 delete-one-PSU) ---
1397+ if include_replicate_weights :
1398+ psu_ids = sorted (df ["psu" ].unique ())
1399+ n_rep = len (psu_ids )
1400+ if n_rep < 2 :
1401+ raise ValueError (
1402+ "JK1 replicate weights require at least 2 populated PSUs, "
1403+ f"got { n_rep } . Increase n_units or decrease psu_per_stratum."
1404+ )
1405+ base_w = df ["weight" ].values
1406+ for r , psu_id in enumerate (psu_ids ):
1407+ w_r = base_w .copy ()
1408+ mask = df ["psu" ].values == psu_id
1409+ w_r [mask ] = 0.0
1410+ # Rescale remaining: k/(k-1) for JK1
1411+ w_r [w_r > 0 ] *= n_rep / (n_rep - 1 )
1412+ df [f"rep_{ r } " ] = w_r
1413+
1414+ return df
0 commit comments