@@ -1127,3 +1127,197 @@ def generate_staggered_ddd_data(
11271127 records .append (row )
11281128
11291129 return pd .DataFrame (records )
1130+
1131+
def generate_survey_did_data(
    n_units: int = 200,
    n_periods: int = 8,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    treatment_effect: float = 2.0,
    dynamic_effects: bool = False,
    effect_growth: float = 0.3,
    n_strata: int = 5,
    psu_per_stratum: int = 8,
    fpc_per_stratum: float = 200.0,
    weight_variation: str = "moderate",
    psu_re_sd: float = 2.0,
    unit_fe_sd: float = 1.0,
    noise_sd: float = 0.5,
    include_replicate_weights: bool = False,
    add_covariates: bool = False,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic staggered DiD data with survey structure.

    Creates a balanced panel with stratified multi-stage sampling design
    (strata, PSUs, FPC, sampling weights) and known treatment effects.
    The survey structure introduces intra-cluster correlation via PSU
    random effects, making design-based SEs larger than naive SEs.

    Modeled on ACS/BRFSS-style stratified household surveys: strata
    represent geographic region types, PSUs are census tracts sampled
    within each stratum, and weights are inverse selection probabilities.

    Parameters
    ----------
    n_units : int, default=200
        Number of units (respondents). Must be >= 1.
    n_periods : int, default=8
        Number of time periods (1-indexed). Must be >= 1.
    cohort_periods : list of int, optional
        Treatment cohort periods. Default: [3, 5]. Must be non-empty and
        every entry must be >= 1 (0 is reserved for never-treated units).
    never_treated_frac : float, default=0.3
        Fraction of units that are never treated. Must lie in [0, 1].
    treatment_effect : float, default=2.0
        True ATT for treated units.
    dynamic_effects : bool, default=False
        If True, effects grow over time since treatment.
    effect_growth : float, default=0.3
        Per-period effect growth rate when dynamic_effects=True.
    n_strata : int, default=5
        Number of geographic strata. Must be >= 1.
    psu_per_stratum : int, default=8
        Number of PSUs (census tracts) per stratum. Must be >= 1.
    fpc_per_stratum : float, default=200.0
        Finite population correction (total tracts per stratum). The same
        value is written to every row.
    weight_variation : str, default="moderate"
        Controls sampling weight dispersion across strata.
        "none": all weights equal (1.0).
        "moderate": weights range ~1.0-2.0 across strata.
        "high": weights range ~1.0-4.0 across strata.
    psu_re_sd : float, default=2.0
        Standard deviation of PSU random effects. Controls intra-cluster
        correlation and drives DEFF > 1. PSU-by-period shocks use half
        this standard deviation.
    unit_fe_sd : float, default=1.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    include_replicate_weights : bool, default=False
        If True, add JK1 (delete-one-PSU) replicate weight columns.
    add_covariates : bool, default=False
        If True, add covariates x1 (continuous) and x2 (binary).
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        One row per (unit, period), ordered by unit then period.
        Columns: unit, period, outcome, first_treat, treated, true_effect,
        stratum, psu, fpc, weight. Also x1, x2 if add_covariates=True, and
        rep_0..rep_{K-1} (K = number of non-empty PSUs) if
        include_replicate_weights=True.

    Raises
    ------
    ValueError
        If a size argument is < 1, ``cohort_periods`` is empty or contains
        a value < 1, ``never_treated_frac`` is outside [0, 1],
        ``weight_variation`` is not one of the documented options, or
        replicate weights are requested with fewer than two non-empty PSUs.

    Notes
    -----
    Units are laid out contiguously by stratum, and cohort labels are
    assigned by unit index (never-treated units first, then each cohort in
    the order given). Treatment timing is therefore correlated with
    stratum membership.
    """
    # --- Input validation ---
    for label, value in (
        ("n_units", n_units),
        ("n_periods", n_periods),
        ("n_strata", n_strata),
        ("psu_per_stratum", psu_per_stratum),
    ):
        if value < 1:
            raise ValueError(f"{label} must be >= 1, got {value!r}")

    cohorts = [3, 5] if cohort_periods is None else list(cohort_periods)
    if not cohorts:
        raise ValueError("cohort_periods must contain at least one period")
    if any(g < 1 for g in cohorts):
        # 0 is the never-treated sentinel in first_treat, so it cannot
        # double as a cohort label.
        raise ValueError(f"cohort_periods entries must be >= 1, got {cohorts!r}")

    if not 0.0 <= never_treated_frac <= 1.0:
        raise ValueError(
            f"never_treated_frac must be in [0, 1], got {never_treated_frac!r}"
        )

    scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
    if weight_variation not in scale_map:
        raise ValueError(
            f"weight_variation must be one of {sorted(scale_map)}, "
            f"got {weight_variation!r}"
        )

    rng = np.random.default_rng(seed)

    # --- Survey structure: assign units to strata and PSUs ---
    n_psu_total = n_strata * psu_per_stratum
    base_size, extra = divmod(n_units, n_strata)
    stratum_sizes = np.full(n_strata, base_size, dtype=int)
    stratum_sizes[:extra] += 1  # remainder units go to the first strata

    unit_stratum = np.repeat(np.arange(n_strata), stratum_sizes)
    # Position of each unit inside its stratum, used to cycle through PSUs.
    stratum_start = np.cumsum(stratum_sizes) - stratum_sizes
    pos_in_stratum = np.arange(n_units) - stratum_start[unit_stratum]
    unit_psu = unit_stratum * psu_per_stratum + pos_in_stratum % psu_per_stratum

    psu_ids = np.unique(unit_psu)  # sorted ids of PSUs that contain units
    if include_replicate_weights and psu_ids.size < 2:
        raise ValueError(
            "JK1 replicate weights require at least 2 non-empty PSUs, "
            f"got {psu_ids.size}"
        )

    # Sampling weights: vary by stratum (inverse selection probability)
    denom = max(n_strata - 1, 1)
    unit_weight = 1.0 + scale_map[weight_variation] * (unit_stratum / denom)

    # --- Treatment assignment (cohort structure) ---
    n_never = int(n_units * never_treated_frac)
    n_per_cohort = (n_units - n_never) // len(cohorts)

    unit_cohort = np.zeros(n_units, dtype=int)
    start = n_never
    for g in cohorts[:-1]:
        unit_cohort[start : start + n_per_cohort] = g
        start += n_per_cohort
    # The last cohort absorbs whatever is left after integer division.
    unit_cohort[start:] = cohorts[-1]

    # --- Random effects ---
    psu_re = rng.normal(0, psu_re_sd, size=n_psu_total)
    # PSU-period shocks: intra-cluster correlation that survives first-
    # differencing in DiD. Without these, the time-invariant PSU RE
    # cancels in the treatment-vs-control time-difference and the
    # cluster-robust / survey SE would be *smaller* than naive OLS SE.
    psu_period_re = rng.normal(0, psu_re_sd * 0.5, size=(n_psu_total, n_periods))
    unit_fe = rng.normal(0, unit_fe_sd, size=n_units)

    # Covariates (unit-level, time-invariant)
    x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
    x2 = rng.choice([0, 1], size=n_units) if add_covariates else None

    # --- Generate panel as (n_units, n_periods) arrays ---
    periods = np.arange(1, n_periods + 1)
    cohort_col = unit_cohort[:, None]

    # Outcome: unit FE + PSU RE + PSU-period shock + time trend
    outcome = (
        unit_fe[:, None]
        + psu_re[unit_psu][:, None]
        + psu_period_re[unit_psu, :]
        + 0.5 * periods[None, :]
    )
    if add_covariates:
        outcome += (0.5 * x1 + 0.3 * x2)[:, None]

    treated = (cohort_col > 0) & (periods[None, :] >= cohort_col)

    effect = np.full((n_units, n_periods), float(treatment_effect))
    if dynamic_effects:
        effect = effect * (1 + effect_growth * (periods[None, :] - cohort_col))
    true_effect = np.where(treated, effect, 0.0)
    outcome += true_effect

    # Noise is drawn unit-major (all periods of unit 0, then unit 1, ...).
    outcome += rng.normal(0, noise_sd, size=(n_units, n_periods))

    n_rows = n_units * n_periods
    columns = {
        "unit": np.repeat(np.arange(n_units), n_periods),
        "period": np.tile(periods, n_units),
        "outcome": outcome.ravel(),
        "first_treat": np.repeat(unit_cohort, n_periods),
        "treated": treated.ravel().astype(int),
        "true_effect": true_effect.ravel(),
        "stratum": np.repeat(unit_stratum, n_periods),
        "psu": np.repeat(unit_psu, n_periods),
        "fpc": np.full(n_rows, float(fpc_per_stratum)),
        "weight": np.repeat(unit_weight, n_periods),
    }
    if add_covariates:
        columns["x1"] = np.repeat(x1, n_periods)
        columns["x2"] = np.repeat(x2, n_periods)

    df = pd.DataFrame(columns)

    # --- Replicate weights (JK1 delete-one-PSU) ---
    if include_replicate_weights:
        # Rescale remaining: k/(k-1) for JK1
        factor = psu_ids.size / (psu_ids.size - 1)
        row_psu = df["psu"].to_numpy()
        rescaled = df["weight"].to_numpy() * factor
        replicates = {
            f"rep_{r}": np.where(row_psu == psu_id, 0.0, rescaled)
            for r, psu_id in enumerate(psu_ids)
        }
        # Attach all replicate columns at once to avoid repeated insertion.
        df = pd.concat([df, pd.DataFrame(replicates, index=df.index)], axis=1)

    return df
0 commit comments