diff --git a/TODO.md b/TODO.md index 1b93bf9e..b996a903 100644 --- a/TODO.md +++ b/TODO.md @@ -54,6 +54,9 @@ Deferred items from PR reviews that were not addressed before merge. | EfficientDiD: API docs / tutorial page for new public estimator | `docs/` | #192 | Medium | | Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium | | TripleDifference power: `generate_ddd_data` is a fixed 2×2×2 cross-sectional DGP — no multi-period or unbalanced-group support. Add a `generate_ddd_panel_data` for panel DDD power analysis. | `prep_dgp.py`, `power.py` | #208 | Low | +| ContinuousDiD event-study aggregation does not filter by `anticipation` — uses all (g,t) cells instead of anticipation-filtered subset; pre-existing in both survey and non-survey paths | `continuous_did.py` | #226 | Medium | +| Survey design resolution/collapse patterns are inconsistent across panel estimators — ContinuousDiD rebuilds unit-level design in SE code, EfficientDiD builds once in fit(), StackedDiD re-resolves on stacked data; extract shared helpers for panel-to-unit collapse, post-filter re-resolution, and metadata recomputation | `continuous_did.py`, `efficient_did.py`, `stacked_did.py` | #226 | Low | +| Duplicated survey metadata summary formatting across 6 results classes — extract shared `_format_survey_metadata(sm, width)` helper to reduce maintenance burden as more estimators gain survey support in Phases 4-5 | `results.py`, `stacked_did_results.py`, `sun_abraham.py`, `bacon.py`, `triple_diff.py`, `continuous_did_results.py`, `efficient_did_results.py` | #226 | Low | #### Performance diff --git a/diff_diff/bacon.py b/diff_diff/bacon.py index 8f09d9fc..67fe794d 100644 --- a/diff_diff/bacon.py +++ b/diff_diff/bacon.py @@ -110,6 +110,8 @@ class BaconDecompositionResults: timing_groups: List[Any] n_obs: int = 0 decomposition_error: float = field(default=0.0) + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: return ( @@ -137,30 +139,56 @@ def summary(self) -> str: f"{'Never-treated units:':<35} {self.n_never_treated:>10}", f"{'Total 2x2 comparisons:':<35} {len(self.comparisons):>10}", "", - "-" * 85, - "TWFE Decomposition".center(85), - "-" * 85, - "", - f"{'TWFE Estimate:':<35} {self.twfe_estimate:>12.4f}", - f"{'Weighted Sum of 2x2 Estimates:':<35} {self._weighted_sum():>12.4f}", - f"{'Decomposition Error:':<35} {self.decomposition_error:>12.6f}", - "", ] + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "-" * 85, + "Survey Design".center(85), + "-" * 85, + f"{'Weight type:':<35} {sm.weight_type:>10}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<35} {sm.n_strata:>10}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<35} {sm.n_psu:>10}") + lines.append(f"{'Effective sample size:':<35} {sm.effective_n:>10.1f}") + lines.append(f"{'Design effect (DEFF):':<35} {sm.design_effect:>10.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<35} {sm.df_survey:>10}") + lines.extend(["-" * 85, ""]) + + lines.extend( + [ + "-" * 85, + "TWFE Decomposition".center(85), + "-" * 85, + "", + f"{'TWFE Estimate:':<35} {self.twfe_estimate:>12.4f}", + f"{'Weighted Sum of 2x2 Estimates:':<35} {self._weighted_sum():>12.4f}", + f"{'Decomposition Error:':<35} {self.decomposition_error:>12.6f}", + "", + ] + ) + # Weight breakdown by comparison type - lines.extend([ - "-" * 85, - "Weight Breakdown by Comparison Type".center(85), - "-" * 85, - f"{'Comparison Type':<30} {'Weight':>12} {'Avg Effect':>12} {'Contribution':>12}", - "-" * 85, - ]) + lines.extend( + [ + "-" * 85, + "Weight Breakdown by Comparison Type".center(85), + "-" * 85, + f"{'Comparison Type':<30} {'Weight':>12} {'Avg Effect':>12} {'Contribution':>12}", + "-" * 85, + ] + ) # Treated vs Never-treated if self.total_weight_treated_vs_never > 0: - contrib = self.total_weight_treated_vs_never * ( - self.weighted_avg_treated_vs_never or 0 - ) + contrib = self.total_weight_treated_vs_never * (self.weighted_avg_treated_vs_never or 0) lines.append( f"{'Treated vs Never-treated':<30} " f"{self.total_weight_treated_vs_never:>12.4f} " @@ -170,9 +198,7 @@ def summary(self) -> str: # Earlier vs Later if self.total_weight_earlier_vs_later > 0: - contrib = self.total_weight_earlier_vs_later * ( - self.weighted_avg_earlier_vs_later or 0 - ) + contrib = self.total_weight_earlier_vs_later * (self.weighted_avg_earlier_vs_later or 0) lines.append( f"{'Earlier vs Later treated':<30} " f"{self.total_weight_earlier_vs_later:>12.4f} " @@ -182,9 +208,7 @@ def summary(self) -> str: # Later vs Earlier (forbidden) if self.total_weight_later_vs_earlier > 0: - contrib = self.total_weight_later_vs_earlier * ( - self.weighted_avg_later_vs_earlier or 0 - ) + contrib = self.total_weight_later_vs_earlier * (self.weighted_avg_later_vs_earlier or 0) lines.append( f"{'Later vs Earlier (forbidden)':<30} " f"{self.total_weight_later_vs_earlier:>12.4f} " @@ -192,27 +216,29 @@ def summary(self) -> str: f"{contrib:>12.4f}" ) - lines.extend([ - "-" * 85, - f"{'Total':<30} {self._total_weight():>12.4f} " - f"{'':>12} {self._weighted_sum():>12.4f}", - "-" * 85, - "", - ]) + lines.extend( + [ + "-" * 85, + f"{'Total':<30} {self._total_weight():>12.4f} " + f"{'':>12} {self._weighted_sum():>12.4f}", + "-" * 85, + "", + ] + ) # Warning about forbidden comparisons if self.total_weight_later_vs_earlier > 0.01: pct = self.total_weight_later_vs_earlier * 100 - lines.extend([ - "WARNING: {:.1f}% of weight is on 'forbidden' comparisons where".format( - pct - ), - "already-treated units serve as controls. This can bias TWFE", - "when treatment effects are heterogeneous over time.", - "", - "Consider using Callaway-Sant'Anna or other robust estimators.", - "", - ]) + lines.extend( + [ + "WARNING: {:.1f}% of weight is on 'forbidden' comparisons where".format(pct), + "already-treated units serve as controls. This can bias TWFE", + "when treatment effects are heterogeneous over time.", + "", + "Consider using Callaway-Sant'Anna or other robust estimators.", + "", + ] + ) lines.append("=" * 85) @@ -241,17 +267,19 @@ def to_dataframe(self) -> pd.DataFrame: """ rows = [] for c in self.comparisons: - rows.append({ - "treated_group": c.treated_group, - "control_group": c.control_group, - "comparison_type": c.comparison_type, - "estimate": c.estimate, - "weight": c.weight, - "n_treated": c.n_treated, - "n_control": c.n_control, - "time_start": c.time_window[0], - "time_end": c.time_window[1], - }) + rows.append( + { + "treated_group": c.treated_group, + "control_group": c.control_group, + "comparison_type": c.comparison_type, + "estimate": c.estimate, + "weight": c.weight, + "n_treated": c.n_treated, + "n_control": c.n_control, + "time_start": c.time_window[0], + "time_end": c.time_window[1], + } + ) return pd.DataFrame(rows) def weight_by_type(self) -> Dict[str, float]: @@ -392,9 +420,7 @@ def __init__(self, weights: str = "approximate"): - "exact": Variance-based weights from Goodman-Bacon (2021) """ if weights not in ("approximate", "exact"): - raise ValueError( - f"weights must be 'approximate' or 'exact', got '{weights}'" - ) + raise ValueError(f"weights must be 'approximate' or 'exact', got '{weights}'") self.weights = weights self.results_: Optional[BaconDecompositionResults] = None self.is_fitted_: bool = False @@ -406,6 +432,7 @@ def fit( unit: str, time: str, first_treat: str, + survey_design=None, ) -> BaconDecompositionResults: """ Perform the Goodman-Bacon decomposition. @@ -423,6 +450,10 @@ def fit( first_treat : str Name of column indicating when unit was first treated. Use 0 (or np.inf) for never-treated units. + survey_design : SurveyDesign, optional + Survey design specification for weighted estimation. + When provided, all means and group shares use survey weights. + The decomposition remains diagnostic (no survey vcov needed). Returns ------- @@ -440,6 +471,22 @@ def fit( if missing: raise ValueError(f"Missing columns: {missing}") + # Resolve survey design if provided + from diff_diff.survey import _resolve_survey_for_fit + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Validate within-unit constancy for exact survey weights only. + # The exact-weight path collapses to per-unit weights via groupby().first(), + # which requires constant survey columns within units. The approximate path + # uses observation-level weighted means and does not need this constraint. + if resolved_survey is not None and self.weights == "exact": + from diff_diff.survey import _validate_unit_constant_survey + + _validate_unit_constant_survey(data, unit, survey_design) + # Create working copy df = data.copy() @@ -463,22 +510,21 @@ def fit( # Identify never-treated and timing groups # Never-treated: first_treat = 0 or inf never_treated_mask = (df[first_treat] == 0) | (df[first_treat] == np.inf) - timing_groups = sorted([g for g in df[first_treat].unique() - if g > 0 and g != np.inf]) + timing_groups = sorted([g for g in df[first_treat].unique() if g > 0 and g != np.inf]) # Get unit-level treatment timing - unit_info = df.groupby(unit).agg({first_treat: 'first'}).reset_index() - n_never_treated = ( - (unit_info[first_treat] == 0) | (unit_info[first_treat] == np.inf) - ).sum() + unit_info = df.groupby(unit).agg({first_treat: "first"}).reset_index() + n_never_treated = ((unit_info[first_treat] == 0) | (unit_info[first_treat] == np.inf)).sum() # Create treatment indicator (D_it = 1 if treated at time t) # Use unique internal name to avoid conflicts with user data - _TREAT_COL = '__bacon_treated_internal__' + _TREAT_COL = "__bacon_treated_internal__" df[_TREAT_COL] = (~never_treated_mask) & (df[time] >= df[first_treat]) # First, compute TWFE estimate for reference - twfe_estimate = self._compute_twfe(df, outcome, unit, time, _TREAT_COL) + twfe_estimate = self._compute_twfe( + df, outcome, unit, time, _TREAT_COL, weights=survey_weights + ) # Perform decomposition comparisons = [] @@ -487,26 +533,49 @@ def fit( if n_never_treated > 0: for g in timing_groups: comp = self._compute_treated_vs_never( - df, outcome, unit, time, first_treat, g, time_periods + df, + outcome, + unit, + time, + first_treat, + g, + time_periods, + weights=survey_weights, ) if comp is not None: comparisons.append(comp) # 2. Timing group comparisons (earlier vs later and later vs earlier) for i, g_early in enumerate(timing_groups): - for g_late in timing_groups[i + 1:]: + for g_late in timing_groups[i + 1 :]: # Earlier vs Later: g_early treated, g_late as control comp_early = self._compute_timing_comparison( - df, outcome, unit, time, first_treat, - g_early, g_late, time_periods, "earlier_vs_later" + df, + outcome, + unit, + time, + first_treat, + g_early, + g_late, + time_periods, + "earlier_vs_later", + weights=survey_weights, ) if comp_early is not None: comparisons.append(comp_early) # Later vs Earlier: g_late treated, g_early as control (forbidden) comp_late = self._compute_timing_comparison( - df, outcome, unit, time, first_treat, - g_late, g_early, time_periods, "later_vs_earlier" + df, + outcome, + unit, + time, + first_treat, + g_late, + g_early, + time_periods, + "later_vs_earlier", + weights=survey_weights, ) if comp_late is not None: comparisons.append(comp_late) @@ -514,7 +583,14 @@ def fit( # Recompute exact weights if requested if self.weights == "exact": self._recompute_exact_weights( - comparisons, df, outcome, unit, time, first_treat, time_periods + comparisons, + df, + outcome, + unit, + time, + first_treat, + time_periods, + weights=survey_weights, ) # Normalize weights to sum to 1 @@ -524,10 +600,12 @@ def fit( c.weight = c.weight / total_weight # Calculate weight totals and weighted averages by type - weight_by_type = {"treated_vs_never": 0.0, "earlier_vs_later": 0.0, - "later_vs_earlier": 0.0} - weighted_sum_by_type = {"treated_vs_never": 0.0, "earlier_vs_later": 0.0, - "later_vs_earlier": 0.0} + weight_by_type = {"treated_vs_never": 0.0, "earlier_vs_later": 0.0, "later_vs_earlier": 0.0} + weighted_sum_by_type = { + "treated_vs_never": 0.0, + "earlier_vs_later": 0.0, + "later_vs_earlier": 0.0, + } for c in comparisons: weight_by_type[c.comparison_type] += c.weight @@ -537,9 +615,7 @@ def fit( avg_by_type = {} for ctype in weight_by_type: if weight_by_type[ctype] > 0: - avg_by_type[ctype] = ( - weighted_sum_by_type[ctype] / weight_by_type[ctype] - ) + avg_by_type[ctype] = weighted_sum_by_type[ctype] / weight_by_type[ctype] else: avg_by_type[ctype] = None @@ -561,6 +637,7 @@ def fit( timing_groups=timing_groups, n_obs=len(df), decomposition_error=decomp_error, + survey_metadata=survey_metadata, ) self.is_fitted_ = True @@ -572,22 +649,29 @@ def _compute_twfe( outcome: str, unit: str, time: str, - treat_col: str = '__bacon_treated_internal__', + treat_col: str = "__bacon_treated_internal__", + weights: Optional[np.ndarray] = None, ) -> float: """Compute TWFE estimate using within-transformation.""" - # Apply two-way within transformation + # Apply two-way within transformation (weighted if survey weights provided) df_dm = _within_transform_util( - df, [outcome, treat_col], unit, time, suffix="_within" + df, + [outcome, treat_col], + unit, + time, + suffix="_within", + weights=weights, ) # Extract within-transformed values y_within = df_dm[f"{outcome}_within"].values d_within = df_dm[f"{treat_col}_within"].values - # OLS on demeaned data: beta = sum(d * y) / sum(d^2) - d_var = np.sum(d_within ** 2) + # OLS on demeaned data: beta = sum(w * d * y) / sum(w * d^2) + w = weights if weights is not None else np.ones(len(y_within)) + d_var = np.sum(w * d_within**2) if d_var > 0: - beta = np.sum(d_within * y_within) / d_var + beta = np.sum(w * d_within * y_within) / d_var else: beta = 0.0 @@ -602,6 +686,7 @@ def _recompute_exact_weights( time: str, first_treat: str, time_periods: List[Any], + weights: Optional[np.ndarray] = None, ) -> None: """ Recompute weights using exact variance-based formula from Theorem 1. @@ -609,8 +694,16 @@ def _recompute_exact_weights( This modifies comparison weights in-place to use the exact formula from Goodman-Bacon (2021) which accounts for within-group variance of the treatment indicator in each 2x2 comparison window. + + When survey weights are provided, uses weighted unit counts and + within-group variance of the treatment indicator. """ n_total_obs = len(df) + w_arr = weights if weights is not None else np.ones(n_total_obs) + # Store weights as a column for safe label-based subsetting + df = df.copy() + df["_sw"] = w_arr + w_total = np.sum(w_arr) n_total_units = df[unit].nunique() for comp in comparisons: @@ -620,9 +713,9 @@ def _recompute_exact_weights( post_periods = [t for t in time_periods if t >= comp.treated_group] # Get units in each group units_treated = df[df[first_treat] == comp.treated_group][unit].unique() - units_control = df[ - (df[first_treat] == 0) | (df[first_treat] == np.inf) - ][unit].unique() + units_control = df[(df[first_treat] == 0) | (df[first_treat] == np.inf)][ + unit + ].unique() elif comp.comparison_type == "earlier_vs_later": g_early = comp.treated_group g_late = comp.control_group @@ -646,10 +739,7 @@ def _recompute_exact_weights( relevant_periods = set(pre_periods) | set(post_periods) all_units = set(units_treated) | set(units_control) - df_22 = df[ - (df[unit].isin(all_units)) & - (df[time].isin(relevant_periods)) - ] + df_22 = df[(df[unit].isin(all_units)) & (df[time].isin(relevant_periods))] if len(df_22) == 0: comp.weight = 0.0 @@ -663,14 +753,17 @@ def _recompute_exact_weights( comp.weight = 0.0 continue - # Number of observations in this 2x2 sample - n_22 = len(df_22) + # Weighted observation counts for the 2x2 sample + w_22 = df_22["_sw"].values + w_22_sum = np.sum(w_22) - # Sample share of this comparison - sample_share = n_22 / n_total_obs + # Sample share of this comparison (weighted) + sample_share = w_22_sum / w_total - # Group shares within the 2x2 - n_k_share = n_k / (n_k + n_l) + # Weighted group shares within the 2x2 + treated_mask_22 = df_22[unit].isin(units_treated) + w_k = np.sum(w_22[treated_mask_22.values]) + n_k_share = w_k / w_22_sum if w_22_sum > 0 else 0.0 # Create treatment indicator for the 2x2 T_pre = len(pre_periods) @@ -682,13 +775,30 @@ def _recompute_exact_weights( # D = 0 for all periods for control units in this window D_k = T_post / T_window # proportion treated for treated group - # Within-comparison variance of treatment + # Within-comparison variance of treatment (weighted) # Var(D) = n_k/(n_k+n_l) * D_k * (1-D_k) for the 2x2 var_D_22 = n_k_share * D_k * (1 - D_k) # Exact weight: proportional to sample share * variance - # Scale by (n_k + n_l) / n_total_units to account for subsample - unit_share = (n_k + n_l) / n_total_units + # Scale by weighted unit share to account for subsample + # Use survey-weighted unit mass when weights present + if weights is not None: + # Sum of per-unit weights for treated + control units in this 2x2 + unit_w_k = ( + df_22.loc[treated_mask_22, "_sw"] + .groupby(df_22.loc[treated_mask_22, unit]) + .first() + .sum() + ) + unit_w_l = ( + df_22.loc[~treated_mask_22, "_sw"] + .groupby(df_22.loc[~treated_mask_22, unit]) + .first() + .sum() + ) + unit_share = (unit_w_k + unit_w_l) / w_total + else: + unit_share = (n_k + n_l) / n_total_units comp.weight = sample_share * var_D_22 * unit_share def _compute_treated_vs_never( @@ -700,6 +810,7 @@ def _compute_treated_vs_never( first_treat: str, treated_group: Any, time_periods: List[Any], + weights: Optional[np.ndarray] = None, ) -> Optional[Comparison2x2]: """ Compute 2x2 DiD comparing treated group to never-treated. @@ -728,24 +839,41 @@ def _compute_treated_vs_never( if not pre_periods or not post_periods: return None - # Compute 2x2 DiD estimate - # Mean change for treated - treated_pre = df_treated[df_treated[time].isin(pre_periods)][outcome].mean() - treated_post = df_treated[df_treated[time].isin(post_periods)][outcome].mean() + # Compute 2x2 DiD estimate using weighted means if survey weights provided + w = weights if weights is not None else np.ones(len(df)) + y = df[outcome].values + + treated_pre_mask = treated_mask & df[time].isin(pre_periods) + treated_post_mask = treated_mask & df[time].isin(post_periods) + never_pre_mask = never_mask & df[time].isin(pre_periods) + never_post_mask = never_mask & df[time].isin(post_periods) + + # Guard against empty cells (unbalanced/filtered panels) + if not ( + np.any(treated_pre_mask) + and np.any(treated_post_mask) + and np.any(never_pre_mask) + and np.any(never_post_mask) + ): + return None - # Mean change for never-treated - never_pre = df_never[df_never[time].isin(pre_periods)][outcome].mean() - never_post = df_never[df_never[time].isin(post_periods)][outcome].mean() + treated_pre = np.average(y[treated_pre_mask], weights=w[treated_pre_mask]) + treated_post = np.average(y[treated_post_mask], weights=w[treated_post_mask]) + never_pre = np.average(y[never_pre_mask], weights=w[never_pre_mask]) + never_post = np.average(y[never_post_mask], weights=w[never_post_mask]) estimate = (treated_post - treated_pre) - (never_post - never_pre) - # Calculate weight components + # Calculate weight components using weighted group shares n_treated = df_treated[unit].nunique() n_never = df_never[unit].nunique() - n_total = n_treated + n_never - # Group share - n_k = n_treated / n_total + w_treated_sum = np.sum(w[treated_mask]) + w_never_sum = np.sum(w[never_mask]) + w_total = w_treated_sum + w_never_sum + + # Weighted group share + n_k = w_treated_sum / w_total if w_total > 0 else 0.0 # Variance of treatment: proportion of post-treatment periods D_k = len(post_periods) / len(time_periods) @@ -776,6 +904,7 @@ def _compute_timing_comparison( control_group: Any, time_periods: List[Any], comparison_type: str, + weights: Optional[np.ndarray] = None, ) -> Optional[Comparison2x2]: """ Compute 2x2 DiD comparing two timing groups. @@ -794,7 +923,6 @@ def _compute_timing_comparison( n_treated = df_treated[unit].nunique() n_control = df_control[unit].nunique() - n_total = n_treated + n_control if comparison_type == "earlier_vs_later": # Earlier treated vs Later treated @@ -829,12 +957,28 @@ def _compute_timing_comparison( time_window = (g_early, max(time_periods)) - # Compute 2x2 DiD estimate - treated_pre = df_treated[df_treated[time].isin(pre_periods)][outcome].mean() - treated_post = df_treated[df_treated[time].isin(post_periods)][outcome].mean() + # Compute 2x2 DiD estimate using weighted means if survey weights provided + w = weights if weights is not None else np.ones(len(df)) + y = df[outcome].values + + treated_pre_mask = treated_mask & df[time].isin(pre_periods) + treated_post_mask = treated_mask & df[time].isin(post_periods) + control_pre_mask = control_mask & df[time].isin(pre_periods) + control_post_mask = control_mask & df[time].isin(post_periods) + + # Skip if any cell is empty + if ( + treated_pre_mask.sum() == 0 + or treated_post_mask.sum() == 0 + or control_pre_mask.sum() == 0 + or control_post_mask.sum() == 0 + ): + return None - control_pre = df_control[df_control[time].isin(pre_periods)][outcome].mean() - control_post = df_control[df_control[time].isin(post_periods)][outcome].mean() + treated_pre = np.average(y[treated_pre_mask], weights=w[treated_pre_mask]) + treated_post = np.average(y[treated_post_mask], weights=w[treated_post_mask]) + control_pre = np.average(y[control_pre_mask], weights=w[control_pre_mask]) + control_post = np.average(y[control_post_mask], weights=w[control_post_mask]) if np.isnan(treated_pre) or np.isnan(treated_post): return None @@ -843,8 +987,11 @@ def _compute_timing_comparison( estimate = (treated_post - treated_pre) - (control_post - control_pre) - # Calculate weight - n_k = n_treated / n_total + # Calculate weight using weighted group shares + w_treated_sum = np.sum(w[treated_mask]) + w_control_sum = np.sum(w[control_mask]) + w_total = w_treated_sum + w_control_sum + n_k = w_treated_sum / w_total if w_total > 0 else 0.0 # Variance of treatment within the comparison window total_periods_in_window = len(pre_periods) + len(post_periods) @@ -875,8 +1022,7 @@ def set_params(self, **params) -> "BaconDecomposition": if "weights" in params: if params["weights"] not in ("approximate", "exact"): raise ValueError( - f"weights must be 'approximate' or 'exact', " - f"got '{params['weights']}'" + f"weights must be 'approximate' or 'exact', " f"got '{params['weights']}'" ) self.weights = params["weights"] return self @@ -900,6 +1046,7 @@ def bacon_decompose( time: str, first_treat: str, weights: str = "approximate", + survey_design: object = None, ) -> BaconDecompositionResults: """ Convenience function for Goodman-Bacon decomposition. @@ -976,4 +1123,4 @@ def bacon_decompose( CallawaySantAnna : Robust estimator that avoids forbidden comparisons """ decomp = BaconDecomposition(weights=weights) - return decomp.fit(data, outcome, unit, time, first_treat) + return decomp.fit(data, outcome, unit, time, first_treat, survey_design=survey_design) diff --git a/diff_diff/continuous_did.py b/diff_diff/continuous_did.py index a8134377..f4c242e8 100644 --- a/diff_diff/continuous_did.py +++ b/diff_diff/continuous_did.py @@ -30,6 +30,12 @@ DoseResponseCurve, ) from diff_diff.linalg import solve_ols +from diff_diff.survey import ( + ResolvedSurveyDesign, + _resolve_survey_for_fit, + _validate_unit_constant_survey, + compute_survey_vcov, +) from diff_diff.utils import safe_inference __all__ = ["ContinuousDiD", "ContinuousDiDResults", "DoseResponseCurve"] @@ -159,6 +165,7 @@ def fit( first_treat: str, dose: str, aggregate: Optional[str] = None, + survey_design: object = None, ) -> ContinuousDiDResults: """ Fit the continuous DiD estimator. @@ -180,6 +187,10 @@ def fit( aggregate : str, optional ``"dose"`` for dose-response aggregation, ``"eventstudy"`` for binarized event study. + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. + Supports weighted estimation and Taylor series linearization + variance with strata, PSU, and FPC. Returns ------- @@ -189,8 +200,23 @@ def fit( _VALID_AGGREGATES = (None, "dose", "eventstudy") if aggregate not in _VALID_AGGREGATES: raise ValueError( - f"Invalid aggregate: '{aggregate}'. " - f"Must be one of {_VALID_AGGREGATES}." + f"Invalid aggregate: '{aggregate}'. " f"Must be one of {_VALID_AGGREGATES}." + ) + + # Resolve survey design if provided + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Validate within-unit constancy for panel survey designs + if resolved_survey is not None: + _validate_unit_constant_survey(data, unit, survey_design) + + # Guard: bootstrap + survey not yet supported + if self.n_bootstrap > 0 and resolved_survey is not None: + raise NotImplementedError( + "Multiplier bootstrap with survey weights is planned for Phase 5. " + "Use n_bootstrap=0 with survey_design for design-based standard errors." ) df = data.copy() @@ -211,9 +237,7 @@ def fit( # Drop units with positive first_treat but zero dose (R convention) unit_info = df.groupby(unit).first()[[first_treat, dose]] - drop_units = unit_info[ - (unit_info[first_treat] > 0) & (unit_info[dose] == 0) - ].index + drop_units = unit_info[(unit_info[first_treat] > 0) & (unit_info[dose] == 0)].index if len(drop_units) > 0: warnings.warn( f"Dropping {len(drop_units)} units with positive first_treat but zero dose.", @@ -285,9 +309,23 @@ def fit( "Add never-treated units or use a dataset with D=0 observations." ) + # Re-resolve survey design on filtered df if rows were dropped + # (survey arrays must align with df, not the original data) + if resolved_survey is not None and len(df) < len(data): + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, df, "analytical") + ) + # 2. Precompute structures precomp = self._precompute_structures( - df, outcome, unit, time, first_treat, dose, time_periods + df, + outcome, + unit, + time, + first_treat, + dose, + time_periods, + survey_weights=survey_weights, ) # Compute dvals (evaluation grid) @@ -309,7 +347,14 @@ def fit( for g in treatment_groups: for t in time_periods: result = self._compute_dose_response_gt( - precomp, g, t, knots, degree, dvals + precomp, + g, + t, + knots, + degree, + dvals, + survey_weights=precomp.get("unit_survey_weights"), + resolved_survey=resolved_survey, ) if result is not None: gt_results[(g, t)] = result @@ -319,11 +364,7 @@ def fit( raise ValueError("No valid (g,t) cells computed.") # 4. Aggregate - post_gt = { - (g, t): r - for (g, t), r in gt_results.items() - if t >= g - self.anticipation - } + post_gt = {(g, t): r for (g, t), r in gt_results.items() if t >= g - self.anticipation} # Dose-response aggregation n_grid = len(dvals) @@ -349,7 +390,14 @@ def fit( # Event study aggregation (binarized) — runs on ALL (g,t) cells event_study_effects = None if aggregate == "eventstudy": - event_study_effects = self._aggregate_event_study(gt_results) + event_study_effects = self._aggregate_event_study( + gt_results, + gt_bootstrap_info=gt_bootstrap_info, + unit_survey_weights=precomp.get("unit_survey_weights"), + unit_cohorts=precomp["unit_cohorts"], + ) + + _survey_df = None # Set by analytical branch when survey is active if len(post_gt) == 0: warnings.warn( @@ -366,12 +414,19 @@ def fit( else: # Compute cell weights: group-proportional (matching R's contdid convention). # Each group g gets weight proportional to its number of treated units. + # When survey weights present, use sum(w_g) / sum(w) instead of n_g / N. # Within each group, weight is divided equally among post-treatment cells. group_n_treated = {} group_n_post_cells = {} + unit_sw = precomp.get("unit_survey_weights") for (g, t), r in post_gt.items(): if g not in group_n_treated: - group_n_treated[g] = float(r["n_treated"]) + if unit_sw is not None: + # Survey-weighted group size: sum of weights for treated units in g + g_mask = precomp["unit_cohorts"] == g + group_n_treated[g] = float(np.sum(unit_sw[g_mask])) + else: + group_n_treated[g] = float(r["n_treated"]) group_n_post_cells[g] = 0 group_n_post_cells[g] += 1 @@ -397,9 +452,18 @@ def fit( # 5. Bootstrap / Analytical SE if self.n_bootstrap > 0: boot_result = self._run_bootstrap( - precomp, gt_results, gt_bootstrap_info, post_gt, cell_weights, - knots, degree, dvals, overall_att, overall_acrt, - agg_att_d, agg_acrt_d, + precomp, + gt_results, + gt_bootstrap_info, + post_gt, + cell_weights, + knots, + degree, + dvals, + overall_att, + overall_acrt, + agg_att_d, + agg_acrt_d, event_study_effects, ) att_d_se = boot_result["att_d_se"] @@ -411,54 +475,71 @@ def fit( att_d_p = boot_result["att_d_p"] acrt_d_p = boot_result["acrt_d_p"] overall_att_se = boot_result["overall_att_se"] - overall_att_t = safe_inference( - overall_att, overall_att_se, self.alpha - )[0] + overall_att_t = safe_inference(overall_att, overall_att_se, self.alpha)[0] overall_att_p = boot_result["overall_att_p"] overall_att_ci = boot_result["overall_att_ci"] overall_acrt_se = boot_result["overall_acrt_se"] - overall_acrt_t = safe_inference( - overall_acrt, overall_acrt_se, self.alpha - )[0] + overall_acrt_t = safe_inference(overall_acrt, overall_acrt_se, self.alpha)[0] overall_acrt_p = boot_result["overall_acrt_p"] overall_acrt_ci = boot_result["overall_acrt_ci"] if event_study_effects is not None: for e, info in event_study_effects.items(): if e in boot_result.get("es_se", {}): info["se"] = boot_result["es_se"][e] - info["t_stat"] = safe_inference( - info["effect"], info["se"], self.alpha - )[0] + info["t_stat"] = safe_inference(info["effect"], info["se"], self.alpha)[ + 0 + ] info["p_value"] = boot_result["es_p"][e] info["conf_int"] = boot_result["es_ci"][e] else: # Analytical SEs via influence functions analytic = self._compute_analytical_se( - precomp, gt_results, gt_bootstrap_info, post_gt, cell_weights, - knots, degree, dvals, agg_att_d, agg_acrt_d, + precomp, + gt_results, + gt_bootstrap_info, + post_gt, + cell_weights, + knots, + degree, + dvals, + agg_att_d, + agg_acrt_d, + resolved_survey=resolved_survey, ) att_d_se = analytic["att_d_se"] acrt_d_se = analytic["acrt_d_se"] overall_att_se = analytic["overall_att_se"] overall_acrt_se = analytic["overall_acrt_se"] + # Survey df for t-distribution inference (unit-level, not panel-level) + _survey_df = analytic.get("df_survey") + + # Recompute survey_metadata from unit-level design so reported + # effective_n/n_psu/df_survey match the inference actually run + _unit_resolved = analytic.get("unit_resolved") + if _unit_resolved is not None: + from diff_diff.survey import compute_survey_metadata + + raw_w_unit = _unit_resolved.weights + survey_metadata = compute_survey_metadata(_unit_resolved, raw_w_unit) + overall_att_t, overall_att_p, overall_att_ci = safe_inference( - overall_att, overall_att_se, self.alpha + overall_att, overall_att_se, self.alpha, df=_survey_df ) overall_acrt_t, overall_acrt_p, overall_acrt_ci = safe_inference( - overall_acrt, overall_acrt_se, self.alpha + overall_acrt, overall_acrt_se, self.alpha, df=_survey_df ) # Per-grid-point inference for dose-response for idx in range(n_grid): _, _, ci = safe_inference( - agg_att_d[idx], att_d_se[idx], self.alpha + agg_att_d[idx], att_d_se[idx], self.alpha, df=_survey_df ) att_d_ci_lower[idx] = ci[0] att_d_ci_upper[idx] = ci[1] _, _, ci = safe_inference( - agg_acrt_d[idx], acrt_d_se[idx], self.alpha + agg_acrt_d[idx], acrt_d_se[idx], self.alpha, df=_survey_df ) acrt_d_ci_lower[idx] = ci[0] acrt_d_ci_upper[idx] = ci[1] @@ -466,16 +547,62 @@ def fit( # Event study analytical SEs if event_study_effects is not None: n_units = precomp["n_units"] + unit_sw = precomp.get("unit_survey_weights") + + # Build unit-level ResolvedSurveyDesign once (reused per bin) + unit_resolved_es = None + if resolved_survey is not None: + row_idx = precomp["unit_first_panel_row"] + uw = ( + precomp.get("unit_survey_weights") + if precomp.get("unit_survey_weights") is not None + else np.ones(n_units) + ) + us = ( + resolved_survey.strata[row_idx] + if resolved_survey.strata is not None + else None + ) + up = ( + resolved_survey.psu[row_idx] + if resolved_survey.psu is not None + else None + ) + uf = ( + resolved_survey.fpc[row_idx] + if resolved_survey.fpc is not None + else None + ) + n_strata_u = len(np.unique(us)) if us is not None else 0 + n_psu_u = len(np.unique(up)) if up is not None else 0 + unit_resolved_es = ResolvedSurveyDesign( + weights=uw, + weight_type=resolved_survey.weight_type, + strata=us, + psu=up, + fpc=uf, + n_strata=n_strata_u, + n_psu=n_psu_u, + lonely_psu=resolved_survey.lonely_psu, + ) + for e_val, info_e in event_study_effects.items(): # Collect (g,t) cells for this event-time bin e_gts = [gt for gt in gt_results if gt[1] - gt[0] == e_val] if not e_gts: continue - # n_treated-proportional weights within this bin - ns = np.array( - [gt_results[gt]["n_treated"] for gt in e_gts], - dtype=float, - ) + # Weights within this bin: survey-weighted mass or n_treated + if unit_sw is not None: + unit_cohorts = precomp["unit_cohorts"] + ns = np.array( + [float(np.sum(unit_sw[unit_cohorts == gt[0]])) for gt in e_gts], + dtype=float, + ) + else: + ns = np.array( + [gt_results[gt]["n_treated"] for gt in e_gts], + dtype=float, + ) total_n = ns.sum() if total_n == 0: continue @@ -492,6 +619,10 @@ def fit( control_idx = b_info["control_indices"] n_t = b_info["n_treated"] n_c = b_info["n_control"] + # Use survey-weighted masses when available + if "w_treated" in b_info: + n_t = b_info["w_treated"] + n_c = b_info["w_control"] n_total_gt = n_t + n_c p_1 = n_t / n_total_gt p_0 = n_c / n_total_gt @@ -502,19 +633,24 @@ def fit( for k, uid in enumerate(treated_idx): if_es[uid] += ( - w - * (delta_y_treated[k] - att_glob_gt - mu_0) - / p_1 - / n_total_gt + w * (delta_y_treated[k] - att_glob_gt - mu_0) / p_1 / n_total_gt ) for k, uid in enumerate(control_idx): - if_es[uid] -= ( - w * ee_control[k] / p_0 / n_total_gt - ) + if_es[uid] -= w * ee_control[k] / p_0 / n_total_gt + + # Compute SE: survey-aware TSL or standard sqrt(sum(IF^2)) + if unit_resolved_es is not None: + X_ones_es = np.ones((n_units, 1)) + # Rescale IFs by total survey mass (not n_units) for fweight support + tsl_scale_es = float(unit_resolved_es.weights.sum()) + if_es_tsl = if_es * tsl_scale_es + vcov_es = compute_survey_vcov(X_ones_es, if_es_tsl, unit_resolved_es) + es_se = float(np.sqrt(np.abs(vcov_es[0, 0]))) + else: + es_se = float(np.sqrt(np.sum(if_es**2))) - es_se = float(np.sqrt(np.sum(if_es**2))) t_stat, p_val, ci_es = safe_inference( - info_e["effect"], es_se, self.alpha + info_e["effect"], es_se, self.alpha, df=_survey_df ) info_e["se"] = es_se info_e["t_stat"] = t_stat @@ -531,6 +667,7 @@ def fit( target="att", p_value=att_d_p, n_bootstrap=self.n_bootstrap, + df_survey=_survey_df, ) dose_response_acrt = DoseResponseCurve( dose_grid=dvals, @@ -541,14 +678,13 @@ def fit( target="acrt", p_value=acrt_d_p, n_bootstrap=self.n_bootstrap, + df_survey=_survey_df, ) # Strip bootstrap internals from gt_results clean_gt = {} for gt, r in gt_results.items(): - clean_gt[gt] = { - k: v for k, v in r.items() if not k.startswith("_") - } + clean_gt[gt] = {k: v for k, v in r.items() if not k.startswith("_")} return ContinuousDiDResults( dose_response_att=dose_response_att, @@ -581,6 +717,7 @@ def fit( seed=self.seed, rank_deficient_action=self.rank_deficient_action, event_study_effects=event_study_effects, + survey_metadata=survey_metadata, ) # ------------------------------------------------------------------ @@ -596,6 +733,7 @@ def _precompute_structures( first_treat: str, dose: str, time_periods: List[Any], + survey_weights: Optional[np.ndarray] = None, ) -> Dict[str, Any]: """Pivot to wide format and build lookup structures.""" all_units = sorted(df[unit].unique()) @@ -620,6 +758,21 @@ def _precompute_structures( unit_cohorts[i] = unit_first.loc[u, first_treat] dose_vector[i] = unit_first.loc[u, dose] + # Build unit-to-first-panel-row mapping (for subsetting panel-level arrays) + # This maps each unit index to the positional index of its first row in df. + unit_first_panel_row = np.zeros(n_units, dtype=int) + seen_units: set = set() + for pos_idx, (_, row) in enumerate(df.iterrows()): + u = row[unit] + if u not in seen_units: + seen_units.add(u) + unit_first_panel_row[unit_to_idx[u]] = pos_idx + + # Per-unit survey weights (take first obs per unit from panel data) + unit_survey_weights = None + if survey_weights is not None: + unit_survey_weights = survey_weights[unit_first_panel_row] + # Cohort masks cohort_masks = {} unique_cohorts = np.unique(unit_cohorts) @@ -639,6 +792,8 @@ def _precompute_structures( "never_treated_mask": never_treated_mask, "time_periods": time_periods, "n_units": n_units, + "unit_survey_weights": unit_survey_weights, + "unit_first_panel_row": unit_first_panel_row, } def _compute_dose_response_gt( @@ -649,6 +804,8 @@ def _compute_dose_response_gt( knots: np.ndarray, degree: int, dvals: np.ndarray, + survey_weights: Optional[np.ndarray] = None, + resolved_survey: object = None, ) -> Optional[Dict[str, Any]]: """Compute dose-response for a single (g,t) cell.""" period_to_col = precomp["period_to_col"] @@ -703,11 +860,25 @@ def _compute_dose_response_gt( return None # Outcome changes - delta_y_treated = outcome_matrix[treated_mask, col_t] - outcome_matrix[treated_mask, col_base] - delta_y_control = outcome_matrix[control_mask, col_t] - outcome_matrix[control_mask, col_base] + delta_y_treated = ( + outcome_matrix[treated_mask, col_t] - outcome_matrix[treated_mask, col_base] + ) + delta_y_control = ( + outcome_matrix[control_mask, col_t] - outcome_matrix[control_mask, col_base] + ) - # Control counterfactual - mu_0 = float(np.mean(delta_y_control)) + # Subset survey weights to the cell + w_treated = None + w_control = None + if survey_weights is not None: + w_treated = survey_weights[treated_mask] + w_control = survey_weights[control_mask] + + # Control counterfactual (weighted mean when survey weights present) + if w_control is not None: + mu_0 = float(np.average(delta_y_control, weights=w_control)) + else: + mu_0 = float(np.mean(delta_y_control)) # Demean delta_tilde_y = delta_y_treated - mu_0 @@ -722,8 +893,7 @@ def _compute_dose_response_gt( # Check for all-same dose if np.all(treated_doses == treated_doses[0]): warnings.warn( - f"All treated doses identical in (g={g}, t={t}). " - "ACRT(d) will be 0 everywhere.", + f"All treated doses identical in (g={g}, t={t}). " "ACRT(d) will be 0 everywhere.", UserWarning, stacklevel=3, ) @@ -738,12 +908,28 @@ def _compute_dose_response_gt( ) return None - # OLS regression - beta_hat, residuals, _ = solve_ols( - Psi, delta_tilde_y, - return_vcov=False, - rank_deficient_action=self.rank_deficient_action, - ) + # OLS or WLS regression + if w_treated is not None: + # WLS: apply sqrt(w) transformation + sqrt_w = np.sqrt(w_treated) + Psi_w = Psi * sqrt_w[:, np.newaxis] + delta_tilde_y_w = delta_tilde_y * sqrt_w + beta_hat, _, _ = solve_ols( + Psi_w, + delta_tilde_y_w, + return_vcov=False, + rank_deficient_action=self.rank_deficient_action, + ) + # Residuals on original scale (for influence functions) + beta_pred_tmp = np.where(np.isnan(beta_hat), 0.0, beta_hat) + residuals = delta_tilde_y - Psi @ beta_pred_tmp + else: + beta_hat, residuals, _ = solve_ols( + Psi, + delta_tilde_y, + return_vcov=False, + rank_deficient_action=self.rank_deficient_action, + ) # For prediction: zero out NaN (dropped rank-deficient columns). # solve_ols sets dropped-column coefficients to NaN (R convention); @@ -759,21 +945,37 @@ def _compute_dose_response_gt( acrt_d = dPsi_eval @ beta_pred # Summary parameters - att_glob = float(np.mean(delta_y_treated) - mu_0) + if w_treated is not None: + att_glob = float(np.average(delta_y_treated, weights=w_treated) - mu_0) + else: + att_glob = float(np.mean(delta_y_treated) - mu_0) # ACRT^{glob}: plug-in average of ACRT(D_i) for treated dPsi_treated = bspline_derivative_design_matrix( treated_doses, knots, degree, include_intercept=True ) - acrt_glob = float(np.mean(dPsi_treated @ beta_pred)) + if w_treated is not None: + acrt_glob = float(np.average(dPsi_treated @ beta_pred, weights=w_treated)) + else: + acrt_glob = float(np.mean(dPsi_treated @ beta_pred)) # Store bootstrap info for influence function computation - # bread = (Psi'Psi / n_treated)^{-1} - PtP = Psi.T @ Psi - try: - bread = np.linalg.inv(PtP / n_treated) - except np.linalg.LinAlgError: - bread = np.linalg.pinv(PtP / n_treated) + # bread = (Psi'WPsi / n_treated)^{-1} when survey, (Psi'Psi / n_treated)^{-1} otherwise + if w_treated is not None: + w_treated_sum = float(np.sum(w_treated)) + PtWP = Psi.T @ (Psi * w_treated[:, np.newaxis]) + # Normalize bread by weighted mass (not raw count) for consistency + # with downstream IF score denominators that also use weighted mass + try: + bread = np.linalg.inv(PtWP / w_treated_sum) + except np.linalg.LinAlgError: + bread = np.linalg.pinv(PtWP / w_treated_sum) + else: + PtP = Psi.T @ Psi + try: + bread = np.linalg.inv(PtP / n_treated) + except np.linalg.LinAlgError: + bread = np.linalg.pinv(PtP / n_treated) # ee_treated: per-unit estimating equation vectors (K-vector per unit) ee_treated = Psi * residuals[:, np.newaxis] # (n_treated, K) @@ -781,18 +983,28 @@ def _compute_dose_response_gt( # ee_control: per-unit deviation from control mean ee_control = delta_y_control - mu_0 # (n_control,) - # psi_bar: mean basis vector for treated - psi_bar = np.mean(Psi, axis=0) # (K,) + # psi_bar: mean basis vector for treated (weighted when survey) + if w_treated is not None: + psi_bar = np.average(Psi, axis=0, weights=w_treated) + else: + psi_bar = np.mean(Psi, axis=0) # (K,) # Unit indices for bootstrap treated_indices = np.where(treated_mask)[0] control_indices = np.where(control_mask)[0] + # dpsi_bar: mean derivative basis vector (weighted when survey) + if w_treated is not None: + dpsi_bar = np.average(dPsi_treated, axis=0, weights=w_treated) + else: + dpsi_bar = np.mean(dPsi_treated, axis=0) + bootstrap_info = { "bread": bread, "ee_treated": ee_treated, "ee_control": ee_control, "psi_bar": psi_bar, + "dpsi_bar": dpsi_bar, "beta_hat": beta_hat, "beta_pred": beta_pred, "treated_indices": treated_indices, @@ -809,6 +1021,11 @@ def _compute_dose_response_gt( "acrt_glob": acrt_glob, } + # Store survey-weighted masses for IF linearization + if w_treated is not None: + bootstrap_info["w_treated"] = float(np.sum(w_treated)) + bootstrap_info["w_control"] = float(np.sum(w_control)) + return { "att_d": att_d, "acrt_d": acrt_d, @@ -823,15 +1040,25 @@ def _compute_dose_response_gt( def _aggregate_event_study( self, gt_results: Dict[Tuple, Dict], + gt_bootstrap_info: Dict[Tuple, Dict] = None, + unit_survey_weights: Optional[np.ndarray] = None, + unit_cohorts: Optional[np.ndarray] = None, ) -> Dict[int, Dict[str, Any]]: """Aggregate binarized ATT_glob by relative period.""" - effects_by_e: Dict[int, List[Tuple[float, float]]] = {} + effects_by_e: Dict[int, List[Tuple[float, float, Tuple]]] = {} for (g, t), r in gt_results.items(): e = t - g if e not in effects_by_e: effects_by_e[e] = [] - effects_by_e[e].append((r["att_glob"], float(r["n_treated"]))) + # Compute weight for this (g,t) cell + if unit_survey_weights is not None and unit_cohorts is not None: + # Survey-weighted: sum of survey weights for treated units in group g + g_mask = unit_cohorts == g + cell_weight = float(np.sum(unit_survey_weights[g_mask])) + else: + cell_weight = float(r["n_treated"]) + effects_by_e[e].append((r["att_glob"], cell_weight, (g, t))) result = {} for e, entries in sorted(effects_by_e.items()): @@ -863,6 +1090,7 @@ def _compute_analytical_se( dvals: np.ndarray, agg_att_d: np.ndarray, agg_acrt_d: np.ndarray, + resolved_survey: object = None, ) -> Dict[str, Any]: """Compute analytical SEs using influence functions.""" n_units = precomp["n_units"] @@ -885,13 +1113,17 @@ def _compute_analytical_se( control_idx = info["control_indices"] n_t = info["n_treated"] n_c = info["n_control"] + # Use survey-weighted masses when available + if "w_treated" in info: + n_t = info["w_treated"] + n_c = info["w_control"] bread = info["bread"] ee_treated = info["ee_treated"] ee_control = info["ee_control"] psi_bar = info["psi_bar"] + dpsi_bar = info["dpsi_bar"] Psi_eval = info["Psi_eval"] dPsi_eval = info["dPsi_eval"] - dPsi_treated = info["dPsi_treated"] att_glob_gt = info["att_glob"] mu_0 = info["mu_0"] delta_y_treated = info["delta_y_treated"] @@ -924,7 +1156,6 @@ def _compute_analytical_se( if_acrt_d[idx] += w * (dPsi_eval @ beta_pert) # ACRT_glob IF: (1/n_t) sum_j dpsi(D_j)' @ beta_pert - dpsi_bar = np.mean(dPsi_treated, axis=0) for k, idx in enumerate(treated_idx): beta_pert = bread @ ee_treated[k] / n_t if_acrt_glob[idx] += w * float(dpsi_bar @ beta_pert) @@ -932,19 +1163,91 @@ def _compute_analytical_se( beta_pert = -bread @ psi_bar * ee_control[k] / n_c if_acrt_glob[idx] += w * float(dpsi_bar @ beta_pert) - # SE = sqrt(sum(IF_i^2)), matching CallawaySantAnna's convention - # (per-unit IFs already contain 1/n_t, 1/n_c scaling) - overall_att_se = float(np.sqrt(np.sum(if_att_glob**2))) - overall_acrt_se = float(np.sqrt(np.sum(if_acrt_glob**2))) + # Compute SEs from influence functions + if resolved_survey is not None: + # Survey design: use TSL variance on the aggregate influence functions. + # The IFs serve as "residuals" in the TSL sandwich; X is a column of ones + # (the estimand is a scalar/vector mean of the IFs). + # + # The resolved_survey has panel-level arrays (n_obs = n_units * n_periods), + # but influence functions are unit-level (n_units). Build a unit-level + # ResolvedSurveyDesign by subsetting to one obs per unit. + row_idx = precomp["unit_first_panel_row"] + unit_weights = precomp.get("unit_survey_weights") + if unit_weights is None: + unit_weights = np.ones(n_units) + + unit_strata = ( + resolved_survey.strata[row_idx] if resolved_survey.strata is not None else None + ) + unit_psu = resolved_survey.psu[row_idx] if resolved_survey.psu is not None else None + unit_fpc = resolved_survey.fpc[row_idx] if resolved_survey.fpc is not None else None + + # Count unique strata/PSU in the unit-level subset + n_strata_unit = len(np.unique(unit_strata)) if unit_strata is not None else 0 + n_psu_unit = len(np.unique(unit_psu)) if unit_psu is not None else 0 + + unit_resolved = ResolvedSurveyDesign( + weights=unit_weights, + weight_type=resolved_survey.weight_type, + strata=unit_strata, + psu=unit_psu, + fpc=unit_fpc, + n_strata=n_strata_unit, + n_psu=n_psu_unit, + lonely_psu=resolved_survey.lonely_psu, + ) - att_d_se = np.sqrt(np.sum(if_att_d**2, axis=0)) - acrt_d_se = np.sqrt(np.sum(if_acrt_d**2, axis=0)) + X_ones = np.ones((n_units, 1)) + + # Rescale IFs from 1/n convention to score scale for TSL sandwich. + # The per-unit IFs contain internal 1/n_t, 1/n_c scaling (for the + # unweighted SE = sqrt(sum(IF^2)) convention). compute_survey_vcov + # applies its own (X'WX)^{-1} bread, which would double-count. + # Rescale by the unit-level total survey mass (= n_units for + # pweight/aweight, but can differ for fweight). + tsl_scale = float(unit_resolved.weights.sum()) + if_att_glob_tsl = if_att_glob * tsl_scale + if_acrt_glob_tsl = if_acrt_glob * tsl_scale + if_att_d_tsl = if_att_d * tsl_scale + if_acrt_d_tsl = if_acrt_d * tsl_scale + + # Overall ATT SE via compute_survey_vcov + vcov_att = compute_survey_vcov(X_ones, if_att_glob_tsl, unit_resolved) + overall_att_se = float(np.sqrt(np.abs(vcov_att[0, 0]))) + + # Overall ACRT SE via compute_survey_vcov + vcov_acrt = compute_survey_vcov(X_ones, if_acrt_glob_tsl, unit_resolved) + overall_acrt_se = float(np.sqrt(np.abs(vcov_acrt[0, 0]))) + + # Per-grid-point SEs for dose-response curves + att_d_se = np.zeros(n_grid) + acrt_d_se = np.zeros(n_grid) + for d_idx in range(n_grid): + vcov_d = compute_survey_vcov(X_ones, if_att_d_tsl[:, d_idx], unit_resolved) + att_d_se[d_idx] = float(np.sqrt(np.abs(vcov_d[0, 0]))) + + vcov_d = compute_survey_vcov(X_ones, if_acrt_d_tsl[:, d_idx], unit_resolved) + acrt_d_se[d_idx] = float(np.sqrt(np.abs(vcov_d[0, 0]))) + else: + # SE = sqrt(sum(IF_i^2)), matching CallawaySantAnna's convention + # (per-unit IFs already contain 1/n_t, 1/n_c scaling) + overall_att_se = float(np.sqrt(np.sum(if_att_glob**2))) + overall_acrt_se = float(np.sqrt(np.sum(if_acrt_glob**2))) + + att_d_se = np.sqrt(np.sum(if_att_d**2, axis=0)) + acrt_d_se = np.sqrt(np.sum(if_acrt_d**2, axis=0)) + + # Return unit-level survey df and resolved design for metadata recomputation + unit_df_survey = unit_resolved.df_survey if resolved_survey is not None else None return { "overall_att_se": overall_att_se, "overall_acrt_se": overall_acrt_se, "att_d_se": att_d_se, "acrt_d_se": acrt_d_se, + "df_survey": unit_df_survey, + "unit_resolved": unit_resolved if resolved_survey is not None else None, } def _run_bootstrap( @@ -994,6 +1297,7 @@ def _run_bootstrap( if event_study_effects is not None: # Build event-time bin weights from n_treated from collections import defaultdict + es_bin_total: Dict[int, float] = defaultdict(float) for gt, r in gt_results.items(): g_val, t_val = gt @@ -1027,7 +1331,7 @@ def _bootstrap_gt_cell(gt, info): w_treated = all_weights[:, treated_idx] w_control = all_weights[:, control_idx] - with np.errstate(divide='ignore', invalid='ignore', over='ignore'): + with np.errstate(divide="ignore", invalid="ignore", over="ignore"): treated_sum = w_treated @ ee_treated / n_t control_sum = (w_control @ ee_control) / n_c psi_bar_outer = psi_bar[np.newaxis, :] @@ -1094,8 +1398,10 @@ def _bootstrap_gt_cell(gt, info): for idx in range(n_grid): se, ci, p = compute_effect_bootstrap_stats( - original_att_d[idx], boot_att_d[:, idx], - alpha=self.alpha, context=f"ATT(d) at grid point {idx}", + original_att_d[idx], + boot_att_d[:, idx], + alpha=self.alpha, + context=f"ATT(d) at grid point {idx}", ) att_d_se[idx] = se att_d_ci_lower[idx] = ci[0] @@ -1103,8 +1409,10 @@ def _bootstrap_gt_cell(gt, info): att_d_p[idx] = p se, ci, p = compute_effect_bootstrap_stats( - original_acrt_d[idx], boot_acrt_d[:, idx], - alpha=self.alpha, context=f"ACRT(d) at grid point {idx}", + original_acrt_d[idx], + boot_acrt_d[:, idx], + alpha=self.alpha, + context=f"ACRT(d) at grid point {idx}", ) acrt_d_se[idx] = se acrt_d_ci_lower[idx] = ci[0] @@ -1122,14 +1430,20 @@ def _bootstrap_gt_cell(gt, info): # Overall se, ci, p = compute_effect_bootstrap_stats( - original_att, boot_att_glob, alpha=self.alpha, context="overall ATT_glob", + original_att, + boot_att_glob, + alpha=self.alpha, + context="overall ATT_glob", ) result["overall_att_se"] = se result["overall_att_ci"] = ci result["overall_att_p"] = p se, ci, p = compute_effect_bootstrap_stats( - original_acrt, boot_acrt_glob, alpha=self.alpha, context="overall ACRT_glob", + original_acrt, + boot_acrt_glob, + alpha=self.alpha, + context="overall ACRT_glob", ) result["overall_acrt_se"] = se result["overall_acrt_ci"] = ci @@ -1142,8 +1456,10 @@ def _bootstrap_gt_cell(gt, info): es_p = {} for e in es_keys: se_e, ci_e, p_e = compute_effect_bootstrap_stats( - event_study_effects[e]["effect"], boot_es[e], - alpha=self.alpha, context=f"event study e={e}", + event_study_effects[e]["effect"], + boot_es[e], + alpha=self.alpha, + context=f"event study e={e}", ) es_se[e] = se_e es_ci[e] = ci_e diff --git a/diff_diff/continuous_did_results.py b/diff_diff/continuous_did_results.py index 3121160a..b9f332f4 100644 --- a/diff_diff/continuous_did_results.py +++ b/diff_diff/continuous_did_results.py @@ -45,6 +45,7 @@ class DoseResponseCurve: target: str p_value: Optional[np.ndarray] = None n_bootstrap: int = 0 + df_survey: Optional[int] = None def to_dataframe(self) -> pd.DataFrame: """Convert to DataFrame with dose, effect, se, CI, t_stat, p_value.""" @@ -60,7 +61,7 @@ def to_dataframe(self) -> pd.DataFrame: t_stat = np.full(n, np.nan) p_value = np.full(n, np.nan) for i in range(n): - t_i, p_i, _ = safe_inference(self.effects[i], self.se[i]) + t_i, p_i, _ = safe_inference(self.effects[i], self.se[i], df=self.df_survey) t_stat[i] = t_i p_value[i] = p_i return pd.DataFrame( @@ -139,6 +140,8 @@ class ContinuousDiDResults: seed: Optional[int] = None rank_deficient_action: str = "warn" event_study_effects: Optional[Dict[int, Dict[str, Any]]] = field(default=None) + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: sig_att = _get_significance_stars(self.overall_att_p_value) @@ -176,15 +179,38 @@ def summary(self, alpha: Optional[float] = None) -> str: "", ] + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "-" * w, + "Survey Design".center(w), + "-" * w, + f"{'Weight type:':<30} {sm.weight_type:>10}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<30} {sm.n_strata:>10}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>10}") + lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>10.1f}") + lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>10.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>10}") + lines.extend(["-" * w, ""]) + # Overall summary parameters - lines.extend([ - "-" * w, - "Overall Summary Parameters".center(w), - "-" * w, - f"{'Parameter':<15} {'Estimate':>12} {'Std. Err.':>12} " - f"{'t-stat':>10} {'P>|t|':>10} {'Sig.':>6}", - "-" * w, - ]) + lines.extend( + [ + "-" * w, + "Overall Summary Parameters".center(w), + "-" * w, + f"{'Parameter':<15} {'Estimate':>12} {'Std. Err.':>12} " + f"{'t-stat':>10} {'P>|t|':>10} {'Sig.':>6}", + "-" * w, + ] + ) for label, est, se, t, p in [ ( "ATT_glob", @@ -204,29 +230,30 @@ def summary(self, alpha: Optional[float] = None) -> str: t_str = f"{t:>10.3f}" if np.isfinite(t) else f"{'NaN':>10}" p_str = f"{p:>10.4f}" if np.isfinite(p) else f"{'NaN':>10}" sig = _get_significance_stars(p) - lines.append( - f"{label:<15} {est:>12.4f} {se:>12.4f} {t_str} {p_str} {sig:>6}" - ) - lines.extend([ - "-" * w, - "", - f"{conf_level}% CI for ATT_glob: " - f"[{self.overall_att_conf_int[0]:.4f}, {self.overall_att_conf_int[1]:.4f}]", - f"{conf_level}% CI for ACRT_glob: " - f"[{self.overall_acrt_conf_int[0]:.4f}, {self.overall_acrt_conf_int[1]:.4f}]", - "", - ]) + lines.append(f"{label:<15} {est:>12.4f} {se:>12.4f} {t_str} {p_str} {sig:>6}") + lines.extend( + [ + "-" * w, + "", + f"{conf_level}% CI for ATT_glob: " + f"[{self.overall_att_conf_int[0]:.4f}, {self.overall_att_conf_int[1]:.4f}]", + f"{conf_level}% CI for ACRT_glob: " + f"[{self.overall_acrt_conf_int[0]:.4f}, {self.overall_acrt_conf_int[1]:.4f}]", + "", + ] + ) # Dose-response curve summary (first/mid/last points) if len(self.dose_grid) > 0: - lines.extend([ - "-" * w, - "Dose-Response Curve (selected points)".center(w), - "-" * w, - f"{'Dose':>10} {'ATT(d)':>12} {'SE':>10} " - f"{'ACRT(d)':>12} {'SE':>10}", - "-" * w, - ]) + lines.extend( + [ + "-" * w, + "Dose-Response Curve (selected points)".center(w), + "-" * w, + f"{'Dose':>10} {'ATT(d)':>12} {'SE':>10} " f"{'ACRT(d)':>12} {'SE':>10}", + "-" * w, + ] + ) n_grid = len(self.dose_grid) indices = sorted(set([0, n_grid // 4, n_grid // 2, 3 * n_grid // 4, n_grid - 1])) for idx in indices: @@ -242,26 +269,22 @@ def summary(self, alpha: Optional[float] = None) -> str: # Event study effects if available if self.event_study_effects: - lines.extend([ - "-" * w, - "Event Study (Dynamic) Effects (Binarized ATT)".center(w), - "-" * w, - f"{'Rel. Period':<15} {'Estimate':>12} {'Std. Err.':>12} " - f"{'t-stat':>10} {'P>|t|':>10} {'Sig.':>6}", - "-" * w, - ]) + lines.extend( + [ + "-" * w, + "Event Study (Dynamic) Effects (Binarized ATT)".center(w), + "-" * w, + f"{'Rel. Period':<15} {'Estimate':>12} {'Std. Err.':>12} " + f"{'t-stat':>10} {'P>|t|':>10} {'Sig.':>6}", + "-" * w, + ] + ) for rel_t in sorted(self.event_study_effects.keys()): eff = self.event_study_effects[rel_t] sig = _get_significance_stars(eff["p_value"]) - t_str = ( - f"{eff['t_stat']:>10.3f}" - if np.isfinite(eff["t_stat"]) - else f"{'NaN':>10}" - ) + t_str = f"{eff['t_stat']:>10.3f}" if np.isfinite(eff["t_stat"]) else f"{'NaN':>10}" p_str = ( - f"{eff['p_value']:>10.4f}" - if np.isfinite(eff["p_value"]) - else f"{'NaN':>10}" + f"{eff['p_value']:>10.4f}" if np.isfinite(eff["p_value"]) else f"{'NaN':>10}" ) lines.append( f"{rel_t:<15} {eff['effect']:>12.4f} {eff['se']:>12.4f} " @@ -269,10 +292,12 @@ def summary(self, alpha: Optional[float] = None) -> str: ) lines.extend(["-" * w, ""]) - lines.extend([ - "Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1", - "=" * w, - ]) + lines.extend( + [ + "Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1", + "=" * w, + ] + ) return "\n".join(lines) def print_summary(self, alpha: Optional[float] = None) -> None: @@ -320,9 +345,7 @@ def to_dataframe(self, level: str = "dose_response") -> pd.DataFrame: return pd.DataFrame(rows) elif level == "event_study": if self.event_study_effects is None: - raise ValueError( - "Event study effects not computed. Use aggregate='eventstudy'." - ) + raise ValueError("Event study effects not computed. Use aggregate='eventstudy'.") rows = [] for rel_t, data in sorted(self.event_study_effects.items()): rows.append( diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py index aa047a0a..db1db036 100644 --- a/diff_diff/efficient_did.py +++ b/diff_diff/efficient_did.py @@ -133,6 +133,7 @@ def __init__( self.kernel_bandwidth = kernel_bandwidth self.is_fitted_ = False self.results_: Optional[EfficientDiDResults] = None + self._unit_resolved_survey = None self._validate_params() def _validate_params(self) -> None: @@ -204,6 +205,7 @@ def fit( covariates: Optional[List[str]] = None, aggregate: Optional[str] = None, balance_e: Optional[int] = None, + survey_design: Optional[Any] = None, ) -> EfficientDiDResults: """Fit the Efficient DiD estimator. @@ -229,6 +231,11 @@ def fit( ``"all"``. balance_e : int, optional Balance event study at this relative period. + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. + Applies survey weights to all means, covariances, and cohort + fractions, and uses Taylor Series Linearization for SE + estimation. Returns ------- @@ -239,9 +246,45 @@ def fit( ValueError Missing columns, unbalanced panel, non-absorbing treatment, or PT-Post without a never-treated group. + NotImplementedError + If ``n_bootstrap > 0`` with ``survey_design``. """ self._validate_params() + # Resolve survey design if provided + from diff_diff.survey import _resolve_survey_for_fit + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Validate within-unit constancy for panel survey designs + if resolved_survey is not None: + from diff_diff.survey import _validate_unit_constant_survey + + _validate_unit_constant_survey(data, unit, survey_design) + + # Store survey df for safe_inference calls (t-distribution with survey df) + self._survey_df = survey_metadata.df_survey if survey_metadata is not None else None + + # Guard bootstrap + survey + if self.n_bootstrap > 0 and resolved_survey is not None: + raise NotImplementedError( + "Multiplier bootstrap with survey weights is not yet supported " + "for EfficientDiD. Use analytical inference (n_bootstrap=0) with " + "survey_design for design-based standard errors." + ) + + # Guard covariates + survey (DR path does not yet thread survey weights) + if covariates is not None and len(covariates) > 0 and resolved_survey is not None: + raise NotImplementedError( + "Survey weights with covariates are not yet supported for " + "EfficientDiD. The doubly robust covariate path does not " + "thread survey weights through nuisance estimation. " + "Use covariates=None with survey_design, or drop survey_design " + "when using covariates." + ) + # Normalize empty covariates list to None (use nocov path) if covariates is not None and len(covariates) == 0: covariates = None @@ -328,6 +371,46 @@ def fit( all_units = sorted(df[unit].unique()) n_units = len(all_units) + # Build unit-to-first-panel-row index aligned to all_units (sorted) + # order. The previous approach (groupby cumcount == 0) yielded + # first-appearance order which can differ from sorted order when the + # input DataFrame is not pre-sorted by unit. + first_pos: Dict[Any, int] = {} + for i, u in enumerate(df[unit].values): + if u not in first_pos: + first_pos[u] = i + self._unit_first_panel_row = np.array([first_pos[u] for u in all_units]) + + # Build unit-level ResolvedSurveyDesign once (avoids repeated + # construction in _compute_survey_eif_se and ensures consistent + # unit-level df for safe_inference t-distribution). + if resolved_survey is not None: + from diff_diff.survey import ResolvedSurveyDesign + + row_idx = self._unit_first_panel_row + unit_weights_s = resolved_survey.weights[row_idx] + unit_strata = ( + resolved_survey.strata[row_idx] if resolved_survey.strata is not None else None + ) + unit_psu = resolved_survey.psu[row_idx] if resolved_survey.psu is not None else None + unit_fpc = resolved_survey.fpc[row_idx] if resolved_survey.fpc is not None else None + n_strata_u = len(np.unique(unit_strata)) if unit_strata is not None else 0 + n_psu_u = len(np.unique(unit_psu)) if unit_psu is not None else 0 + self._unit_resolved_survey = ResolvedSurveyDesign( + weights=unit_weights_s, + weight_type=resolved_survey.weight_type, + strata=unit_strata, + psu=unit_psu, + fpc=unit_fpc, + n_strata=n_strata_u, + n_psu=n_psu_u, + lonely_psu=resolved_survey.lonely_psu, + ) + # Use unit-level df (not panel-level) for t-distribution + self._survey_df = self._unit_resolved_survey.df_survey + else: + self._unit_resolved_survey = None + period_to_col = {p: i for i, p in enumerate(time_periods)} period_1 = time_periods[0] period_1_col = period_to_col[period_1] @@ -350,10 +433,30 @@ def fit( never_treated_mask = unit_cohorts == 0 cohort_masks[np.inf] = never_treated_mask # also keyed by inf sentinel + # ----- Unit-level survey weights ----- + # Survey weights in the panel are at obs level (unit x time). + # EfficientDiD works at unit level. Extract one weight per unit + # by taking the first observation per unit (balanced panel, so + # weights should be constant within unit). + unit_level_weights: Optional[np.ndarray] = None + if resolved_survey is not None: + # Use the resolved survey's weights (already normalized per weight_type) + # subset to unit level via _unit_first_panel_row (aligned to all_units) + unit_level_weights = self._unit_resolved_survey.weights + cohort_fractions: Dict[float, float] = {} - for g in treatment_groups: - cohort_fractions[g] = float(np.sum(cohort_masks[g])) / n_units - cohort_fractions[np.inf] = float(np.sum(never_treated_mask)) / n_units + if unit_level_weights is not None: + # Survey-weighted cohort fractions: sum(w_i for i in cohort) / sum(w_i) + total_w = float(np.sum(unit_level_weights)) + for g in treatment_groups: + cohort_fractions[g] = float(np.sum(unit_level_weights[cohort_masks[g]])) / total_w + cohort_fractions[np.inf] = ( + float(np.sum(unit_level_weights[never_treated_mask])) / total_w + ) + else: + for g in treatment_groups: + cohort_fractions[g] = float(np.sum(cohort_masks[g])) / n_units + cohort_fractions[np.inf] = float(np.sum(never_treated_mask)) / n_units # ----- Covariate preparation (if provided) ----- covariate_matrix: Optional[np.ndarray] = None @@ -582,6 +685,7 @@ def fit( period_to_col=period_to_col, period_1_col=effective_p1_col, cohort_fractions=cohort_fractions, + unit_weights=unit_level_weights, ) weights, _, cond_num = compute_efficient_weights(omega) @@ -598,6 +702,7 @@ def fit( never_treated_mask=never_treated_mask, period_to_col=period_to_col, period_1_col=effective_p1_col, + unit_weights=unit_level_weights, ) att_gt = float(weights @ y_hat) if len(weights) > 0 else np.nan @@ -614,13 +719,20 @@ def fit( period_1_col=effective_p1_col, cohort_fractions=cohort_fractions, n_units=n_units, + unit_weights=unit_level_weights, ) eif_by_gt[(g, t)] = eif_vals # Analytical SE = sqrt(mean(EIF^2) / n) [paper p.21] - se_gt = float(np.sqrt(np.mean(eif_vals**2) / n_units)) + # With survey: use TSL variance via compute_survey_vcov + if self._unit_resolved_survey is not None: + se_gt = self._compute_survey_eif_se(eif_vals) + else: + se_gt = float(np.sqrt(np.mean(eif_vals**2) / n_units)) - t_stat, p_val, ci = safe_inference(att_gt, se_gt, alpha=self.alpha) + t_stat, p_val, ci = safe_inference( + att_gt, se_gt, alpha=self.alpha, df=self._survey_df + ) group_time_effects[(g, t)] = { "effect": att_gt, @@ -642,7 +754,9 @@ def fit( overall_att, overall_se = self._aggregate_overall( group_time_effects, eif_by_gt, n_units, cohort_fractions, unit_cohorts ) - overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha) + overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, alpha=self.alpha, df=self._survey_df + ) event_study_effects = None group_effects = None @@ -761,10 +875,51 @@ def fit( sieve_criterion=self.sieve_criterion, ratio_clip=self.ratio_clip, kernel_bandwidth=self.kernel_bandwidth, + survey_metadata=( + self._recompute_unit_survey_metadata(survey_metadata) + if survey_metadata is not None + else None + ), ) self.is_fitted_ = True return self.results_ + def _recompute_unit_survey_metadata(self, panel_metadata): + """Recompute survey metadata from unit-level design if available.""" + if self._unit_resolved_survey is not None: + from diff_diff.survey import compute_survey_metadata + + return compute_survey_metadata( + self._unit_resolved_survey, + self._unit_resolved_survey.weights, + ) + return panel_metadata + + # -- Survey SE helpers ---------------------------------------------------- + + def _compute_survey_eif_se(self, eif_vals: np.ndarray) -> float: + """Compute SE from EIF scores using Taylor Series Linearization. + + Uses the pre-built unit-level ``_unit_resolved_survey`` constructed + once in ``fit()``, ensuring consistent unit-level arrays and + avoiding repeated subsetting of panel-level survey data. + """ + from diff_diff.survey import compute_survey_vcov + + X_ones = np.ones((len(eif_vals), 1)) + vcov = compute_survey_vcov(X_ones, eif_vals, self._unit_resolved_survey) + return float(np.sqrt(np.abs(vcov[0, 0]))) + + def _eif_se(self, eif_vals: np.ndarray, n_units: int) -> float: + """Compute SE from aggregated EIF scores. + + Dispatches to survey TSL when ``_unit_resolved_survey`` is set + (during fit), otherwise uses the standard analytical formula. + """ + if self._unit_resolved_survey is not None: + return self._compute_survey_eif_se(eif_vals) + return float(np.sqrt(np.mean(eif_vals**2) / n_units)) + # -- Aggregation helpers -------------------------------------------------- def _compute_wif_contribution( @@ -869,7 +1024,8 @@ def _aggregate_overall( agg_eif_total = agg_eif + wif # both O(1) scale # SE = sqrt(mean(EIF^2) / n) — standard IF-based SE - se = float(np.sqrt(np.mean(agg_eif_total**2) / n_units)) + # (dispatches to survey TSL when survey context is active) + se = self._eif_se(agg_eif_total, n_units) return overall_att, se @@ -961,9 +1117,11 @@ def _aggregate_event_study( ) agg_eif = agg_eif + wif - agg_se = float(np.sqrt(np.mean(agg_eif**2) / n_units)) + agg_se = self._eif_se(agg_eif, n_units) - t_stat, p_val, ci = safe_inference(agg_eff, agg_se, alpha=self.alpha) + t_stat, p_val, ci = safe_inference( + agg_eff, agg_se, alpha=self.alpha, df=self._survey_df + ) result[e] = { "effect": agg_eff, "se": agg_se, @@ -1021,9 +1179,11 @@ def _aggregate_by_group( agg_eif = np.zeros(n_units) for k, gt in enumerate(g_gts): agg_eif += w[k] * eif_by_gt[gt] - agg_se = float(np.sqrt(np.mean(agg_eif**2) / n_units)) + agg_se = self._eif_se(agg_eif, n_units) - t_stat, p_val, ci = safe_inference(agg_eff, agg_se, alpha=self.alpha) + t_stat, p_val, ci = safe_inference( + agg_eff, agg_se, alpha=self.alpha, df=self._survey_df + ) result[g] = { "effect": agg_eff, "se": agg_se, diff --git a/diff_diff/efficient_did_results.py b/diff_diff/efficient_did_results.py index 35393466..24ca1fa5 100644 --- a/diff_diff/efficient_did_results.py +++ b/diff_diff/efficient_did_results.py @@ -119,6 +119,8 @@ class EfficientDiDResults: sieve_criterion: str = "bic" ratio_clip: float = 20.0 kernel_bandwidth: Optional[float] = None + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: sig = _get_significance_stars(self.overall_p_value) @@ -155,6 +157,27 @@ def summary(self, alpha: Optional[float] = None) -> str: lines.append(f"{'Bootstrap:':<30} {self.n_bootstrap:>10} ({self.bootstrap_weights})") lines.append("") + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "-" * 85, + "Survey Design".center(85), + "-" * 85, + f"{'Weight type:':<30} {sm.weight_type:>10}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<30} {sm.n_strata:>10}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>10}") + lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>10.1f}") + lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>10.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>10}") + lines.extend(["-" * 85, ""]) + # Overall ATT lines.extend( [ diff --git a/diff_diff/efficient_did_weights.py b/diff_diff/efficient_did_weights.py index 453cf12c..685feaac 100644 --- a/diff_diff/efficient_did_weights.py +++ b/diff_diff/efficient_did_weights.py @@ -10,7 +10,7 @@ """ import warnings -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np @@ -101,15 +101,36 @@ def enumerate_valid_triples( return pairs -def _sample_cov(a: np.ndarray, b: np.ndarray) -> float: +def _sample_cov( + a: np.ndarray, + b: np.ndarray, + w: Optional[np.ndarray] = None, +) -> float: """Sample covariance between two 1-D arrays (ddof=1). Returns 0.0 if fewer than 2 observations. + + Parameters + ---------- + a, b : ndarray, shape (n,) + Data arrays. + w : ndarray, shape (n,), optional + Survey weights. When provided, computes the reliability-weighted + covariance: ``sum(w*(a-a_bar)*(b-b_bar)) / (sum(w) - 1)`` where + ``a_bar = average(a, weights=w)``. """ n = len(a) if n < 2: return 0.0 - return float(((a - a.mean()) * (b - b.mean())).sum() / (n - 1)) + if w is None: + return float(((a - a.mean()) * (b - b.mean())).sum() / (n - 1)) + # Weighted covariance with reliability weights (Bessel-style correction) + a_bar = float(np.average(a, weights=w)) + b_bar = float(np.average(b, weights=w)) + sum_w = float(np.sum(w)) + if sum_w <= 1.0: + return 0.0 + return float(np.sum(w * (a - a_bar) * (b - b_bar)) / (sum_w - 1.0)) def compute_omega_star_nocov( @@ -123,6 +144,7 @@ def compute_omega_star_nocov( period_1_col: int, cohort_fractions: Dict[float, float], never_treated_val: float = np.inf, + unit_weights: Optional[np.ndarray] = None, ) -> np.ndarray: """Build the |H| x |H| covariance matrix Omega* (Eq 3.12, unconditional). @@ -152,6 +174,9 @@ def compute_omega_star_nocov( ``{cohort: n_cohort / n}`` for each cohort. never_treated_val : float Sentinel for the never-treated group. + unit_weights : ndarray, shape (n_units,), optional + Survey weights at the unit level. When provided, all sample + means and covariances are weighted. Returns ------- @@ -170,6 +195,10 @@ def compute_omega_star_nocov( Y_g = outcome_wide[g_mask] # (n_g, n_periods) pi_g = cohort_fractions[target_g] + # Extract per-cohort weights (None propagates = unweighted) + w_g = unit_weights[g_mask] if unit_weights is not None else None + w_inf = unit_weights[never_treated_mask] if unit_weights is not None else None + # Y_t - Y_1 for the target group Yg_t_minus_1 = Y_g[:, t_col] - Y_g[:, y1_col] @@ -182,7 +211,7 @@ def compute_omega_star_nocov( # Hoist Term 1: (1/pi_g) * Var(Y_t - Y_1 | G=g) — same for all (j, k) term1 = 0.0 if pi_g > 0: - term1 = (1.0 / pi_g) * _sample_cov(Yg_t_minus_1, Yg_t_minus_1) + term1 = (1.0 / pi_g) * _sample_cov(Yg_t_minus_1, Yg_t_minus_1, w=w_g) # Precompute differenced arrays to avoid redundant slicing in the loop # Never-treated: Y_t - Y_{tpre} and Y_{tpre} - Y_1 for each tpre @@ -206,10 +235,14 @@ def compute_omega_star_nocov( # Comparison cohort submatrices: cache outcome_wide[cohort_masks[gp]] gp_outcomes: Dict[float, np.ndarray] = {} + gp_weights: Dict[float, Optional[np.ndarray]] = {} for gp, _ in valid_pairs: if not np.isinf(gp) and gp not in gp_outcomes: if gp in cohort_masks: gp_outcomes[gp] = outcome_wide[cohort_masks[gp]] + gp_weights[gp] = ( + unit_weights[cohort_masks[gp]] if unit_weights is not None else None + ) # Comparison cohort: Y_{tpre} - Y_1 for each (gp, tpre) pair in Term 5 gp_tpre_minus_1: Dict[Tuple[float, int], np.ndarray] = {} @@ -227,23 +260,35 @@ def compute_omega_star_nocov( # Term 2: (1/pi_inf) * SampleCov(Y_t - Y_{tpre_j}, Y_t - Y_{tpre_k} | G=inf) if pi_inf > 0 and tpre_j_col in inf_t_minus_tpre: val += (1.0 / pi_inf) * _sample_cov( - inf_t_minus_tpre[tpre_j_col], inf_t_minus_tpre[tpre_k_col] + inf_t_minus_tpre[tpre_j_col], + inf_t_minus_tpre[tpre_k_col], + w=w_inf, ) # Term 3: -1{g == g'_j} / pi_g * SampleCov(Y_t-Y_1, Y_{tpre_j}-Y_1 | G=g) if gp_j == target_g and tpre_j_col in g_tpre_minus_1: - val -= (1.0 / pi_g) * _sample_cov(Yg_t_minus_1, g_tpre_minus_1[tpre_j_col]) + val -= (1.0 / pi_g) * _sample_cov( + Yg_t_minus_1, + g_tpre_minus_1[tpre_j_col], + w=w_g, + ) # Term 4: -1{g == g'_k} / pi_g * SampleCov(Y_t-Y_1, Y_{tpre_k}-Y_1 | G=g) if gp_k == target_g and tpre_k_col in g_tpre_minus_1: - val -= (1.0 / pi_g) * _sample_cov(Yg_t_minus_1, g_tpre_minus_1[tpre_k_col]) + val -= (1.0 / pi_g) * _sample_cov( + Yg_t_minus_1, + g_tpre_minus_1[tpre_k_col], + w=w_g, + ) # Term 5: 1{g'_j == g'_k} / pi_{g'_j} * SampleCov(Y_{tpre_j}-Y_1, Y_{tpre_k}-Y_1 | G=g'_j) if gp_j == gp_k: if np.isinf(gp_j): if pi_inf > 0 and tpre_j_col in inf_tpre_minus_1: val += (1.0 / pi_inf) * _sample_cov( - inf_tpre_minus_1[tpre_j_col], inf_tpre_minus_1[tpre_k_col] + inf_tpre_minus_1[tpre_j_col], + inf_tpre_minus_1[tpre_k_col], + w=w_inf, ) else: pi_gp = cohort_fractions.get(gp_j, 0.0) @@ -251,6 +296,7 @@ def compute_omega_star_nocov( Y_gp = gp_outcomes.get(gp_j) if Y_gp is None: Y_gp = outcome_wide[cohort_masks[gp_j]] + w_gp = gp_weights.get(gp_j) if len(Y_gp) >= 2: # Cache tpre diffs for comparison cohorts key_j = (gp_j, tpre_j_col) @@ -260,7 +306,9 @@ def compute_omega_star_nocov( if key_k not in gp_tpre_minus_1: gp_tpre_minus_1[key_k] = Y_gp[:, tpre_k_col] - Y_gp[:, y1_col] val += (1.0 / pi_gp) * _sample_cov( - gp_tpre_minus_1[key_j], gp_tpre_minus_1[key_k] + gp_tpre_minus_1[key_j], + gp_tpre_minus_1[key_k], + w=w_gp, ) omega[j, k] = val @@ -354,6 +402,7 @@ def compute_generated_outcomes_nocov( period_to_col: Dict[float, int], period_1_col: int, never_treated_val: float = np.inf, + unit_weights: Optional[np.ndarray] = None, ) -> np.ndarray: """Compute generated outcome vector (one scalar per valid pair). @@ -378,6 +427,9 @@ def compute_generated_outcomes_nocov( Pre-computed data structures. never_treated_val : float Sentinel for never-treated. + unit_weights : ndarray, shape (n_units,), optional + Survey weights at the unit level. When provided, all sample + means become weighted means. Returns ------- @@ -391,10 +443,20 @@ def compute_generated_outcomes_nocov( t_col = period_to_col[target_t] y1_col = period_1_col - # Target group mean: mean(Y_t - Y_1 | G = g) + # Helper: weighted or unweighted mean + def _wmean(vals: np.ndarray, w: Optional[np.ndarray]) -> float: + if w is not None: + return float(np.average(vals, weights=w)) + return float(np.mean(vals)) + + # Per-cohort weights g_mask = cohort_masks[target_g] + w_g = unit_weights[g_mask] if unit_weights is not None else None + w_inf = unit_weights[never_treated_mask] if unit_weights is not None else None + + # Target group mean: mean(Y_t - Y_1 | G = g) Y_g = outcome_wide[g_mask] - mean_g_t_1 = float(np.mean(Y_g[:, t_col] - Y_g[:, y1_col])) + mean_g_t_1 = _wmean(Y_g[:, t_col] - Y_g[:, y1_col], w_g) # Never-treated outcomes Y_inf = outcome_wide[never_treated_mask] @@ -405,14 +467,16 @@ def compute_generated_outcomes_nocov( tpre_col = period_to_col[tpre] # mean(Y_t - Y_{tpre} | G = inf) - mean_inf_t_tpre = float(np.mean(Y_inf[:, t_col] - Y_inf[:, tpre_col])) + mean_inf_t_tpre = _wmean(Y_inf[:, t_col] - Y_inf[:, tpre_col], w_inf) # mean(Y_{tpre} - Y_1 | G = g') if np.isinf(gp): Y_gp = Y_inf + w_gp = w_inf else: Y_gp = outcome_wide[cohort_masks[gp]] - mean_gp_tpre_1 = float(np.mean(Y_gp[:, tpre_col] - Y_gp[:, y1_col])) + w_gp = unit_weights[cohort_masks[gp]] if unit_weights is not None else None + mean_gp_tpre_1 = _wmean(Y_gp[:, tpre_col] - Y_gp[:, y1_col], w_gp) y_hat[j] = mean_g_t_1 - mean_inf_t_tpre - mean_gp_tpre_1 @@ -432,6 +496,7 @@ def compute_eif_nocov( cohort_fractions: Dict[float, float], n_units: int, never_treated_val: float = np.inf, + unit_weights: Optional[np.ndarray] = None, ) -> np.ndarray: """Compute per-unit efficient influence function values. @@ -462,6 +527,9 @@ def compute_eif_nocov( outcome_wide, cohort_masks, never_treated_mask, period_to_col, period_1_col, cohort_fractions, n_units, never_treated_val : Pre-computed data structures. + unit_weights : ndarray, shape (n_units,), optional + Survey weights at the unit level. When provided, within-group + means are weighted means. Returns ------- @@ -482,11 +550,21 @@ def compute_eif_nocov( Y_inf = outcome_wide[never_treated_mask] pi_inf = cohort_fractions.get(never_treated_val, 0.0) + # Per-cohort weights + w_g = unit_weights[g_mask] if unit_weights is not None else None + w_inf = unit_weights[never_treated_mask] if unit_weights is not None else None + + # Helper for weighted/unweighted mean + def _wmean(vals: np.ndarray, w: Optional[np.ndarray]) -> float: + if w is not None: + return float(np.average(vals, weights=w)) + return float(np.mean(vals)) + eif = np.zeros(n_units) # Hoist treated-group computations out of the per-pair loop (j-invariant) Yg_t_minus_1 = Y_g[:, t_col] - Y_g[:, y1_col] - mean_g_t_1 = float(np.mean(Yg_t_minus_1)) + mean_g_t_1 = _wmean(Yg_t_minus_1, w_g) treated_demeaned = None if pi_g > 0: treated_demeaned = (1.0 / pi_g) * (Yg_t_minus_1 - mean_g_t_1) @@ -507,7 +585,7 @@ def compute_eif_nocov( # --- Never-treated term --- if tpre_col not in inf_diffs: inf_diffs[tpre_col] = Y_inf[:, t_col] - Y_inf[:, tpre_col] - inf_means[tpre_col] = float(np.mean(inf_diffs[tpre_col])) + inf_means[tpre_col] = _wmean(inf_diffs[tpre_col], w_inf) if pi_inf > 0: inf_contrib = -(1.0 / pi_inf) * (inf_diffs[tpre_col] - inf_means[tpre_col]) eif[never_treated_mask] += w_j * inf_contrib @@ -518,7 +596,7 @@ def compute_eif_nocov( # Comparison group is never-treated; contribution is folded into # the never-treated term via Y_{tpre} - Y_1 differencing. # Additional term: -(1/pi_inf) * demeaned (Y_{tpre} - Y_1 | G=inf) - mean_inf_tpre_1 = np.mean(Y_inf[:, tpre_col] - Y_inf[:, y1_col]) + mean_inf_tpre_1 = _wmean(Y_inf[:, tpre_col] - Y_inf[:, y1_col], w_inf) if pi_inf > 0: gp_contrib = -(1.0 / pi_inf) * ( (Y_inf[:, tpre_col] - Y_inf[:, y1_col]) - mean_inf_tpre_1 @@ -528,7 +606,8 @@ def compute_eif_nocov( gp_mask = cohort_masks[gp] Y_gp = outcome_wide[gp_mask] pi_gp = cohort_fractions.get(gp, 0.0) - mean_gp_tpre_1 = np.mean(Y_gp[:, tpre_col] - Y_gp[:, y1_col]) + w_gp = unit_weights[gp_mask] if unit_weights is not None else None + mean_gp_tpre_1 = _wmean(Y_gp[:, tpre_col] - Y_gp[:, y1_col], w_gp) if pi_gp > 0: gp_contrib = -(1.0 / pi_gp) * ( (Y_gp[:, tpre_col] - Y_gp[:, y1_col]) - mean_gp_tpre_1 diff --git a/diff_diff/stacked_did.py b/diff_diff/stacked_did.py index e01b6b23..4ef6a6ee 100644 --- a/diff_diff/stacked_did.py +++ b/diff_diff/stacked_did.py @@ -17,6 +17,7 @@ Difference-in-Differences. NBER Working Paper 32054. """ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple @@ -167,6 +168,7 @@ def fit( first_treat: str, aggregate: Optional[str] = None, population: Optional[str] = None, + survey_design=None, ) -> StackedDiDResults: """ Fit the stacked DiD estimator. @@ -193,6 +195,10 @@ def fit( population : str, optional Column name for population weights. Required only when weighting="population". + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. When + provided, uses Taylor Series Linearization for variance + estimation and applies sampling weights to the regression. Returns ------- @@ -227,6 +233,38 @@ def fit( if self.weighting == "population" and population is None: raise ValueError("population column must be specified when weighting='population'") + # ---- Resolve survey design ---- + from diff_diff.survey import ( + SurveyDesign, + _resolve_survey_for_fit, + ) + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Reject fweight and aweight — Q-weight composition is ratio-valued + # and breaks both frequency-weight (integer) and analytic-weight + # (inverse-variance) semantics after multiplicative composition + if ( + survey_design is not None + and hasattr(survey_design, "weight_type") + and survey_design.weight_type in ("fweight", "aweight") + ): + raise ValueError( + f"StackedDiD does not support weight_type='{survey_design.weight_type}' " + "because Q-weight composition changes the weight semantics. " + "Use weight_type='pweight' (default) instead." + ) + + # Collect survey design column names for propagation through sub-experiments + survey_cols: List[str] = [] + if survey_design is not None and isinstance(survey_design, SurveyDesign): + for attr in ("weights", "strata", "psu", "fpc"): + col_name = getattr(survey_design, attr, None) + if col_name is not None: + survey_cols.append(col_name) + df = data.copy() df[time] = pd.to_numeric(df[time]) df[first_treat] = pd.to_numeric(df[first_treat]) @@ -263,7 +301,16 @@ def fit( sub_experiments = [] skipped_events = [] for a in omega_kappa: - sub_exp = self._build_sub_experiment(df, unit_info, a, unit, time, first_treat, outcome) + sub_exp = self._build_sub_experiment( + df, + unit_info, + a, + unit, + time, + first_treat, + outcome, + extra_cols=survey_cols, + ) if sub_exp is not None and len(sub_exp) > 0: sub_experiments.append(sub_exp) else: @@ -331,7 +378,25 @@ def fit( # WLS via sqrt(w) transformation Q_weights = stacked_df["_Q_weight"].values - sqrt_w = np.sqrt(Q_weights) + n_stacked = len(stacked_df) + + # Compose Q-weights with survey weights if survey design is present + if resolved_survey is not None and survey_weights is not None: + # Survey weights were resolved on the original data; the stacked + # dataset carries the survey weight column through _build_sub_experiment. + # Re-extract from the stacked data so lengths match. + survey_weights_stacked = ( + stacked_df[survey_design.weights].values.astype(np.float64) + if survey_design.weights is not None + else np.ones(n_stacked, dtype=np.float64) + ) + composed_weights = Q_weights * survey_weights_stacked + # Normalize composed weights to sum = n_stacked + composed_weights = composed_weights * (n_stacked / np.sum(composed_weights)) + else: + composed_weights = Q_weights + + sqrt_w = np.sqrt(composed_weights) Y = stacked_df[outcome].values Y_t = Y * sqrt_w X_t = X * sqrt_w[:, np.newaxis] @@ -354,6 +419,43 @@ def fit( ) assert vcov is not None + # ---- Survey VCV override (TSL variance) ---- + if resolved_survey is not None: + from diff_diff.survey import ( + _inject_cluster_as_psu, + _resolve_effective_cluster, + compute_survey_metadata, + compute_survey_vcov, + ) + + # Re-resolve survey design on the stacked data so that strata/PSU + # arrays have the correct length for TSL variance estimation. + resolved_stacked = survey_design.resolve(stacked_df) + + # Create a copy with composed weights (normalized to sum=n_stacked) + resolved_composed = copy.copy(resolved_stacked) + resolved_composed.weights = composed_weights + + # Original-scale residuals for TSL variance + resid_orig = Y - X @ coef + + # Inject cluster as PSU when survey design has no explicit PSU + resolved_composed = _inject_cluster_as_psu(resolved_composed, cluster_ids) + + # Resolve effective cluster (PSU overrides user-specified cluster) + _resolve_effective_cluster(resolved_composed, cluster_ids, self.cluster) + + # Compute TSL variance + vcov = compute_survey_vcov(X, resid_orig, resolved_composed) + + # Recompute survey metadata on the stacked resolved design + raw_w_stacked = ( + stacked_df[survey_design.weights].values.astype(np.float64) + if survey_design.weights is not None + else np.ones(n_stacked, dtype=np.float64) + ) + survey_metadata = compute_survey_metadata(resolved_composed, raw_w_stacked) + # ---- Extract event study effects ---- event_study_effects: Optional[Dict[int, Dict[str, Any]]] = None if aggregate == "event_study": @@ -371,7 +473,14 @@ def fit( idx = interaction_indices[h] effect = float(coef[idx]) se = float(np.sqrt(max(vcov[idx, idx], 0.0))) - t_stat, p_value, conf_int = safe_inference(effect, se, alpha=self.alpha) + _survey_df = ( + max(survey_metadata.df_survey, 1) + if survey_metadata is not None and survey_metadata.df_survey is not None + else None + ) + t_stat, p_value, conf_int = safe_inference( + effect, se, alpha=self.alpha, df=_survey_df + ) n_obs_h = int(np.sum((et_vals == h) & (d_vals == 1))) event_study_effects[h] = { "effect": effect, @@ -401,7 +510,14 @@ def fit( overall_att = np.nan overall_se = np.nan - overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha) + _survey_df_overall = ( + max(survey_metadata.df_survey, 1) + if survey_metadata is not None and survey_metadata.df_survey is not None + else None + ) + overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, alpha=self.alpha, df=_survey_df_overall + ) # ---- Construct results ---- self.results_ = StackedDiDResults( @@ -426,6 +542,7 @@ def fit( weighting=self.weighting, clean_control=self.clean_control, alpha=self.alpha, + survey_metadata=survey_metadata, ) self.is_fitted_ = True @@ -535,6 +652,7 @@ def _build_sub_experiment( time: str, first_treat: str, outcome: str, + extra_cols: Optional[List[str]] = None, ) -> Optional[pd.DataFrame]: """ Build a single sub-experiment for adoption event a. @@ -549,6 +667,10 @@ def _build_sub_experiment( Adoption event time. unit, time, first_treat, outcome : str Column names. + extra_cols : list of str, optional + Additional columns to propagate from the source data into the + sub-experiment (e.g., survey design columns: weights, strata, + psu, fpc). Returns ------- @@ -816,6 +938,7 @@ def stacked_did( kappa_post: int = 1, aggregate: Optional[str] = None, population: Optional[str] = None, + survey_design=None, **kwargs: Any, ) -> StackedDiDResults: """ @@ -843,6 +966,8 @@ def stacked_did( Aggregation mode: None, "simple", or "event_study". population : str, optional Population column for weighting="population". + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. **kwargs Additional keyword arguments passed to StackedDiD constructor. @@ -869,4 +994,5 @@ def stacked_did( first_treat=first_treat, aggregate=aggregate, population=population, + survey_design=survey_design, ) diff --git a/diff_diff/stacked_did_results.py b/diff_diff/stacked_did_results.py index 99b141a0..631f3817 100644 --- a/diff_diff/stacked_did_results.py +++ b/diff_diff/stacked_did_results.py @@ -93,6 +93,8 @@ class StackedDiDResults: weighting: str = "aggregate" clean_control: str = "not_yet_treated" alpha: float = 0.05 + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: """Concise string representation.""" @@ -139,6 +141,27 @@ def summary(self, alpha: Optional[float] = None) -> str: "", ] + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "-" * 85, + "Survey Design".center(85), + "-" * 85, + f"{'Weight type:':<30} {sm.weight_type:>10}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<30} {sm.n_strata:>10}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>10}") + lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>10.1f}") + lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>10.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>10}") + lines.extend(["-" * 85, ""]) + # Overall ATT lines.extend( [ diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py index e650f5b0..107df479 100644 --- a/diff_diff/sun_abraham.py +++ b/diff_diff/sun_abraham.py @@ -17,7 +17,7 @@ import pandas as pd from diff_diff.bootstrap_utils import compute_effect_bootstrap_stats -from diff_diff.linalg import LinearRegression, compute_robust_vcov +from diff_diff.linalg import LinearRegression from diff_diff.results import _get_significance_stars from diff_diff.utils import ( safe_inference, @@ -83,6 +83,8 @@ class SunAbrahamResults: cohort_effects: Optional[Dict[Tuple[Any, int], Dict[str, Any]]] = field( default=None, repr=False ) + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: """Concise string representation.""" @@ -126,6 +128,27 @@ def summary(self, alpha: Optional[float] = None) -> str: "", ] + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "-" * 85, + "Survey Design".center(85), + "-" * 85, + f"{'Weight type:':<30} {sm.weight_type:>10}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<30} {sm.n_strata:>10}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>10}") + lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>10.1f}") + lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>10.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>10}") + lines.extend(["-" * 85, ""]) + # Overall ATT lines.extend( [ @@ -434,6 +457,7 @@ def fit( time: str, first_treat: str, covariates: Optional[List[str]] = None, + survey_design: object = None, ) -> SunAbrahamResults: """ Fit the Sun-Abraham estimator using saturated regression. @@ -453,6 +477,10 @@ def fit( Use 0 (or np.inf) for never-treated units. covariates : list, optional List of covariate column names to include in regression. + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. + Supports weighted estimation and Taylor series linearization + variance with strata, PSU, and FPC. Returns ------- @@ -473,6 +501,21 @@ def fit( if missing: raise ValueError(f"Missing columns: {missing}") + # Resolve survey design if provided + from diff_diff.survey import _resolve_effective_cluster, _resolve_survey_for_fit + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Reject bootstrap + survey (pairs bootstrap with survey weights needs Phase 5) + if self.n_bootstrap > 0 and resolved_survey is not None: + raise NotImplementedError( + "Bootstrap inference with survey weights is not yet supported " + "for SunAbraham. Use analytical inference (n_bootstrap=0) with " + "survey_design for design-based standard errors." + ) + # Create working copy df = data.copy() @@ -544,6 +587,23 @@ def fit( # Keep all units (not_yet_treated will be handled by the regression) df_reg = df.copy() + # Resolve effective cluster and inject cluster-as-PSU + cluster_ids_raw = df_reg[cluster_var].values if cluster_var in df_reg.columns else None + effective_cluster_ids = _resolve_effective_cluster( + resolved_survey, cluster_ids_raw, cluster_var if self.cluster is not None else None + ) + if resolved_survey is not None and effective_cluster_ids is not None: + from diff_diff.survey import _inject_cluster_as_psu, compute_survey_metadata + + resolved_survey = _inject_cluster_as_psu(resolved_survey, effective_cluster_ids) + if resolved_survey.psu is not None and survey_metadata is not None: + raw_w = ( + data[survey_design.weights].values.astype(np.float64) + if survey_design.weights + else np.ones(len(data), dtype=np.float64) + ) + survey_metadata = compute_survey_metadata(resolved_survey, raw_w) + # Fit saturated regression ( cohort_effects, @@ -560,6 +620,25 @@ def fit( rel_periods_to_estimate, covariates, cluster_var, + survey_weights=survey_weights, + survey_weight_type=survey_weight_type, + resolved_survey=resolved_survey, + ) + + # Resolve survey weight column name for cohort aggregation + survey_weight_col = ( + survey_design.weights + if survey_design is not None + and hasattr(survey_design, "weights") + and survey_design.weights + else None + ) + + # Survey degrees of freedom for t-distribution inference + _sa_survey_df = ( + max(survey_metadata.df_survey, 1) + if survey_metadata is not None and survey_metadata.df_survey is not None + else None ) # Compute interaction-weighted event study effects @@ -573,6 +652,8 @@ def fit( cohort_ses, vcov_cohort, coef_index_map, + survey_weight_col=survey_weight_col, + survey_df=_sa_survey_df, ) # Compute overall ATT (average of post-treatment effects) @@ -584,9 +665,12 @@ def fit( cohort_weights, vcov_cohort, coef_index_map, + survey_weight_col=survey_weight_col, ) - overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha) + overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, alpha=self.alpha, df=_sa_survey_df + ) # Run bootstrap if requested bootstrap_results = None @@ -652,6 +736,7 @@ def fit( control_group=self.control_group, bootstrap_results=bootstrap_results, cohort_effects=cohort_effects_storage, + survey_metadata=survey_metadata, ) self.is_fitted_ = True @@ -668,6 +753,9 @@ def _fit_saturated_regression( rel_periods: List[int], covariates: Optional[List[str]], cluster_var: str, + survey_weights: Optional[np.ndarray] = None, + survey_weight_type: str = "pweight", + resolved_survey: object = None, ) -> Tuple[ Dict[Tuple[Any, int], float], Dict[Tuple[Any, int], float], @@ -729,7 +817,9 @@ def _fit_saturated_regression( if covariates: variables_to_demean.extend(covariates) - df_demeaned = self._within_transform(df, variables_to_demean, unit, time) + df_demeaned = _within_transform_util( + df, variables_to_demean, unit, time, suffix="_dm", weights=survey_weights + ) # Build design matrix X_cols = [f"{col}_dm" for col in interaction_cols] @@ -752,9 +842,11 @@ def _fit_saturated_regression( robust=True, cluster_ids=cluster_ids, rank_deficient_action=self.rank_deficient_action, + weights=survey_weights, + weight_type=survey_weight_type, + survey_design=resolved_survey, ).fit(X, y, df_adjustment=df_adj) - coefficients = reg.coefficients_ vcov = reg.vcov_ # Extract cohort effects and standard errors using get_inference @@ -798,6 +890,8 @@ def _compute_iw_effects( cohort_ses: Dict[Tuple[Any, int], float], vcov_cohort: np.ndarray, coef_index_map: Dict[Tuple[Any, int], int], + survey_weight_col: Optional[str] = None, + survey_df: Optional[int] = None, ) -> Tuple[Dict[int, Dict[str, Any]], Dict[int, Dict[Any, float]]]: """ Compute interaction-weighted event study effects. @@ -807,6 +901,10 @@ def _compute_iw_effects( where w_{g,e} = n_{g,e} / Σ_g n_{g,e} is the share of observations from cohort g at event-time e among all treated observations at that event-time. + When survey weights are provided, n_{g,e} is the survey-weighted mass + (sum of weights) rather than raw observation counts, so the estimand + reflects the survey-weighted cohort composition. + Returns ------- event_study_effects : dict @@ -817,8 +915,15 @@ def _compute_iw_effects( event_study_effects: Dict[int, Dict[str, Any]] = {} cohort_weights: Dict[int, Dict[Any, float]] = {} - # Pre-compute per-event-time observation counts: n_{g,e} - event_time_counts = df[df[first_treat] > 0].groupby([first_treat, "_rel_time"]).size() + # Pre-compute per-event-time observation mass: n_{g,e} + # With survey weights, use weighted sum; otherwise raw counts. + treated_mask = df[first_treat] > 0 + if survey_weight_col is not None and survey_weight_col in df.columns: + event_time_counts = ( + df[treated_mask].groupby([first_treat, "_rel_time"])[survey_weight_col].sum() + ) + else: + event_time_counts = df[treated_mask].groupby([first_treat, "_rel_time"]).size() for e in rel_periods: # Get cohorts that have observations at this relative time @@ -858,7 +963,7 @@ def _compute_iw_effects( agg_var = float(weight_vec @ vcov_subset @ weight_vec) agg_se = np.sqrt(max(agg_var, 0)) - t_stat, p_val, ci = safe_inference(agg_effect, agg_se, alpha=self.alpha) + t_stat, p_val, ci = safe_inference(agg_effect, agg_se, alpha=self.alpha, df=survey_df) event_study_effects[e] = { "effect": agg_effect, @@ -880,10 +985,14 @@ def _compute_overall_att( cohort_weights: Dict[int, Dict[Any, float]], vcov_cohort: np.ndarray, coef_index_map: Dict[Tuple[Any, int], int], + survey_weight_col: Optional[str] = None, ) -> Tuple[float, float]: """ Compute overall ATT as weighted average of post-treatment effects. + When survey weights are provided, the per-period weights use + survey-weighted mass rather than raw observation counts. + Returns (att, se) tuple. """ post_effects = [(e, eff) for e, eff in event_study_effects.items() if e >= 0] @@ -891,13 +1000,19 @@ def _compute_overall_att( if not post_effects: return np.nan, np.nan - # Weight by number of treated observations at each relative time + # Weight by (survey-weighted) mass of treated observations at each relative time post_weights = [] post_estimates = [] for e, eff in post_effects: - n_at_e = len(df[(df["_rel_time"] == e) & (df[first_treat] > 0)]) - post_weights.append(max(n_at_e, 1)) + mask = (df["_rel_time"] == e) & (df[first_treat] > 0) + if survey_weight_col is not None and survey_weight_col in df.columns: + # No floor for survey weights — valid masses can be < 1 + n_at_e = df.loc[mask, survey_weight_col].sum() + post_weights.append(n_at_e if n_at_e > 0 else 0.0) + else: + n_at_e = len(df[mask]) + post_weights.append(max(n_at_e, 1)) post_estimates.append(eff["effect"]) post_weights_arr = np.array(post_weights, dtype=float) diff --git a/diff_diff/survey.py b/diff_diff/survey.py index 24684b74..d47d86ee 100644 --- a/diff_diff/survey.py +++ b/diff_diff/survey.py @@ -390,6 +390,46 @@ def compute_survey_metadata( ) +def _validate_unit_constant_survey(data, unit_col, survey_design): + """Validate that survey design columns are constant within units. + + Panel estimators (ContinuousDiD, EfficientDiD) collapse panel-level + survey info to one row per unit. This requires that survey columns + do not vary across time periods within a unit. + + Parameters + ---------- + data : pd.DataFrame + Panel data. + unit_col : str + Unit identifier column name. + survey_design : SurveyDesign + Survey design specification (uses attribute names, not resolved arrays). + + Raises + ------ + ValueError + If any survey column varies within units. + """ + cols_to_check = [ + survey_design.weights, + survey_design.strata, + survey_design.psu, + survey_design.fpc, + ] + for col in cols_to_check: + if col is not None and col in data.columns: + n_unique = data.groupby(unit_col)[col].nunique() + varying_units = n_unique[n_unique > 1] + if len(varying_units) > 0: + raise ValueError( + f"Survey column '{col}' varies within units " + f"(found {len(varying_units)} units with multiple values). " + f"Panel estimators require survey design columns to be " + f"constant within units." + ) + + def _resolve_survey_for_fit(survey_design, data, inference_mode="analytical"): """ Shared helper: validate and resolve a SurveyDesign for an estimator fit() call. @@ -468,9 +508,7 @@ def _inject_cluster_as_psu(resolved, cluster_ids): # When strata are present, make cluster IDs unique within strata # (same nesting logic as SurveyDesign.resolve() with nest=True) if resolved.strata is not None: - combined = np.array( - [f"{s}_{c}" for s, c in zip(resolved.strata, cluster_ids)] - ) + combined = np.array([f"{s}_{c}" for s, c in zip(resolved.strata, cluster_ids)]) codes, uniques = pd.factorize(combined) else: codes, uniques = pd.factorize(cluster_ids) diff --git a/diff_diff/triple_diff.py b/diff_diff/triple_diff.py index 3f989f51..15f04263 100644 --- a/diff_diff/triple_diff.py +++ b/diff_diff/triple_diff.py @@ -28,7 +28,8 @@ import numpy as np import pandas as pd -from diff_diff.linalg import solve_ols, solve_logit + +from diff_diff.linalg import solve_logit, solve_ols from diff_diff.results import _get_significance_stars from diff_diff.utils import safe_inference @@ -102,6 +103,8 @@ class TripleDifferenceResults: inference_method: str = field(default="analytical") n_bootstrap: Optional[int] = field(default=None) n_clusters: Optional[int] = field(default=None) + # Survey design metadata (SurveyMetadata instance from diff_diff.survey) + survey_metadata: Optional[Any] = field(default=None) def __repr__(self) -> str: """Concise string representation.""" @@ -146,6 +149,28 @@ def summary(self, alpha: Optional[float] = None) -> str: if self.r_squared is not None: lines.append(f"{'R-squared:':<30} {self.r_squared:>15.4f}") + # Add survey design info + if self.survey_metadata is not None: + sm = self.survey_metadata + lines.extend( + [ + "", + "-" * 75, + "Survey Design".center(75), + "-" * 75, + f"{'Weight type:':<30} {sm.weight_type:>15}", + ] + ) + if sm.n_strata is not None: + lines.append(f"{'Strata:':<30} {sm.n_strata:>15}") + if sm.n_psu is not None: + lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>15}") + lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>15.1f}") + lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>15.2f}") + if sm.df_survey is not None: + lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>15}") + lines.append("-" * 75) + if self.inference_method != "analytical": lines.append(f"{'Inference method:':<30} {self.inference_method:>15}") if self.n_bootstrap is not None: @@ -236,6 +261,15 @@ def to_dict(self) -> Dict[str, Any]: result["n_bootstrap"] = self.n_bootstrap if self.n_clusters is not None: result["n_clusters"] = self.n_clusters + if self.survey_metadata is not None: + sm = self.survey_metadata + result["weight_type"] = sm.weight_type + result["effective_n"] = sm.effective_n + result["design_effect"] = sm.design_effect + result["sum_weights"] = sm.sum_weights + result["n_strata"] = sm.n_strata + result["n_psu"] = sm.n_psu + result["df_survey"] = sm.df_survey return result def to_dataframe(self) -> pd.DataFrame: @@ -416,6 +450,7 @@ def fit( partition: str, time: str, covariates: Optional[List[str]] = None, + survey_design=None, ) -> TripleDifferenceResults: """ Fit the Triple Difference model. @@ -442,6 +477,11 @@ def fit( List of covariate column names to adjust for. These are properly incorporated using the selected estimation method (unlike naive DDD implementations). + survey_design : SurveyDesign, optional + Survey design specification for complex survey data. When + provided, uses survey weights for estimation and Taylor Series + Linearization (TSL) for variance estimation. Only supported + with estimation_method="reg". Returns ------- @@ -452,7 +492,28 @@ def fit( ------ ValueError If required columns are missing or data validation fails. + NotImplementedError + If survey_design is used with estimation_method="ipw" or "dr". """ + # Resolve survey design if provided + from diff_diff.survey import ( + _inject_cluster_as_psu, + _resolve_effective_cluster, + _resolve_survey_for_fit, + compute_survey_metadata, + ) + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + # Guard IPW/DR with survey weights + if survey_design is not None and self.estimation_method in ("ipw", "dr"): + raise NotImplementedError( + "IPW and doubly robust methods with survey weights require " + "weighted solve_logit(), planned for Phase 5." + ) + # Validate inputs self._validate_data(data, outcome, group, partition, time, covariates) @@ -467,6 +528,21 @@ def fit( if self._cluster_ids is not None and np.any(pd.isna(data[self.cluster])): raise ValueError(f"Cluster column '{self.cluster}' contains missing values") + # Resolve effective cluster and inject cluster-as-PSU for survey variance + if resolved_survey is not None: + effective_cluster_ids = _resolve_effective_cluster( + resolved_survey, self._cluster_ids, self.cluster + ) + if effective_cluster_ids is not None: + resolved_survey = _inject_cluster_as_psu(resolved_survey, effective_cluster_ids) + if resolved_survey.psu is not None and survey_metadata is not None: + raw_w = ( + data[survey_design.weights].values.astype(np.float64) + if survey_design.weights + else np.ones(len(data), dtype=np.float64) + ) + survey_metadata = compute_survey_metadata(resolved_survey, raw_w) + # Get covariates if specified X = None if covariates: @@ -482,21 +558,33 @@ def fit( n_control_ineligible = int(np.sum((G == 0) & (P == 0))) # Compute cell means for diagnostics - group_means = self._compute_cell_means(y, G, P, T) + group_means = self._compute_cell_means(y, G, P, T, weights=survey_weights) # Estimate ATT based on method if self.estimation_method == "reg": - att, se, r_squared, pscore_stats = self._regression_adjustment(y, G, P, T, X) + att, se, r_squared, pscore_stats = self._regression_adjustment( + y, + G, + P, + T, + X, + survey_weights=survey_weights, + resolved_survey=resolved_survey, + ) elif self.estimation_method == "ipw": att, se, r_squared, pscore_stats = self._ipw_estimation(y, G, P, T, X) else: # doubly robust att, se, r_squared, pscore_stats = self._doubly_robust(y, G, P, T, X) # Compute inference - df = n_obs - 8 # Approximate df (8 cell means) - if covariates: - df -= len(covariates) - df = max(df, 1) + # When survey design is active, use survey df (n_PSU - n_strata) + if survey_metadata is not None and survey_metadata.df_survey is not None: + df = max(survey_metadata.df_survey, 1) + else: + df = n_obs - 8 # Approximate df (8 cell means) + if covariates: + df -= len(covariates) + df = max(df, 1) t_stat, p_value, conf_int = safe_inference(att, se, alpha=self.alpha, df=df) @@ -524,6 +612,7 @@ def fit( r_squared=r_squared, inference_method="analytical", n_clusters=n_clusters, + survey_metadata=survey_metadata, ) self.is_fitted_ = True @@ -606,6 +695,7 @@ def _compute_cell_means( G: np.ndarray, P: np.ndarray, T: np.ndarray, + weights: Optional[np.ndarray] = None, ) -> Dict[str, float]: """Compute mean outcomes for each of the 8 DDD cells.""" means = {} @@ -614,7 +704,10 @@ def _compute_cell_means( for t_val, t_name in [(0, "Pre"), (1, "Post")]: mask = (G == g_val) & (P == p_val) & (T == t_val) cell_name = f"{g_name}, {p_name}, {t_name}" - means[cell_name] = float(np.mean(y[mask])) + if weights is not None: + means[cell_name] = float(np.average(y[mask], weights=weights[mask])) + else: + means[cell_name] = float(np.mean(y[mask])) return means # ========================================================================= @@ -644,6 +737,8 @@ def _regression_adjustment( P: np.ndarray, T: np.ndarray, X: Optional[np.ndarray], + survey_weights: Optional[np.ndarray] = None, + resolved_survey=None, ) -> Tuple[float, float, Optional[float], Optional[Dict[str, float]]]: """ Estimate ATT using regression adjustment via three-DiD decomposition. @@ -653,7 +748,15 @@ def _regression_adjustment( imputed counterfactual means. Matches R's triplediff::ddd() with est_method="reg". """ - return self._estimate_ddd_decomposition(y, G, P, T, X) + return self._estimate_ddd_decomposition( + y, + G, + P, + T, + X, + survey_weights=survey_weights, + resolved_survey=resolved_survey, + ) def _ipw_estimation( self, @@ -699,6 +802,8 @@ def _estimate_ddd_decomposition( P: np.ndarray, T: np.ndarray, X: Optional[np.ndarray], + survey_weights: Optional[np.ndarray] = None, + resolved_survey=None, ) -> Tuple[float, float, Optional[float], Optional[Dict[str, float]]]: """ Core DDD estimation via three-DiD decomposition. @@ -713,6 +818,9 @@ def _estimate_ddd_decomposition( Standard errors use the efficient influence function: SE = std(w3*IF_3 + w2*IF_2 - w1*IF_1) / sqrt(n) + + When resolved_survey is provided, survey-weighted SE is computed + using TSL on the combined influence function. """ n = len(y) est_method = self.estimation_method @@ -825,6 +933,9 @@ def _estimate_ddd_decomposition( overlap_issues.append((j, frac_trimmed)) hessian = None + # Subset survey weights for this comparison + w_sub = survey_weights[mask] if survey_weights is not None else None + # --- Outcome regression --- if est_method == "ipw": # IPW: no outcome regression @@ -835,16 +946,36 @@ def _estimate_ddd_decomposition( else: # Fit separate OLS per subgroup-time cell, predict for all or_ctrl_pre = self._fit_predict_mu( - y_sub, covX_sub, sg_sub == j, post_sub == 0, n_sub + y_sub, + covX_sub, + sg_sub == j, + post_sub == 0, + n_sub, + weights=w_sub, ) or_ctrl_post = self._fit_predict_mu( - y_sub, covX_sub, sg_sub == j, post_sub == 1, n_sub + y_sub, + covX_sub, + sg_sub == j, + post_sub == 1, + n_sub, + weights=w_sub, ) or_trt_pre = self._fit_predict_mu( - y_sub, covX_sub, sg_sub == 4, post_sub == 0, n_sub + y_sub, + covX_sub, + sg_sub == 4, + post_sub == 0, + n_sub, + weights=w_sub, ) or_trt_post = self._fit_predict_mu( - y_sub, covX_sub, sg_sub == 4, post_sub == 1, n_sub + y_sub, + covX_sub, + sg_sub == 4, + post_sub == 1, + n_sub, + weights=w_sub, ) # --- Compute DiD ATT and influence function --- @@ -862,6 +993,7 @@ def _estimate_ddd_decomposition( hessian, est_method, n_sub, + weights=w_sub, ) # Track non-finite IF values (flag for NaN SE later) @@ -893,18 +1025,33 @@ def _estimate_ddd_decomposition( ) # Influence function weights (matching R's att_dr_rc) - n3 = np.sum((subgroup == 3) | (subgroup == 4)) - n2 = np.sum((subgroup == 2) | (subgroup == 4)) - n1 = np.sum((subgroup == 1) | (subgroup == 4)) - w3 = n / n3 - w2 = n / n2 - w1 = n / n1 + if survey_weights is not None: + n3 = np.sum(survey_weights[(subgroup == 3) | (subgroup == 4)]) + n2 = np.sum(survey_weights[(subgroup == 2) | (subgroup == 4)]) + n1 = np.sum(survey_weights[(subgroup == 1) | (subgroup == 4)]) + n_total = np.sum(survey_weights) + else: + n3 = np.sum((subgroup == 3) | (subgroup == 4)) + n2 = np.sum((subgroup == 2) | (subgroup == 4)) + n1 = np.sum((subgroup == 1) | (subgroup == 4)) + n_total = n + w3 = n_total / n3 + w2 = n_total / n2 + w1 = n_total / n1 inf_func = ( w3 * did_results[3]["inf"] + w2 * did_results[2]["inf"] - w1 * did_results[1]["inf"] ) - if self._cluster_ids is not None: + if resolved_survey is not None: + # Survey-weighted SE via TSL on the combined influence function. + # Treat the IF as a single-parameter score vector: + # compute_survey_vcov(ones, IF, resolved) gives V(ATT). + from diff_diff.survey import compute_survey_vcov + + vcov_survey = compute_survey_vcov(np.ones((n, 1)), inf_func, resolved_survey) + se = float(np.sqrt(vcov_survey[0, 0])) + elif self._cluster_ids is not None: # Cluster-robust SE: sum IF within clusters, then Liang-Zeger variance unique_clusters = np.unique(self._cluster_ids) n_clusters_val = len(unique_clusters) @@ -977,8 +1124,9 @@ def _fit_predict_mu( subgroup_mask: np.ndarray, time_mask: np.ndarray, n_total: int, + weights: Optional[np.ndarray] = None, ) -> np.ndarray: - """Fit OLS on a subgroup-time cell, predict for all observations.""" + """Fit OLS (or WLS with survey weights) on a subgroup-time cell, predict for all observations.""" fit_mask = subgroup_mask & time_mask n_fit = int(np.sum(fit_mask)) @@ -989,20 +1137,35 @@ def _fit_predict_mu( y_fit = y[fit_mask] try: - beta, _, _ = solve_ols( - X_fit, - y_fit, - rank_deficient_action=self.rank_deficient_action, - ) + if weights is not None: + # WLS: transform by sqrt(weights) for weighted least squares + w_fit = weights[fit_mask] + sqrt_w = np.sqrt(w_fit) + beta, _, _ = solve_ols( + X_fit * sqrt_w[:, np.newaxis], + y_fit * sqrt_w, + rank_deficient_action=self.rank_deficient_action, + ) + else: + beta, _, _ = solve_ols( + X_fit, + y_fit, + rank_deficient_action=self.rank_deficient_action, + ) # Replace NaN coefficients (dropped columns) with 0 for prediction beta = np.where(np.isnan(beta), 0.0, beta) except ValueError: if self.rank_deficient_action == "error": raise + if weights is not None: + return np.full(n_total, np.average(y_fit, weights=weights[fit_mask])) return np.full(n_total, np.mean(y_fit)) except np.linalg.LinAlgError: + if weights is not None: + return np.full(n_total, np.average(y_fit, weights=weights[fit_mask])) return np.full(n_total, np.mean(y_fit)) + # Prediction uses original-scale X (unchanged) return covX @ beta def _compute_did_rc( @@ -1020,6 +1183,7 @@ def _compute_did_rc( hessian: Optional[np.ndarray], est_method: str, n: int, + weights: Optional[np.ndarray] = None, ) -> Tuple[float, np.ndarray]: """ Compute a single pairwise DiD (subgroup j vs 4) for RC data. @@ -1031,7 +1195,17 @@ def _compute_did_rc( return self._compute_did_rc_ipw(y, post, PA4, PAa, pscore, covX, hessian, n) elif est_method == "reg": return self._compute_did_rc_reg( - y, post, PA4, PAa, covX, or_ctrl_pre, or_ctrl_post, or_trt_pre, or_trt_post, n + y, + post, + PA4, + PAa, + covX, + or_ctrl_pre, + or_ctrl_post, + or_trt_pre, + or_trt_post, + n, + weights=weights, ) else: return self._compute_did_rc_dr( @@ -1128,8 +1302,16 @@ def _compute_did_rc_reg( or_trt_pre: np.ndarray, or_trt_post: np.ndarray, n: int, + weights: Optional[np.ndarray] = None, ) -> Tuple[float, np.ndarray]: """Regression adjustment DiD for a single pairwise comparison (RC).""" + + # Helper: weighted or unweighted mean + def _wmean(x): + if weights is not None: + return np.average(x, weights=weights) + return np.mean(x) + # Riesz representers riesz_treat_pre = PA4 * (1 - post) riesz_treat_post = PA4 * post @@ -1140,18 +1322,26 @@ def _compute_did_rc_reg( reg_att_treat_post = riesz_treat_post * y reg_att_control = riesz_control * (or_ctrl_post - or_ctrl_pre) - eta_treat_pre = np.mean(reg_att_treat_pre) / np.mean(riesz_treat_pre) - eta_treat_post = np.mean(reg_att_treat_post) / np.mean(riesz_treat_post) - eta_control = np.mean(reg_att_control) / np.mean(riesz_control) + eta_treat_pre = _wmean(reg_att_treat_pre) / _wmean(riesz_treat_pre) + eta_treat_post = _wmean(reg_att_treat_post) / _wmean(riesz_treat_post) + eta_control = _wmean(reg_att_control) / _wmean(riesz_control) att = (eta_treat_post - eta_treat_pre) - eta_control # Influence function # OLS asymptotic linear representation for pre/post + # When survey weights present, include them in both score and bread + # for consistency with the weighted OLS fit weights_ols_pre = PAa * (1 - post) - wols_x_pre = weights_ols_pre[:, None] * covX - wols_eX_pre = (weights_ols_pre * (y - or_ctrl_pre))[:, None] * covX - XpX_pre = wols_x_pre.T @ covX / n + if weights is not None: + w_sum = np.sum(weights) + wols_x_pre = (weights_ols_pre * weights)[:, None] * covX + wols_eX_pre = (weights_ols_pre * weights * (y - or_ctrl_pre))[:, None] * covX + XpX_pre = wols_x_pre.T @ covX / w_sum + else: + wols_x_pre = weights_ols_pre[:, None] * covX + wols_eX_pre = (weights_ols_pre * (y - or_ctrl_pre))[:, None] * covX + XpX_pre = wols_x_pre.T @ covX / n try: XpX_inv_pre = np.linalg.inv(XpX_pre) except np.linalg.LinAlgError: @@ -1159,28 +1349,36 @@ def _compute_did_rc_reg( asy_lin_rep_ols_pre = wols_eX_pre @ XpX_inv_pre weights_ols_post = PAa * post - wols_x_post = weights_ols_post[:, None] * covX - wols_eX_post = (weights_ols_post * (y - or_ctrl_post))[:, None] * covX - XpX_post = wols_x_post.T @ covX / n + if weights is not None: + wols_x_post = (weights_ols_post * weights)[:, None] * covX + wols_eX_post = (weights_ols_post * weights * (y - or_ctrl_post))[:, None] * covX + XpX_post = wols_x_post.T @ covX / w_sum + else: + wols_x_post = weights_ols_post[:, None] * covX + wols_eX_post = (weights_ols_post * (y - or_ctrl_post))[:, None] * covX + XpX_post = wols_x_post.T @ covX / n try: XpX_inv_post = np.linalg.inv(XpX_post) except np.linalg.LinAlgError: XpX_inv_post = np.linalg.pinv(XpX_post) asy_lin_rep_ols_post = wols_eX_post @ XpX_inv_post - inf_treat_pre = (reg_att_treat_pre - riesz_treat_pre * eta_treat_pre) / np.mean( + inf_treat_pre = (reg_att_treat_pre - riesz_treat_pre * eta_treat_pre) / _wmean( riesz_treat_pre ) - inf_treat_post = (reg_att_treat_post - riesz_treat_post * eta_treat_post) / np.mean( + inf_treat_post = (reg_att_treat_post - riesz_treat_post * eta_treat_post) / _wmean( riesz_treat_post ) inf_treat = inf_treat_post - inf_treat_pre inf_control_1 = reg_att_control - riesz_control * eta_control - M1 = np.mean(riesz_control[:, None] * covX, axis=0) + if weights is not None: + M1 = np.average(riesz_control[:, None] * covX, axis=0, weights=weights) + else: + M1 = np.mean(riesz_control[:, None] * covX, axis=0) inf_control_2_post = asy_lin_rep_ols_post @ M1 inf_control_2_pre = asy_lin_rep_ols_pre @ M1 - inf_control = (inf_control_1 + inf_control_2_post - inf_control_2_pre) / np.mean( + inf_control = (inf_control_1 + inf_control_2_post - inf_control_2_pre) / _wmean( riesz_control ) @@ -1508,6 +1706,7 @@ def triple_difference( cluster: Optional[str] = None, alpha: float = 0.05, rank_deficient_action: str = "warn", + survey_design: object = None, ) -> TripleDifferenceResults: """ Estimate Triple Difference (DDD) treatment effect. @@ -1582,4 +1781,5 @@ def triple_difference( partition=partition, time=time, covariates=covariates, + survey_design=survey_design, ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 289acc43..f0c49178 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -494,6 +494,8 @@ See `docs/methodology/continuous-did.md` Section 4 for full details. - [ ] Covariate support (deferred, matching R v0.1.0) - [ ] Discrete treatment saturated regression - [ ] Lowest-dose-as-control (Remark 3.1) +- [x] Survey design support (Phase 3): weighted B-spline OLS, TSL on influence functions; bootstrap+survey deferred +- **Note:** ContinuousDiD bootstrap with survey weights deferred to Phase 5 --- @@ -670,10 +672,13 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`. - [x] Each ATT(g,t) can be estimated independently (parallelizable) - [x] Absorbing treatment validation - [x] Overlap diagnostics for propensity score ratios +- [x] Survey design support (Phase 3): survey-weighted means/covariances in Omega*, TSL on EIF scores; bootstrap+survey deferred - **Note:** Sieve ratio estimation uses polynomial basis functions (total degree up to K) with AIC/BIC model selection. The paper describes sieve estimators generally without specifying a particular basis family; polynomial sieves are a standard choice (Section 4, Eq 4.2). Negative sieve ratio predictions are clipped to a small positive value since the population ratio p_g(X)/p_{g'}(X) is non-negative. - **Note:** Kernel-smoothed conditional covariance Omega*(X) uses Gaussian kernel with Silverman's rule-of-thumb bandwidth by default. The paper specifies kernel smoothing (step 5, Section 4) without mandating a particular kernel or bandwidth selection method. - **Note:** Conditional covariance Omega*(X) scales each term by per-unit sieve-estimated inverse propensities s_hat_{g'}(X) = 1/p_{g'}(X) (algorithm step 4), matching Eq 3.12. The inverse propensity estimation uses the same polynomial sieve convex minimization as the ratio estimator. Estimated s_hat values are clipped to [1, n] with a UserWarning when clipping binds, mirroring the ratio path's overlap diagnostics. - **Note:** Outcome regressions m_hat_{g',t,tpre}(X) use linear OLS working models. The paper's Section 4 describes flexible nonparametric nuisance estimation (sieve regression, kernel smoothing, or ML methods). The DR property ensures consistency if either the OLS outcome model or the sieve propensity ratio is correctly specified, but the linear OLS specification does not generically guarantee attainment of the semiparametric efficiency bound unless the conditional mean is linear in the covariates. +- **Note:** EfficientDiD bootstrap with survey weights deferred to Phase 5 +- **Note:** EfficientDiD covariates (DR path) with survey weights deferred — the doubly robust nuisance estimation does not yet thread survey weights through sieve/kernel steps --- @@ -744,6 +749,7 @@ where weights ŵ_{g,e} = n_{g,e} / Σ_g n_{g,e} (sample share of cohort g at eve - [x] R comparison: ATT matches within machine precision (<1e-11) - [x] R comparison: SE matches within 0.3% (well within 1% threshold) - [x] R comparison: Event study effects match perfectly (correlation 1.0) +- [x] Survey design support (Phase 3): weighted within-transform, survey weights in LinearRegression with TSL vcov; bootstrap+survey deferred --- @@ -1014,6 +1020,8 @@ The paper text states a stricter bound (T_min + 1) but the R code by the co-auth - [x] Overall ATT as average of post-treatment delta_h with delta-method SE - [x] Anticipation parameter support - [x] Never-treated encoding (0 and inf) +- [x] Survey design support (Phase 3): Q-weights compose multiplicatively with survey weights; TSL vcov on composed weights; survey design columns propagated through sub-experiments +- **Note:** Survey weights compose multiplicatively with Q-weights for StackedDiD; only `weight_type="pweight"` (default) is supported — `fweight` and `aweight` are rejected because Q-weight composition changes weight semantics (non-integer for fweight, non-inverse-variance for aweight) --- @@ -1230,6 +1238,8 @@ has no additional effect. - [x] Influence function SE: std(w3·IF_3 + w2·IF_2 - w1·IF_1) / sqrt(n) - [x] Cluster-robust SE via Liang-Zeger variance on influence function - [x] ATT and SE match R within <0.001% for all methods and DGP types +- [x] Survey design support (Phase 3): regression method with weighted OLS + TSL on combined influence functions; IPW/DR deferred +- **Note:** TripleDifference IPW/DR with survey weights deferred until weighted solve_logit() (Phase 5) --- @@ -1565,6 +1575,8 @@ Weights depend on group sizes and variance in treatment timing. - [ ] Weights sum to approximately 1 (numerical precision) - [ ] TWFE coefficient ≈ weighted sum of 2×2 estimates - [ ] Visualization shows weight vs. estimate by comparison type +- [x] Survey design support (Phase 3): weighted cell means, weighted within-transform, weighted group shares +- **Note:** Bacon decomposition with survey weights is diagnostic; exact-sum guarantee is approximate; `weights="exact"` requires within-unit-constant survey columns (approximate path accepts time-varying weights) --- diff --git a/docs/survey-roadmap.md b/docs/survey-roadmap.md index c87c7b1e..e59fd211 100644 --- a/docs/survey-roadmap.md +++ b/docs/survey-roadmap.md @@ -1,8 +1,7 @@ # Survey Data Support Roadmap This document captures planned future work for survey data support in diff-diff. -Phases 1-2 (core infrastructure + base estimator integration) are implemented. -Phases 3-5 are deferred for future PRs. +Phases 1-3 are implemented. Phases 4-5 are deferred for future PRs. ## Implemented (Phases 1-2) @@ -15,19 +14,32 @@ Phases 3-5 are deferred for future PRs. - **Weighted demeaning**: `demean_by_group()` and `within_transform()` with weights - **SurveyMetadata** in results objects with effective n, DEFF, weight_range -## Phase 3: OLS-Based Standalone Estimators +## Implemented (Phase 3): OLS-Based Standalone Estimators -These estimators use `LinearRegression` or `solve_ols()` internally, so weight -support is largely mechanical (threading `survey_design` through `fit()`). +| Estimator | File | Survey Support | Notes | +|-----------|------|----------------|-------| +| StackedDiD | `stacked_did.py` | pweight only | Q-weights compose multiplicatively with survey weights; TSL vcov on composed weights; fweight/aweight rejected (composition changes weight semantics) | +| SunAbraham | `sun_abraham.py` | Full | Survey weights in LinearRegression + weighted within-transform; bootstrap+survey deferred | +| BaconDecomposition | `bacon.py` | Diagnostic | Weighted cell means, weighted within-transform, weighted group shares; no inference (diagnostic only) | +| TripleDifference | `triple_diff.py` | Reg only | Regression method with weighted OLS + TSL on influence functions; IPW/DR deferred (needs weighted `solve_logit()`) | +| ContinuousDiD | `continuous_did.py` | Analytical | Weighted B-spline OLS + TSL on influence functions; bootstrap+survey deferred | +| EfficientDiD | `efficient_did.py` | Analytical | Weighted means/covariances in Omega* + TSL on EIF scores; bootstrap+survey deferred | -| Estimator | File | Complexity | Notes | -|-----------|------|------------|-------| -| StackedDiD | `stacked_did.py` | Low | Q-weights compose multiplicatively with survey weights | -| SunAbraham | `sun_abraham.py` | Low | Single saturated regression via LinearRegression | -| BaconDecomposition | `bacon.py` | Medium | Weighted demeaning, weighted cell means, effective group sizes | -| TripleDifference | `triple_diff.py` | Medium-High | Three methods (reg, ipw, dr); weighted cell means, weighted logistic regression (requires Phase 5) | -| ContinuousDiD | `continuous_did.py` | Low-Medium | Uses LinearRegression internally | -| EfficientDiD | `efficient_did.py` | Low-Medium | Uses LinearRegression internally | +### Phase 3 Deferred Work + +The following capabilities were deferred from Phase 3 because they depend on +Phase 5 infrastructure (weighted `solve_logit()` or bootstrap+survey interaction): + +| Estimator | Deferred Capability | Blocker | +|-----------|-------------------|---------| +| SunAbraham | Pairs bootstrap + survey | Phase 5: bootstrap+survey interaction | +| TripleDifference | IPW and DR methods + survey | Phase 5: weighted `solve_logit()` | +| ContinuousDiD | Multiplier bootstrap + survey | Phase 5: bootstrap+survey interaction | +| EfficientDiD | Multiplier bootstrap + survey | Phase 5: bootstrap+survey interaction | +| EfficientDiD | Covariates (DR path) + survey | DR nuisance estimation needs survey weight threading | + +All blocked combinations raise `NotImplementedError` when attempted, with a +message pointing to the planned phase or describing the limitation. ## Phase 4: Complex Standalone Estimators diff --git a/tests/test_sun_abraham.py b/tests/test_sun_abraham.py index b2a417ab..4dd79381 100644 --- a/tests/test_sun_abraham.py +++ b/tests/test_sun_abraham.py @@ -1291,7 +1291,8 @@ def test_variance_fallback_warning(self): def patched_compute_overall_att(df, first_treat, event_study_effects, cohort_effects, cohort_weights, - vcov_cohort, coef_index_map): + vcov_cohort, coef_index_map, + survey_weight_col=None): # Pass an empty coef_index_map to trigger the fallback return original_method( df, first_treat, event_study_effects, diff --git a/tests/test_survey_phase3.py b/tests/test_survey_phase3.py new file mode 100644 index 00000000..3248fa03 --- /dev/null +++ b/tests/test_survey_phase3.py @@ -0,0 +1,1167 @@ +"""Tests for Phase 3 survey support: OLS-based standalone estimators. + +Covers: StackedDiD, SunAbraham, BaconDecomposition, TripleDifference, +ContinuousDiD, EfficientDiD. +""" + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import SurveyDesign + +# ============================================================================= +# Shared Fixtures +# ============================================================================= + + +@pytest.fixture +def staggered_survey_data(): + """Staggered treatment panel with survey design columns. + + 60 units, 8 periods, 2 treatment cohorts (t=4, t=6), 20 never-treated. + 5 strata, 12 PSUs, FPC, and sampling weights included. + """ + np.random.seed(42) + n_units = 60 + n_periods = 8 + rows = [] + for unit in range(n_units): + if unit < 20: + ft = 4 # Early cohort + elif unit < 40: + ft = 6 # Late cohort + else: + ft = 0 # Never treated + + stratum = unit // 12 # 5 strata + psu = unit // 5 # 12 PSUs + fpc_val = 120.0 # Population per stratum + wt = 1.0 + 0.3 * stratum + + for t in range(1, n_periods + 1): + y = 10.0 + unit * 0.05 + t * 0.2 + if ft > 0 and t >= ft: + y += 2.0 # Treatment effect + y += np.random.normal(0, 0.5) + + rows.append( + { + "unit": unit, + "time": t, + "first_treat": ft, + "outcome": y, + "weight": wt, + "stratum": stratum, + "psu": psu, + "fpc": fpc_val, + } + ) + return pd.DataFrame(rows) + + +@pytest.fixture +def ddd_survey_data(): + """Cross-sectional DDD data with survey columns.""" + np.random.seed(42) + n = 400 + data = pd.DataFrame( + { + "outcome": np.random.randn(n) + 0.5, + "group": np.random.choice([0, 1], n), + "partition": np.random.choice([0, 1], n), + "time": np.random.choice([0, 1], n), + "weight": np.random.uniform(0.5, 2.0, n), + "stratum": np.random.choice([1, 2, 3], n), + } + ) + # Add treatment effect for treated+eligible+post + mask = (data["group"] == 1) & (data["partition"] == 1) & (data["time"] == 1) + data.loc[mask, "outcome"] += 1.5 + return data + + +@pytest.fixture +def continuous_survey_data(): + """Panel data for continuous DiD with survey columns.""" + np.random.seed(42) + n_u, n_t = 80, 4 + units = np.repeat(range(n_u), n_t) + times = np.tile(range(1, n_t + 1), n_u) + ft = np.repeat(np.where(np.arange(n_u) < 40, 3, 0), n_t) + dose_per_unit = np.where(np.arange(n_u) < 40, np.random.uniform(0.5, 2.0, n_u), 0.0) + dose = np.repeat(dose_per_unit, n_t) + y = np.random.randn(len(units)) + 0.5 * dose * (times >= ft) * (ft > 0) + w = np.repeat(np.random.uniform(0.5, 2.0, n_u), n_t) # constant within unit + strata = np.repeat(np.where(np.arange(n_u) < 40, 1, 2), n_t) + + return pd.DataFrame( + { + "unit": units, + "time": times, + "first_treat": ft, + "dose": dose, + "outcome": y, + "weight": w, + "stratum": strata, + } + ) + + +# ============================================================================= +# SunAbraham +# ============================================================================= + + +class TestSunAbrahamSurvey: + """Survey design support for SunAbraham.""" + + def test_smoke_weights_only(self, staggered_survey_data): + """SunAbraham runs with weights-only survey design.""" + from diff_diff import SunAbraham + + sd = SurveyDesign(weights="weight") + est = SunAbraham() + result = est.fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + assert result.survey_metadata is not None + + def test_uniform_weights_match_unweighted(self, staggered_survey_data): + """Uniform survey weights should match unweighted result.""" + from diff_diff import SunAbraham + + staggered_survey_data["uniform_w"] = 1.0 + sd = SurveyDesign(weights="uniform_w") + + r_unw = SunAbraham().fit(staggered_survey_data, "outcome", "unit", "time", "first_treat") + r_w = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert abs(r_unw.overall_att - r_w.overall_att) < 1e-10 + + def test_survey_metadata_fields(self, staggered_survey_data): + """survey_metadata has correct fields with full design.""" + from diff_diff import SunAbraham + + sd = SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc", nest=True) + result = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + sm = result.survey_metadata + assert sm is not None + assert sm.weight_type == "pweight" + assert sm.effective_n > 0 + assert sm.design_effect > 0 + assert sm.n_strata is not None + assert sm.n_psu is not None + + def test_se_differs_with_design(self, staggered_survey_data): + """SEs should differ between weights-only and full design.""" + from diff_diff import SunAbraham + + sd_w = SurveyDesign(weights="weight") + sd_full = SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc", nest=True) + r_w = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd_w, + ) + r_full = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd_full, + ) + # ATTs should be the same (same weights) + assert abs(r_w.overall_att - r_full.overall_att) < 1e-10 + # SEs should differ due to different variance estimators + assert r_w.overall_se != r_full.overall_se + + def test_bootstrap_survey_raises(self, staggered_survey_data): + """Bootstrap + survey should raise NotImplementedError.""" + from diff_diff import SunAbraham + + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="Bootstrap"): + SunAbraham(n_bootstrap=99).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + + def test_summary_includes_survey(self, staggered_survey_data): + """Summary output should include survey design section.""" + from diff_diff import SunAbraham + + sd = SurveyDesign(weights="weight", strata="stratum") + result = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + summary = result.summary() + assert "Survey Design" in summary + assert "pweight" in summary + + def test_no_survey_metadata_is_none(self, staggered_survey_data): + """Without survey, survey_metadata should be None.""" + from diff_diff import SunAbraham + + result = SunAbraham().fit(staggered_survey_data, "outcome", "unit", "time", "first_treat") + assert result.survey_metadata is None + + +# ============================================================================= +# StackedDiD +# ============================================================================= + + +class TestStackedDiDSurvey: + """Survey design support for StackedDiD.""" + + def test_smoke_weights_only(self, staggered_survey_data): + """StackedDiD runs with weights-only survey design.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(weights="weight") + result = StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + assert result.survey_metadata is not None + + def test_survey_metadata_present(self, staggered_survey_data): + """survey_metadata populated with full design.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert result.survey_metadata is not None + assert result.survey_metadata.weight_type == "pweight" + + def test_q_weight_composition(self, staggered_survey_data): + """Survey weights should change results vs unweighted.""" + from diff_diff import StackedDiD + + r_unw = StackedDiD().fit(staggered_survey_data, "outcome", "unit", "time", "first_treat") + sd = SurveyDesign(weights="weight") + r_w = StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + # ATT should differ (non-uniform weights) + assert r_unw.overall_att != r_w.overall_att + + def test_convenience_function(self, staggered_survey_data): + """stacked_did() convenience function threads survey_design.""" + from diff_diff.stacked_did import stacked_did + + sd = SurveyDesign(weights="weight") + result = stacked_did( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert result.survey_metadata is not None + + def test_summary_includes_survey(self, staggered_survey_data): + """Summary includes survey design section.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert "Survey Design" in result.summary() + + def test_fweight_rejected(self, staggered_survey_data): + """StackedDiD should reject fweight since Q-weight composition breaks it.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(weights="weight", weight_type="fweight") + with pytest.raises(ValueError, match="fweight"): + StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + + def test_aweight_rejected(self, staggered_survey_data): + """StackedDiD should reject aweight since Q-weight composition breaks it.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(weights="weight", weight_type="aweight") + with pytest.raises(ValueError, match="aweight"): + StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + + +# ============================================================================= +# BaconDecomposition +# ============================================================================= + + +class TestBaconDecompositionSurvey: + """Survey design support for BaconDecomposition.""" + + def test_smoke_weights_only(self, staggered_survey_data): + """BaconDecomposition runs with weights-only survey design.""" + from diff_diff import BaconDecomposition + + sd = SurveyDesign(weights="weight") + result = BaconDecomposition().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(result.twfe_estimate) + assert len(result.comparisons) > 0 + assert result.survey_metadata is not None + + def test_weighted_changes_twfe(self, staggered_survey_data): + """Survey weights should change TWFE estimate.""" + from diff_diff import BaconDecomposition + + r_unw = BaconDecomposition().fit( + staggered_survey_data, "outcome", "unit", "time", "first_treat" + ) + sd = SurveyDesign(weights="weight") + r_w = BaconDecomposition().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert r_unw.twfe_estimate != r_w.twfe_estimate + + def test_summary_includes_survey(self, staggered_survey_data): + """Summary includes survey design section.""" + from diff_diff import BaconDecomposition + + sd = SurveyDesign(weights="weight") + result = BaconDecomposition().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert "Survey Design" in result.summary() + + def test_exact_weights_survey_weighted(self, staggered_survey_data): + """BaconDecomposition exact weights should use survey-weighted shares.""" + from diff_diff import BaconDecomposition + + sd = SurveyDesign(weights="weight") + r = BaconDecomposition(weights="exact").fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(r.twfe_estimate) + # Exact weights should still produce valid comparisons + assert len(r.comparisons) > 0 + for comp in r.comparisons: + assert np.isfinite(comp.weight) + # With non-uniform weights, exact weights should differ from + # approximate weights (approximate uses n_k*(1-n_k)*Var(D)) + r_approx = BaconDecomposition(weights="approximate").fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + # At least one comparison weight should differ + exact_weights = {(c.treated_group, c.control_group): c.weight for c in r.comparisons} + approx_weights = { + (c.treated_group, c.control_group): c.weight for c in r_approx.comparisons + } + common_keys = set(exact_weights) & set(approx_weights) + assert len(common_keys) > 0 + diffs = [abs(exact_weights[k] - approx_weights[k]) for k in common_keys] + assert max(diffs) > 1e-10, "Exact and approximate weights should differ" + + def test_convenience_function(self, staggered_survey_data): + """bacon_decompose() convenience function threads survey_design.""" + from diff_diff.bacon import bacon_decompose + + sd = SurveyDesign(weights="weight") + result = bacon_decompose( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert result.survey_metadata is not None + + +# ============================================================================= +# TripleDifference +# ============================================================================= + + +class TestTripleDifferenceSurvey: + """Survey design support for TripleDifference (reg method only).""" + + def test_smoke_reg_method(self, ddd_survey_data): + """TripleDifference reg method runs with survey design.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight", strata="stratum") + result = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + assert np.isfinite(result.att) + assert np.isfinite(result.se) + assert result.survey_metadata is not None + + def test_ipw_survey_raises(self, ddd_survey_data): + """IPW + survey should raise NotImplementedError.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="IPW"): + TripleDifference(estimation_method="ipw").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + + def test_dr_survey_raises(self, ddd_survey_data): + """DR + survey should raise NotImplementedError.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="doubly robust"): + TripleDifference(estimation_method="dr").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + + def test_weighted_changes_att(self, ddd_survey_data): + """Survey weights should change ATT.""" + from diff_diff import TripleDifference + + r_unw = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, "outcome", "group", "partition", "time" + ) + sd = SurveyDesign(weights="weight") + r_w = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + assert r_unw.att != r_w.att + + def test_survey_metadata_in_to_dict(self, ddd_survey_data): + """to_dict() includes survey metadata fields.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight", strata="stratum") + result = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + d = result.to_dict() + assert "weight_type" in d + assert "effective_n" in d + assert "design_effect" in d + + def test_summary_includes_survey(self, ddd_survey_data): + """Summary includes survey design section.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight", strata="stratum") + result = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + assert "Survey Design" in result.summary() + + def test_reg_with_covariates_survey(self, ddd_survey_data): + """TripleDifference reg method works with covariates + survey.""" + from diff_diff import TripleDifference + + # Add a covariate that affects the outcome + ddd_survey_data["x1"] = np.random.randn(len(ddd_survey_data)) * 0.5 + ddd_survey_data["outcome"] += 0.3 * ddd_survey_data["x1"] + sd = SurveyDesign(weights="weight", strata="stratum") + result = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + covariates=["x1"], + survey_design=sd, + ) + assert np.isfinite(result.att) + assert np.isfinite(result.se) + assert result.survey_metadata is not None + # Survey df should be used for inference + assert result.survey_metadata.df_survey is not None + + def test_convenience_function(self, ddd_survey_data): + """triple_difference() convenience function threads survey_design.""" + from diff_diff.triple_diff import triple_difference + + sd = SurveyDesign(weights="weight") + result = triple_difference( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + estimation_method="reg", + survey_design=sd, + ) + assert result.survey_metadata is not None + + +# ============================================================================= +# ContinuousDiD +# ============================================================================= + + +class TestContinuousDiDSurvey: + """Survey design support for ContinuousDiD.""" + + def test_smoke_weights_only(self, continuous_survey_data): + """ContinuousDiD runs with survey design (analytical SEs).""" + from diff_diff import ContinuousDiD + + sd = SurveyDesign(weights="weight") + result = ContinuousDiD(n_bootstrap=0).fit( + continuous_survey_data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + assert result.survey_metadata is not None + + def test_bootstrap_survey_raises(self, continuous_survey_data): + """Bootstrap + survey should raise NotImplementedError.""" + from diff_diff import ContinuousDiD + + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="bootstrap"): + ContinuousDiD(n_bootstrap=99).fit( + continuous_survey_data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + survey_design=sd, + ) + + def test_summary_includes_survey(self, continuous_survey_data): + """Summary includes survey design section.""" + from diff_diff import ContinuousDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = ContinuousDiD(n_bootstrap=0).fit( + continuous_survey_data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + survey_design=sd, + ) + assert "Survey Design" in result.summary() + + +# ============================================================================= +# EfficientDiD +# ============================================================================= + + +class TestEfficientDiDSurvey: + """Survey design support for EfficientDiD.""" + + def test_smoke_weights_only(self, staggered_survey_data): + """EfficientDiD runs with weights-only survey design.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + assert result.survey_metadata is not None + + def test_bootstrap_survey_raises(self, staggered_survey_data): + """Bootstrap + survey should raise NotImplementedError.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="bootstrap"): + EfficientDiD(n_bootstrap=99).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + + def test_covariates_survey_raises(self, staggered_survey_data): + """Covariates + survey should raise NotImplementedError.""" + from diff_diff import EfficientDiD + + # Add a covariate column + staggered_survey_data["x1"] = np.random.randn(len(staggered_survey_data)) + sd = SurveyDesign(weights="weight") + with pytest.raises(NotImplementedError, match="covariates"): + EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + covariates=["x1"], + survey_design=sd, + ) + + def test_survey_metadata_fields(self, staggered_survey_data): + """survey_metadata has correct fields.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + sm = result.survey_metadata + assert sm is not None + assert sm.weight_type == "pweight" + assert sm.effective_n > 0 + + def test_summary_includes_survey(self, staggered_survey_data): + """Summary includes survey design section.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert "Survey Design" in result.summary() + + def test_no_survey_metadata_is_none(self, staggered_survey_data): + """Without survey, survey_metadata should be None.""" + from diff_diff import EfficientDiD + + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, "outcome", "unit", "time", "first_treat" + ) + assert result.survey_metadata is None + + def test_survey_event_study_aggregation(self, staggered_survey_data): + """EfficientDiD survey with aggregate='event_study' produces finite results.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + aggregate="event_study", + survey_design=sd, + ) + assert result.event_study_effects is not None + for e, eff in result.event_study_effects.items(): + assert np.isfinite(eff["effect"]) + assert np.isfinite(eff["se"]) + assert eff["se"] > 0 + + def test_survey_group_aggregation(self, staggered_survey_data): + """EfficientDiD survey with aggregate='group' produces finite results.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + aggregate="group", + survey_design=sd, + ) + assert result.group_effects is not None + for g, eff in result.group_effects.items(): + assert np.isfinite(eff["effect"]) + assert np.isfinite(eff["se"]) + + def test_survey_all_aggregation(self, staggered_survey_data): + """EfficientDiD survey with aggregate='all' produces finite results.""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + result = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + aggregate="all", + survey_design=sd, + ) + assert result.event_study_effects is not None + assert result.group_effects is not None + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + + +# ============================================================================= +# Scale Invariance (applies to all estimators) +# ============================================================================= + + +class TestScaleInvariance: + """Multiplying all survey weights by a constant should not change ATT or SE.""" + + def test_sun_abraham_scale_invariance(self, staggered_survey_data): + from diff_diff import SunAbraham + + sd1 = SurveyDesign(weights="weight") + r1 = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd1, + ) + + staggered_survey_data["weight_x10"] = staggered_survey_data["weight"] * 10.0 + sd2 = SurveyDesign(weights="weight_x10") + r2 = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd2, + ) + + assert abs(r1.overall_att - r2.overall_att) < 1e-10 + assert abs(r1.overall_se - r2.overall_se) < 1e-8 + + def test_efficient_did_scale_invariance(self, staggered_survey_data): + from diff_diff import EfficientDiD + + sd1 = SurveyDesign(weights="weight") + r1 = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd1, + ) + + staggered_survey_data["weight_x10"] = staggered_survey_data["weight"] * 10.0 + sd2 = SurveyDesign(weights="weight_x10") + r2 = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd2, + ) + + assert abs(r1.overall_att - r2.overall_att) < 1e-10 + assert abs(r1.overall_se - r2.overall_se) < 1e-8 + + def test_triple_diff_scale_invariance(self, ddd_survey_data): + from diff_diff import TripleDifference + + sd1 = SurveyDesign(weights="weight") + r1 = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd1, + ) + ddd_survey_data["weight_x10"] = ddd_survey_data["weight"] * 10.0 + sd2 = SurveyDesign(weights="weight_x10") + r2 = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd2, + ) + assert abs(r1.att - r2.att) < 1e-10 + assert abs(r1.se - r2.se) < 1e-8 + + def test_sun_abraham_sub_unit_weight_scale_invariance(self, staggered_survey_data): + """SunAbraham overall ATT should be invariant to sub-1 weight rescaling.""" + from diff_diff import SunAbraham + + sd1 = SurveyDesign(weights="weight") + r1 = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd1, + ) + + # Scale weights to be < 1 (e.g., 0.01x) + staggered_survey_data["weight_tiny"] = staggered_survey_data["weight"] * 0.01 + sd2 = SurveyDesign(weights="weight_tiny") + r2 = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd2, + ) + + assert abs(r1.overall_att - r2.overall_att) < 1e-10 + assert abs(r1.overall_se - r2.overall_se) < 1e-8 + + +# ============================================================================= +# Regression Tests (PR #226 review feedback) +# ============================================================================= + + +class TestReviewRegressions: + """Targeted tests for issues found in PR #226 review.""" + + def test_stacked_did_no_weight_survey(self, staggered_survey_data): + """StackedDiD should handle SurveyDesign without weights column.""" + from diff_diff import StackedDiD + + sd = SurveyDesign(strata="stratum") # No weights + result = StackedDiD().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + assert result.survey_metadata is not None + + def test_triple_diff_survey_df(self, ddd_survey_data): + """TripleDifference should use survey df for p-values when survey active.""" + from diff_diff import TripleDifference + + sd = SurveyDesign(weights="weight", strata="stratum") + r_survey = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, + "outcome", + "group", + "partition", + "time", + survey_design=sd, + ) + r_nosurv = TripleDifference(estimation_method="reg").fit( + ddd_survey_data, "outcome", "group", "partition", "time" + ) + # P-values should differ (different df) + if np.isfinite(r_survey.p_value) and np.isfinite(r_nosurv.p_value): + assert r_survey.p_value != r_nosurv.p_value + + def test_efficient_did_weights_only_se(self, staggered_survey_data): + """EfficientDiD weights-only survey SE should be reasonable (not tiny).""" + from diff_diff import EfficientDiD + + sd = SurveyDesign(weights="weight") + r = EfficientDiD(n_bootstrap=0).fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + assert np.isfinite(r.overall_se) + assert r.overall_se > 0 + assert r.overall_se > 0.01 # Not artificially tiny + + def test_continuous_did_eventstudy_survey(self, continuous_survey_data): + """ContinuousDiD aggregate=eventstudy should work with survey design.""" + from diff_diff import ContinuousDiD + + sd = SurveyDesign(weights="weight", strata="stratum") + result = ContinuousDiD(n_bootstrap=0).fit( + continuous_survey_data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + aggregate="eventstudy", + survey_design=sd, + ) + assert result.event_study_effects is not None + assert result.survey_metadata is not None + for e, eff in result.event_study_effects.items(): + if np.isfinite(eff["effect"]): + assert np.isfinite(eff["se"]), f"Non-finite SE for e={e}" + + def test_within_unit_varying_weights_rejected(self): + """Time-varying survey weights within units should be rejected.""" + from diff_diff import ContinuousDiD + + np.random.seed(42) + n_u, n_t = 20, 4 + data = pd.DataFrame( + { + "unit": np.repeat(range(n_u), n_t), + "time": np.tile(range(1, n_t + 1), n_u), + "first_treat": np.repeat(np.where(np.arange(n_u) < 10, 3, 0), n_t), + "dose": np.repeat(np.where(np.arange(n_u) < 10, 1.0, 0.0), n_t), + "outcome": np.random.randn(n_u * n_t), + "w": np.random.uniform(0.5, 2.0, n_u * n_t), + } + ) + sd = SurveyDesign(weights="w") + with pytest.raises(ValueError, match="varies within units"): + ContinuousDiD(n_bootstrap=0).fit( + data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + survey_design=sd, + ) + + def test_within_unit_varying_strata_rejected(self): + """Time-varying strata within units should be rejected.""" + from diff_diff import EfficientDiD + + np.random.seed(42) + n_u, n_t = 20, 4 + data = pd.DataFrame( + { + "unit": np.repeat(range(n_u), n_t), + "time": np.tile(range(1, n_t + 1), n_u), + "first_treat": np.repeat(np.where(np.arange(n_u) < 10, 3, 0), n_t), + "outcome": np.random.randn(n_u * n_t), + "w": np.repeat(np.ones(n_u), n_t), + "strat": np.tile([1, 2, 1, 2], n_u), + } + ) + sd = SurveyDesign(weights="w", strata="strat") + with pytest.raises(ValueError, match="varies within units"): + EfficientDiD(n_bootstrap=0).fit( + data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + + def test_bacon_exact_varying_weights_rejected(self): + """BaconDecomposition exact weights should reject time-varying survey weights.""" + from diff_diff import BaconDecomposition + + np.random.seed(42) + n_u, n_t = 20, 4 + data = pd.DataFrame( + { + "unit": np.repeat(range(n_u), n_t), + "time": np.tile(range(1, n_t + 1), n_u), + "first_treat": np.repeat(np.where(np.arange(n_u) < 10, 3, 0), n_t), + "outcome": np.random.randn(n_u * n_t), + # Time-varying weights (different per period) + "w": np.random.uniform(0.5, 2.0, n_u * n_t), + } + ) + sd = SurveyDesign(weights="w") + with pytest.raises(ValueError, match="varies within units"): + BaconDecomposition(weights="exact").fit( + data, "outcome", "unit", "time", "first_treat", survey_design=sd + ) + + def test_sun_abraham_survey_df_regression(self, staggered_survey_data): + """SunAbraham survey inference should use survey df, not normal approx.""" + from diff_diff import SunAbraham + + sd = SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc", nest=True) + result = SunAbraham().fit( + staggered_survey_data, + "outcome", + "unit", + "time", + "first_treat", + survey_design=sd, + ) + sm = result.survey_metadata + assert sm is not None + assert sm.df_survey is not None + # Overall p-value should use t-distribution (survey df), not normal + # Recompute with normal approx and verify they differ + from diff_diff.utils import safe_inference + + _, p_normal, _ = safe_inference(result.overall_att, result.overall_se, alpha=0.05, df=None) + _, p_survey, _ = safe_inference( + result.overall_att, result.overall_se, alpha=0.05, df=sm.df_survey + ) + # With small survey df, t-dist p-value > normal p-value + if np.isfinite(result.overall_p_value) and np.isfinite(p_normal): + assert result.overall_p_value == pytest.approx(p_survey, rel=1e-10) + + def test_continuous_did_dose_response_survey_pvalue(self, continuous_survey_data): + """DoseResponseCurve.to_dataframe() p-values should use survey df.""" + from diff_diff import ContinuousDiD + from diff_diff.utils import safe_inference + + sd = SurveyDesign(weights="weight", strata="stratum") + result = ContinuousDiD(n_bootstrap=0).fit( + continuous_survey_data, + "outcome", + "unit", + "time", + "first_treat", + "dose", + survey_design=sd, + ) + sm = result.survey_metadata + assert sm is not None + assert sm.df_survey is not None + # Check that dose-response curve carries survey df + assert result.dose_response_att.df_survey == sm.df_survey + # Check exported p-values use survey df, not normal approx + att_df = result.dose_response_att.to_dataframe() + for i in range(min(3, len(att_df))): + row = att_df.iloc[i] + if np.isfinite(row["effect"]) and np.isfinite(row["se"]) and row["se"] > 0: + _, expected_p, _ = safe_inference(row["effect"], row["se"], df=sm.df_survey) + assert row["p_value"] == pytest.approx(expected_p, rel=1e-10)