feat: add conditional_pt parameter to survey DGP for conditional PT simulation

igerber · claude · igerber · commit fcc9717d1f79 · 2026-04-12T13:13:05.000-04:00
Adds a `conditional_pt` parameter to `generate_survey_did_data()` that creates
X-dependent time trends violating unconditional parallel trends while preserving
conditional PT. When nonzero, treated units' x1 is drawn from N(1,1) instead of
N(0,1), and the outcome includes `conditional_pt * x1 * (t / n_periods)`. This
unblocks simulation scenario 4 for the survey variance paper: DR/IPW estimators
with covariates recover truth while no-covariate estimators are biased.

Also adds `paper/` to .gitignore for local manuscript files and marks the
conditional PT DGP gap as resolved in the survey roadmap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -89,6 +89,7 @@ trop_avg_ref/
 
 # Academic papers (local only, not for distribution)
 papers/
+paper/
 
 # Local analysis notebooks (not committed)
 analysis/
diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py
@@ -1189,6 +1189,7 @@ def generate_survey_did_data(
     return_true_population_att: bool = False,
     covariate_effects: Optional[tuple] = None,
     te_covariate_interaction: float = 0.0,
+    conditional_pt: float = 0.0,
 ) -> pd.DataFrame:
     """
     Generate synthetic staggered DiD data with survey structure.
@@ -1301,6 +1302,19 @@ def generate_survey_did_data(
         ``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
         unit-level treatment effect heterogeneity driven by the continuous
         covariate. Requires ``add_covariates=True``.
+    conditional_pt : float, default=0.0
+        Coefficient for X-dependent time trend:
+        ``y += conditional_pt * x1_i * (t / n_periods)``. When nonzero,
+        ever-treated units' x1 is drawn from N(1, 1) instead of N(0, 1),
+        creating differential pre-trends correlated with covariates.
+        Conditional on x1, trends remain parallel (conditional PT holds).
+        DR/IPW estimators with covariates recover truth; no-covariate
+        estimators are biased. Uses normalized time (t/n_periods) for
+        scale independence. Requires ``add_covariates=True``.
+
+        .. note:: When used with ``icc``, the ICC calibration is approximate
+           because the x1 mean shift creates a mixture distribution with
+           slightly higher marginal variance than the assumed Var(x1) = 1.
 
     Returns
     -------
@@ -1435,6 +1449,13 @@ def generate_survey_did_data(
             "te_covariate_interaction requires add_covariates=True"
         )
 
+    if not np.isfinite(conditional_pt):
+        raise ValueError(
+            f"conditional_pt must be finite, got {conditional_pt}"
+        )
+    if conditional_pt != 0.0 and not add_covariates:
+        raise ValueError("conditional_pt requires add_covariates=True")
+
     # --- ICC -> psu_re_sd resolution ---
     if icc is not None:
         # Covariate variance: Var(beta1*x1) + Var(beta2*x2)
@@ -1533,8 +1554,12 @@ def generate_survey_did_data(
         )
         if add_covariates:
             _panel_x1 = rng.normal(0, 1, size=n_units)
+            if conditional_pt != 0.0:
+                _panel_x1[unit_cohort > 0] += 1.0
             _panel_x2 = rng.choice([0, 1], size=n_units)
             y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
+            if conditional_pt != 0.0:
+                y0_period1 = y0_period1 + conditional_pt * _panel_x1 * (1 / n_periods)
         _rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)
 
     # Save base weights for cross-section informative sampling (reset each period)
@@ -1572,6 +1597,8 @@ def generate_survey_did_data(
             # Draw covariates early so they can be included in Y(0) ranking
             if add_covariates:
                 x1 = rng.normal(0, 1, size=n_units)
+                if conditional_pt != 0.0:
+                    x1[unit_cohort > 0] += 1.0
                 x2 = rng.choice([0, 1], size=n_units)
             unit_weight = _base_weight.copy()  # type: ignore[possibly-undefined]
             y0_t = (
@@ -1582,6 +1609,8 @@ def generate_survey_did_data(
             )
             if add_covariates:
                 y0_t = y0_t + _beta1 * x1 + _beta2 * x2
+                if conditional_pt != 0.0:
+                    y0_t = y0_t + conditional_pt * x1 * (t / n_periods)
             _rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)
 
         # Covariates — may already be drawn by informative sampling above
@@ -1592,6 +1621,8 @@ def generate_survey_did_data(
             pass  # x1, x2 already drawn in cross-section ranking block
         elif add_covariates:
             x1 = rng.normal(0, 1, size=n_units)
+            if conditional_pt != 0.0:
+                x1[unit_cohort > 0] += 1.0
             x2 = rng.choice([0, 1], size=n_units)
         else:
             x1 = None
@@ -1610,6 +1641,8 @@ def generate_survey_did_data(
 
             if add_covariates:
                 y += _beta1 * x1[i] + _beta2 * x2[i]
+                if conditional_pt != 0.0:
+                    y += conditional_pt * x1[i] * (t / n_periods)
 
             treated = int(g_i > 0 and t >= g_i)
             true_eff = 0.0
@@ -1713,6 +1746,7 @@ def generate_survey_did_data(
             "deff_kish": float(deff_kish),
             "base_stratum_effects": stratum_effects,
             "icc_realized": icc_realized,
+            "conditional_pt_active": conditional_pt != 0.0,
         }
 
     return df
diff --git a/docs/survey-roadmap.md b/docs/survey-roadmap.md
@@ -164,10 +164,10 @@ Enhanced `generate_survey_did_data()` with 8 research-grade parameters:
 `return_true_population_att`. All backward-compatible. Supports panel
 and repeated cross-section modes.
 
-**Remaining gap for 10e:** Conditional parallel trends — the DGP has
-unconditional PT by construction. A `conditional_pt` parameter is needed
-before the simulation study so that unconditional PT fails but conditional
-PT holds after covariate adjustment (DR/IPW recovers truth).
+**Resolved:** `conditional_pt` parameter added. When nonzero, shifts treated
+units' x1 mean by +1 SD and adds `conditional_pt * x1_i * (t / n_periods)` to the
+outcome, creating X-dependent time trends. Unconditional PT fails; conditional
+PT holds after covariate adjustment. DR/IPW estimators recover truth.
 
 ### 10c. Expand R Validation Coverage (HIGH priority) ✅
 
diff --git a/tests/test_prep.py b/tests/test_prep.py
@@ -1966,6 +1966,152 @@ def test_te_covariate_interaction_validation(self):
         with pytest.raises(ValueError, match="te_covariate_interaction must be finite"):
             generate_survey_did_data(add_covariates=True, te_covariate_interaction=np.nan, seed=42)
 
+    # --- conditional_pt parameter tests ---
+
+    def test_conditional_pt_requires_covariates(self):
+        """conditional_pt requires add_covariates=True."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        with pytest.raises(ValueError, match="conditional_pt requires add_covariates"):
+            generate_survey_did_data(conditional_pt=0.3, add_covariates=False, seed=42)
+
+    def test_conditional_pt_nonfinite_rejected(self):
+        """conditional_pt must be finite."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        with pytest.raises(ValueError, match="conditional_pt must be finite"):
+            generate_survey_did_data(
+                add_covariates=True, conditional_pt=np.inf, seed=42
+            )
+        with pytest.raises(ValueError, match="conditional_pt must be finite"):
+            generate_survey_did_data(
+                add_covariates=True, conditional_pt=np.nan, seed=42
+            )
+
+    def test_conditional_pt_x1_distribution_shift(self):
+        """Treated units should have higher x1 when conditional_pt is active."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        df = generate_survey_did_data(
+            n_units=1000,
+            n_periods=4,
+            add_covariates=True,
+            conditional_pt=0.3,
+            seed=42,
+        )
+        p1 = df[df["period"] == 1]
+        x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].values
+        x1_control = p1.loc[p1["first_treat"] == 0, "x1"].values
+        shift = x1_treated.mean() - x1_control.mean()
+        # Expect ~1.0 SD shift; require at least 0.5
+        assert shift > 0.5, f"x1 mean shift too small: {shift:.3f}"
+
+    def test_conditional_pt_unconditional_pt_fails(self):
+        """With conditional_pt active, unconditional pre-trends should differ."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        df = generate_survey_did_data(
+            n_units=2000,
+            n_periods=8,
+            add_covariates=True,
+            conditional_pt=0.5,
+            never_treated_frac=0.5,
+            seed=42,
+        )
+        # Compute mean outcome change (period 2 - period 1) for each group
+        # before any treatment (use periods 1 and 2, treatment starts at 3+)
+        p1 = df[df["period"] == 1].set_index("unit")
+        p2 = df[df["period"] == 2].set_index("unit")
+        common = p1.index.intersection(p2.index)
+        dy = p2.loc[common, "outcome"] - p1.loc[common, "outcome"]
+        is_treated = p1.loc[common, "first_treat"] > 0
+
+        trend_treated = dy[is_treated].mean()
+        trend_control = dy[~is_treated].mean()
+        gap = abs(trend_treated - trend_control)
+        # With conditional_pt=0.5 and 1 SD shift, expect a detectable gap
+        assert gap > 0.01, f"Unconditional PT gap too small: {gap:.4f}"
+
+    def test_conditional_pt_conditional_pt_holds(self):
+        """Controlling for x1, treated/control pre-trends should be equal.
+
+        Use low PSU noise so the conditional_pt signal dominates.
+        """
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        df = generate_survey_did_data(
+            n_units=2000,
+            n_periods=8,
+            add_covariates=True,
+            conditional_pt=2.0,
+            never_treated_frac=0.5,
+            psu_re_sd=0.1,
+            psu_period_factor=0.1,
+            noise_sd=0.2,
+            seed=42,
+        )
+        p1 = df[df["period"] == 1].set_index("unit")
+        p2 = df[df["period"] == 2].set_index("unit")
+        common = p1.index.intersection(p2.index)
+        dy = p2.loc[common, "outcome"].values - p1.loc[common, "outcome"].values
+        x1_vals = p1.loc[common, "x1"].values
+        is_treated = (p1.loc[common, "first_treat"] > 0).values.astype(float)
+
+        # Unconditional regression: dy ~ treated (should show large gap)
+        n = len(dy)
+        X_uncond = np.column_stack([np.ones(n), is_treated])
+        beta_uncond = np.linalg.lstsq(X_uncond, dy, rcond=None)[0]
+        uncond_gap = abs(beta_uncond[1])
+
+        # Conditional regression: dy ~ treated + x1 (gap should shrink)
+        X_cond = np.column_stack([np.ones(n), is_treated, x1_vals])
+        beta_cond = np.linalg.lstsq(X_cond, dy, rcond=None)[0]
+        cond_gap = abs(beta_cond[1])
+
+        # With low noise and strong signal, controlling for x1 should
+        # substantially reduce the treated coefficient
+        assert uncond_gap > 0.05, f"Unconditional gap too small: {uncond_gap:.4f}"
+        assert cond_gap < uncond_gap * 0.5, (
+            f"Conditional gap ({cond_gap:.4f}) should be much smaller than "
+            f"unconditional ({uncond_gap:.4f})"
+        )
+
+    def test_conditional_pt_backward_compatible(self):
+        """conditional_pt=0.0 should produce identical output to default."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        df_default = generate_survey_did_data(
+            n_units=100, add_covariates=True, seed=99
+        )
+        df_explicit = generate_survey_did_data(
+            n_units=100, add_covariates=True, conditional_pt=0.0, seed=99
+        )
+        pd.testing.assert_frame_equal(df_default, df_explicit)
+
+    def test_conditional_pt_panel_and_crosssection(self):
+        """conditional_pt should work in both panel and cross-section modes."""
+        from diff_diff.prep_dgp import generate_survey_did_data
+
+        for panel_mode in [True, False]:
+            df = generate_survey_did_data(
+                n_units=500,
+                n_periods=4,
+                add_covariates=True,
+                conditional_pt=0.3,
+                panel=panel_mode,
+                seed=42,
+            )
+            # Basic sanity: data is produced
+            assert len(df) == 500 * 4
+            assert "x1" in df.columns
+            # Check x1 shift exists in period 1
+            p1 = df[df["period"] == 1]
+            x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].mean()
+            x1_control = p1.loc[p1["first_treat"] == 0, "x1"].mean()
+            assert x1_treated > x1_control, (
+                f"panel={panel_mode}: treated x1 not shifted"
+            )
+
 
 class TestAggregateSurvey:
     """Tests for aggregate_survey function."""