Skip to content

Commit fcc9717

Browse files
igerber and claude committed
feat: add conditional_pt parameter to survey DGP for conditional PT simulation
Adds a `conditional_pt` parameter to `generate_survey_did_data()` that creates X-dependent time trends violating unconditional parallel trends while preserving conditional PT. When nonzero, treated units' x1 is drawn from N(1,1) instead of N(0,1), and the outcome includes `conditional_pt * x1 * (t/T)`.

This unblocks simulation scenario 4 for the survey variance paper: DR/IPW with covariates recovers truth while no-covariate estimators are biased.

Also adds `paper/` to .gitignore for local manuscript files and marks the conditional PT DGP gap as resolved in the survey roadmap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 81d9960 commit fcc9717

4 files changed

Lines changed: 185 additions & 4 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ trop_avg_ref/
8989

9090
# Academic papers (local only, not for distribution)
9191
papers/
92+
paper/
9293

9394
# Local analysis notebooks (not committed)
9495
analysis/

diff_diff/prep_dgp.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,7 @@ def generate_survey_did_data(
11891189
return_true_population_att: bool = False,
11901190
covariate_effects: Optional[tuple] = None,
11911191
te_covariate_interaction: float = 0.0,
1192+
conditional_pt: float = 0.0,
11921193
) -> pd.DataFrame:
11931194
"""
11941195
Generate synthetic staggered DiD data with survey structure.
@@ -1301,6 +1302,19 @@ def generate_survey_did_data(
13011302
``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
13021303
unit-level treatment effect heterogeneity driven by the continuous
13031304
covariate. Requires ``add_covariates=True``.
1305+
conditional_pt : float, default=0.0
1306+
Coefficient for X-dependent time trend:
1307+
``y += conditional_pt * x1_i * (t / n_periods)``. When nonzero,
1308+
treated units' x1 is drawn from N(1, 1) instead of N(0, 1),
1309+
creating differential pre-trends correlated with covariates.
1310+
Conditional on x1, trends remain parallel (conditional PT holds).
1311+
DR/IPW estimators with covariates recover truth; no-covariate
1312+
estimators are biased. Uses normalized time (t/n_periods) for
1313+
scale independence. Requires ``add_covariates=True``.
1314+
1315+
.. note:: When used with ``icc``, the ICC calibration is approximate
1316+
because the x1 mean shift creates a mixture distribution with
1317+
slightly higher marginal variance than the assumed Var(x1) = 1.
13041318
13051319
Returns
13061320
-------
@@ -1435,6 +1449,13 @@ def generate_survey_did_data(
14351449
"te_covariate_interaction requires add_covariates=True"
14361450
)
14371451

1452+
if not np.isfinite(conditional_pt):
1453+
raise ValueError(
1454+
f"conditional_pt must be finite, got {conditional_pt}"
1455+
)
1456+
if conditional_pt != 0.0 and not add_covariates:
1457+
raise ValueError("conditional_pt requires add_covariates=True")
1458+
14381459
# --- ICC -> psu_re_sd resolution ---
14391460
if icc is not None:
14401461
# Covariate variance: Var(beta1*x1) + Var(beta2*x2)
@@ -1533,8 +1554,12 @@ def generate_survey_did_data(
15331554
)
15341555
if add_covariates:
15351556
_panel_x1 = rng.normal(0, 1, size=n_units)
1557+
if conditional_pt != 0.0:
1558+
_panel_x1[unit_cohort > 0] += 1.0
15361559
_panel_x2 = rng.choice([0, 1], size=n_units)
15371560
y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
1561+
if conditional_pt != 0.0:
1562+
y0_period1 = y0_period1 + conditional_pt * _panel_x1 * (1 / n_periods)
15381563
_rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)
15391564

15401565
# Save base weights for cross-section informative sampling (reset each period)
@@ -1572,6 +1597,8 @@ def generate_survey_did_data(
15721597
# Draw covariates early so they can be included in Y(0) ranking
15731598
if add_covariates:
15741599
x1 = rng.normal(0, 1, size=n_units)
1600+
if conditional_pt != 0.0:
1601+
x1[unit_cohort > 0] += 1.0
15751602
x2 = rng.choice([0, 1], size=n_units)
15761603
unit_weight = _base_weight.copy() # type: ignore[possibly-undefined]
15771604
y0_t = (
@@ -1582,6 +1609,8 @@ def generate_survey_did_data(
15821609
)
15831610
if add_covariates:
15841611
y0_t = y0_t + _beta1 * x1 + _beta2 * x2
1612+
if conditional_pt != 0.0:
1613+
y0_t = y0_t + conditional_pt * x1 * (t / n_periods)
15851614
_rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)
15861615

15871616
# Covariates — may already be drawn by informative sampling above
@@ -1592,6 +1621,8 @@ def generate_survey_did_data(
15921621
pass # x1, x2 already drawn in cross-section ranking block
15931622
elif add_covariates:
15941623
x1 = rng.normal(0, 1, size=n_units)
1624+
if conditional_pt != 0.0:
1625+
x1[unit_cohort > 0] += 1.0
15951626
x2 = rng.choice([0, 1], size=n_units)
15961627
else:
15971628
x1 = None
@@ -1610,6 +1641,8 @@ def generate_survey_did_data(
16101641

16111642
if add_covariates:
16121643
y += _beta1 * x1[i] + _beta2 * x2[i]
1644+
if conditional_pt != 0.0:
1645+
y += conditional_pt * x1[i] * (t / n_periods)
16131646

16141647
treated = int(g_i > 0 and t >= g_i)
16151648
true_eff = 0.0
@@ -1713,6 +1746,7 @@ def generate_survey_did_data(
17131746
"deff_kish": float(deff_kish),
17141747
"base_stratum_effects": stratum_effects,
17151748
"icc_realized": icc_realized,
1749+
"conditional_pt_active": conditional_pt != 0.0,
17161750
}
17171751

17181752
return df

docs/survey-roadmap.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,10 +164,10 @@ Enhanced `generate_survey_did_data()` with 8 research-grade parameters:
164164
`return_true_population_att`. All backward-compatible. Supports panel
165165
and repeated cross-section modes.
166166

167-
**Remaining gap for 10e:** Conditional parallel trends — the DGP has
168-
unconditional PT by construction. A `conditional_pt` parameter is needed
169-
before the simulation study so that unconditional PT fails but conditional
170-
PT holds after covariate adjustment (DR/IPW recovers truth).
167+
**Resolved:** `conditional_pt` parameter added. When nonzero, shifts treated
168+
units' x1 mean by +1 SD and adds `conditional_pt * x1_i * (t / n_periods)` to the
169+
outcome, creating X-dependent time trends. Unconditional PT fails; conditional
170+
PT holds after covariate adjustment. DR/IPW estimators recover truth.
171171

172172
### 10c. Expand R Validation Coverage (HIGH priority) ✅
173173

tests/test_prep.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1966,6 +1966,152 @@ def test_te_covariate_interaction_validation(self):
19661966
with pytest.raises(ValueError, match="te_covariate_interaction must be finite"):
19671967
generate_survey_did_data(add_covariates=True, te_covariate_interaction=np.nan, seed=42)
19681968

1969+
# --- conditional_pt parameter tests ---
1970+
1971+
def test_conditional_pt_requires_covariates(self):
1972+
"""conditional_pt requires add_covariates=True."""
1973+
from diff_diff.prep_dgp import generate_survey_did_data
1974+
1975+
with pytest.raises(ValueError, match="conditional_pt requires add_covariates"):
1976+
generate_survey_did_data(conditional_pt=0.3, add_covariates=False, seed=42)
1977+
1978+
def test_conditional_pt_nonfinite_rejected(self):
1979+
"""conditional_pt must be finite."""
1980+
from diff_diff.prep_dgp import generate_survey_did_data
1981+
1982+
with pytest.raises(ValueError, match="conditional_pt must be finite"):
1983+
generate_survey_did_data(
1984+
add_covariates=True, conditional_pt=np.inf, seed=42
1985+
)
1986+
with pytest.raises(ValueError, match="conditional_pt must be finite"):
1987+
generate_survey_did_data(
1988+
add_covariates=True, conditional_pt=np.nan, seed=42
1989+
)
1990+
1991+
def test_conditional_pt_x1_distribution_shift(self):
1992+
"""Treated units should have higher x1 when conditional_pt is active."""
1993+
from diff_diff.prep_dgp import generate_survey_did_data
1994+
1995+
df = generate_survey_did_data(
1996+
n_units=1000,
1997+
n_periods=4,
1998+
add_covariates=True,
1999+
conditional_pt=0.3,
2000+
seed=42,
2001+
)
2002+
p1 = df[df["period"] == 1]
2003+
x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].values
2004+
x1_control = p1.loc[p1["first_treat"] == 0, "x1"].values
2005+
shift = x1_treated.mean() - x1_control.mean()
2006+
# Expect ~1.0 SD shift; require at least 0.5
2007+
assert shift > 0.5, f"x1 mean shift too small: {shift:.3f}"
2008+
2009+
def test_conditional_pt_unconditional_pt_fails(self):
2010+
"""With conditional_pt active, unconditional pre-trends should differ."""
2011+
from diff_diff.prep_dgp import generate_survey_did_data
2012+
2013+
df = generate_survey_did_data(
2014+
n_units=2000,
2015+
n_periods=8,
2016+
add_covariates=True,
2017+
conditional_pt=0.5,
2018+
never_treated_frac=0.5,
2019+
seed=42,
2020+
)
2021+
# Compute mean outcome change (period 2 - period 1) for each group
2022+
# before any treatment (use periods 1 and 2, treatment starts at 3+)
2023+
p1 = df[df["period"] == 1].set_index("unit")
2024+
p2 = df[df["period"] == 2].set_index("unit")
2025+
common = p1.index.intersection(p2.index)
2026+
dy = p2.loc[common, "outcome"] - p1.loc[common, "outcome"]
2027+
is_treated = p1.loc[common, "first_treat"] > 0
2028+
2029+
trend_treated = dy[is_treated].mean()
2030+
trend_control = dy[~is_treated].mean()
2031+
gap = abs(trend_treated - trend_control)
2032+
# With conditional_pt=0.5 and 1 SD shift, expect a detectable gap
2033+
assert gap > 0.01, f"Unconditional PT gap too small: {gap:.4f}"
2034+
2035+
def test_conditional_pt_conditional_pt_holds(self):
2036+
"""Controlling for x1, the treated/control pre-trend gap should shrink by more than half.
2037+
2038+
Use low PSU noise so the conditional_pt signal dominates.
2039+
"""
2040+
from diff_diff.prep_dgp import generate_survey_did_data
2041+
2042+
df = generate_survey_did_data(
2043+
n_units=2000,
2044+
n_periods=8,
2045+
add_covariates=True,
2046+
conditional_pt=2.0,
2047+
never_treated_frac=0.5,
2048+
psu_re_sd=0.1,
2049+
psu_period_factor=0.1,
2050+
noise_sd=0.2,
2051+
seed=42,
2052+
)
2053+
p1 = df[df["period"] == 1].set_index("unit")
2054+
p2 = df[df["period"] == 2].set_index("unit")
2055+
common = p1.index.intersection(p2.index)
2056+
dy = p2.loc[common, "outcome"].values - p1.loc[common, "outcome"].values
2057+
x1_vals = p1.loc[common, "x1"].values
2058+
is_treated = (p1.loc[common, "first_treat"] > 0).values.astype(float)
2059+
2060+
# Unconditional regression: dy ~ treated (should show large gap)
2061+
n = len(dy)
2062+
X_uncond = np.column_stack([np.ones(n), is_treated])
2063+
beta_uncond = np.linalg.lstsq(X_uncond, dy, rcond=None)[0]
2064+
uncond_gap = abs(beta_uncond[1])
2065+
2066+
# Conditional regression: dy ~ treated + x1 (gap should shrink)
2067+
X_cond = np.column_stack([np.ones(n), is_treated, x1_vals])
2068+
beta_cond = np.linalg.lstsq(X_cond, dy, rcond=None)[0]
2069+
cond_gap = abs(beta_cond[1])
2070+
2071+
# With low noise and strong signal, controlling for x1 should
2072+
# substantially reduce the treated coefficient
2073+
assert uncond_gap > 0.05, f"Unconditional gap too small: {uncond_gap:.4f}"
2074+
assert cond_gap < uncond_gap * 0.5, (
2075+
f"Conditional gap ({cond_gap:.4f}) should be much smaller than "
2076+
f"unconditional ({uncond_gap:.4f})"
2077+
)
2078+
2079+
def test_conditional_pt_backward_compatible(self):
2080+
"""conditional_pt=0.0 should produce identical output to default."""
2081+
from diff_diff.prep_dgp import generate_survey_did_data
2082+
2083+
df_default = generate_survey_did_data(
2084+
n_units=100, add_covariates=True, seed=99
2085+
)
2086+
df_explicit = generate_survey_did_data(
2087+
n_units=100, add_covariates=True, conditional_pt=0.0, seed=99
2088+
)
2089+
pd.testing.assert_frame_equal(df_default, df_explicit)
2090+
2091+
def test_conditional_pt_panel_and_crosssection(self):
2092+
"""conditional_pt should work in both panel and cross-section modes."""
2093+
from diff_diff.prep_dgp import generate_survey_did_data
2094+
2095+
for panel_mode in [True, False]:
2096+
df = generate_survey_did_data(
2097+
n_units=500,
2098+
n_periods=4,
2099+
add_covariates=True,
2100+
conditional_pt=0.3,
2101+
panel=panel_mode,
2102+
seed=42,
2103+
)
2104+
# Basic sanity: data is produced
2105+
assert len(df) == 500 * 4
2106+
assert "x1" in df.columns
2107+
# Check x1 shift exists in period 1
2108+
p1 = df[df["period"] == 1]
2109+
x1_treated = p1.loc[p1["first_treat"] > 0, "x1"].mean()
2110+
x1_control = p1.loc[p1["first_treat"] == 0, "x1"].mean()
2111+
assert x1_treated > x1_control, (
2112+
f"panel={panel_mode}: treated x1 not shifted"
2113+
)
2114+
19692115

19702116
class TestAggregateSurvey:
19712117
"""Tests for aggregate_survey function."""

0 commit comments

Comments
 (0)