Skip to content

Commit be1b527

Browse files
igerber and claude committed
Codex CI R1: DiD unit-column front-door guard + DiD/MPD combined-kernel tests
Address CI codex review round 1 (PR #433).

P1 #1 — DiD.fit() missing unit-column existence check
- The new Conley path validated `unit is not None` but never confirmed the named column actually exists, so a bad column name fell through to a raw `pandas` KeyError at `data[unit]` instead of a clear estimator-level message. Adds the same front-door guard that `MultiPeriodDiD.fit` and `TwoWayFixedEffects.fit` already have:

      if unit not in data.columns:
          raise ValueError(f"Unit column '{unit}' not found in data")

P1 #2 — Combined-kernel estimator-path coverage
- The low-level helper and TWFE-with-cluster were covered, but the DiD and MultiPeriodDiD public estimator paths with explicit `cluster=` were not. Adds four new tests in `TestConleyCluster`:
  * test_did_combined_kernel_finite_se_and_cluster_name: DiD + Conley + cluster='region' produces finite SE, propagates cluster_name to result + to_dict(), differs from no-cluster baseline (combined kernel zeros out cross-cluster pairs).
  * test_did_combined_kernel_time_varying_cluster_raises: panel time-invariance contract enforced on the DiD path.
  * test_mpd_combined_kernel_finite_se_and_cluster_name: same finite-SE + cluster_name + to_dict() invariants on MPD.
  * test_mpd_combined_kernel_time_varying_cluster_raises: panel time-invariance contract on MPD.
- test_did_conley_unknown_unit_column_raises (Code Quality P1 regression): asserts the new ValueError fires before the bad pandas lookup.

Targeted regression: tests/test_conley_vcov.py + test_estimators.py + test_methodology_twfe.py: 332 passed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 986612b commit be1b527

2 files changed

Lines changed: 152 additions & 0 deletions

File tree

diff_diff/estimators.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -399,6 +399,8 @@ def fit(
399399
"sandwich needs the unit identifier to compute the per-unit "
400400
"serial sum. Pass DiD(...).fit(data, ..., unit='<col>')."
401401
)
402+
if unit not in data.columns:
403+
raise ValueError(f"Unit column '{unit}' not found in data")
402404
if self.conley_lag_cutoff is None:
403405
raise ValueError(
404406
"DifferenceInDifferences(vcov_type='conley') requires "

tests/test_conley_vcov.py

Lines changed: 150 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1078,6 +1078,27 @@ def test_did_conley_missing_unit_raises(self, two_period_panel):
10781078
conley_lag_cutoff=1,
10791079
).fit(two_period_panel, outcome="y", treatment="treated", time="time")
10801080

1081+
def test_did_conley_unknown_unit_column_raises(self, two_period_panel):
1082+
"""vcov_type='conley' with `unit=<name>` referring to an absent column
1083+
raises a clear estimator-level ValueError, NOT a raw pandas KeyError.
1084+
Front-door check mirrors MultiPeriodDiD / TwoWayFixedEffects.
1085+
Codex CI R1 P1 #1."""
1086+
from diff_diff import DifferenceInDifferences
1087+
1088+
with pytest.raises(ValueError, match="Unit column 'missing_unit' not found"):
1089+
DifferenceInDifferences(
1090+
vcov_type="conley",
1091+
conley_coords=("lat", "lon"),
1092+
conley_cutoff_km=2000.0,
1093+
conley_lag_cutoff=1,
1094+
).fit(
1095+
two_period_panel,
1096+
outcome="y",
1097+
treatment="treated",
1098+
time="time",
1099+
unit="missing_unit",
1100+
)
1101+
10811102
def test_did_conley_missing_lag_cutoff_raises(self, two_period_panel):
10821103
"""vcov_type='conley' without conley_lag_cutoff raises ValueError."""
10831104
from diff_diff import DifferenceInDifferences
@@ -3157,3 +3178,132 @@ def test_twfe_explicit_cluster_propagates_to_cluster_name(self):
31573178
assert res.cluster_name == "region"
31583179
d = res.to_dict()
31593180
assert d.get("cluster_name") == "region"
3181+
3182+
def _multi_period_panel_with_region(self, n_units=12, T=4, seed=41):
3183+
"""Multi-period panel with a time-invariant `region` column for
3184+
combined-kernel estimator tests."""
3185+
import pandas as _pd
3186+
3187+
rng = np.random.default_rng(seed=seed)
3188+
rows = []
3189+
for u in range(n_units):
3190+
treated = u >= n_units // 2
3191+
lat = rng.uniform(-30, 30)
3192+
lon = rng.uniform(-100, 100)
3193+
region = u // 3 # time-invariant within unit; spans multiple units
3194+
for t in range(T):
3195+
effect = 1.0 if (treated and t >= T // 2) else 0.0
3196+
yv = 0.2 * t + effect + rng.normal(0, 0.3)
3197+
rows.append(
3198+
{
3199+
"unit": u,
3200+
"time": t,
3201+
"y": yv,
3202+
"treated": int(treated),
3203+
"lat": lat,
3204+
"lon": lon,
3205+
"region": region,
3206+
}
3207+
)
3208+
return _pd.DataFrame(rows)
3209+
3210+
def test_did_combined_kernel_finite_se_and_cluster_name(self):
3211+
"""DifferenceInDifferences(vcov_type='conley', cluster='region') on
3212+
a 2-period panel produces a finite SE, propagates `region` to
3213+
res.cluster_name and to_dict(), and differs from the no-cluster
3214+
baseline (combined kernel zeros out cross-cluster off-diagonals)."""
3215+
from diff_diff import DifferenceInDifferences
3216+
3217+
df = self._multi_period_panel_with_region(n_units=12, T=2, seed=43)
3218+
kwargs = dict(
3219+
vcov_type="conley",
3220+
conley_coords=("lat", "lon"),
3221+
conley_cutoff_km=2000.0,
3222+
conley_lag_cutoff=1,
3223+
)
3224+
res_combined = DifferenceInDifferences(cluster="region", **kwargs).fit(
3225+
df, outcome="y", treatment="treated", time="time", unit="unit"
3226+
)
3227+
res_bare = DifferenceInDifferences(**kwargs).fit(
3228+
df, outcome="y", treatment="treated", time="time", unit="unit"
3229+
)
3230+
assert np.isfinite(res_combined.att)
3231+
assert np.isfinite(res_combined.se) and res_combined.se > 0
3232+
assert res_combined.cluster_name == "region"
3233+
d = res_combined.to_dict()
3234+
assert d.get("cluster_name") == "region"
3235+
# Combined kernel zeros out off-cluster pairs → SE differs from bare
3236+
assert not np.isclose(res_combined.se, res_bare.se, atol=1e-8)
3237+
3238+
def test_did_combined_kernel_time_varying_cluster_raises(self):
3239+
"""DiD + Conley + cluster=<col> on the panel block-decomposed path
3240+
must raise when the cluster column varies across periods within a
3241+
unit (time-invariance contract). Codex CI R1 P1 #2."""
3242+
from diff_diff import DifferenceInDifferences
3243+
3244+
df = self._multi_period_panel_with_region(n_units=10, T=2, seed=47)
3245+
# Make region time-varying for unit 0 (different region in t=1)
3246+
mask_u0_t1 = (df["unit"] == 0) & (df["time"] == 1)
3247+
df.loc[mask_u0_t1, "region"] = 99
3248+
with pytest.raises(ValueError, match="constant within each unit"):
3249+
DifferenceInDifferences(
3250+
vcov_type="conley",
3251+
cluster="region",
3252+
conley_coords=("lat", "lon"),
3253+
conley_cutoff_km=2000.0,
3254+
conley_lag_cutoff=1,
3255+
).fit(df, outcome="y", treatment="treated", time="time", unit="unit")
3256+
3257+
def test_mpd_combined_kernel_finite_se_and_cluster_name(self):
3258+
"""MultiPeriodDiD(vcov_type='conley', cluster='region') on a 4-period
3259+
panel produces a finite SE and propagates `region` to cluster_name
3260+
on the result + to_dict()."""
3261+
from diff_diff import MultiPeriodDiD
3262+
3263+
df = self._multi_period_panel_with_region(n_units=12, T=4, seed=53)
3264+
res = MultiPeriodDiD(
3265+
vcov_type="conley",
3266+
cluster="region",
3267+
conley_coords=("lat", "lon"),
3268+
conley_cutoff_km=2000.0,
3269+
conley_lag_cutoff=1,
3270+
).fit(
3271+
df,
3272+
outcome="y",
3273+
treatment="treated",
3274+
time="time",
3275+
unit="unit",
3276+
post_periods=[2, 3],
3277+
reference_period=0,
3278+
)
3279+
assert np.isfinite(res.avg_att)
3280+
assert np.isfinite(res.avg_se) and res.avg_se > 0
3281+
assert res.cluster_name == "region"
3282+
d = res.to_dict()
3283+
assert d.get("cluster_name") == "region"
3284+
3285+
def test_mpd_combined_kernel_time_varying_cluster_raises(self):
3286+
"""MultiPeriodDiD + Conley + cluster=<col> with a cluster that
3287+
varies across periods within a unit raises ValueError (same time-
3288+
invariance contract as the linalg validator). Codex CI R1 P1 #2."""
3289+
from diff_diff import MultiPeriodDiD
3290+
3291+
df = self._multi_period_panel_with_region(n_units=10, T=3, seed=59)
3292+
mask_violator = (df["unit"] == 2) & (df["time"] == 2)
3293+
df.loc[mask_violator, "region"] = 77
3294+
with pytest.raises(ValueError, match="constant within each unit"):
3295+
MultiPeriodDiD(
3296+
vcov_type="conley",
3297+
cluster="region",
3298+
conley_coords=("lat", "lon"),
3299+
conley_cutoff_km=2000.0,
3300+
conley_lag_cutoff=1,
3301+
).fit(
3302+
df,
3303+
outcome="y",
3304+
treatment="treated",
3305+
time="time",
3306+
unit="unit",
3307+
post_periods=[1, 2],
3308+
reference_period=0,
3309+
)

0 commit comments

Comments
 (0)