Skip to content

Commit 04a5fa1

Browse files
igerberclaude
andcommitted
Codex CI R7 P1: extend shared validator to check cluster column existence
P1 (Code Quality) [new in R7] — `cluster=<col>` is load-bearing on the Conley combined-kernel paths (Wave A #119) but none of DiD / MPD / TWFE validated that the named cluster column exists in `data` before the downstream `data[self.cluster]` access. A typo like `cluster="missing_region"` fell through to a raw pandas KeyError instead of the estimator-level ValueError pattern the rest of the Conley validation surface now uses. Same class as R1's unit-column guard and R2/R6's conley_coords guard: extends the shared `_validate_conley_estimator_inputs` helper added in R6 with an 8th check `if cluster is not None and cluster not in data.columns: raise ValueError("Cluster column ... not found in data")`. The three call sites in DiD/MPD/TWFE now pass `cluster=self.cluster` through and pick up the new guard via one-line opt-in. Future Conley surfaces that add cluster support get the validator's behavior for free. Tests: regressions on all three estimator surfaces (DiD/MPD/TWFE) asserting `cluster="missing_region"` raises the estimator-level ValueError before any pandas-level error. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 68678e9 commit 04a5fa1

4 files changed

Lines changed: 117 additions & 8 deletions

File tree

diff_diff/conley.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -208,19 +208,20 @@ def _validate_conley_estimator_inputs(
208208
conley_lag_cutoff: Optional[int],
209209
survey_design: object,
210210
inference: str,
211+
cluster: Optional[str] = None,
211212
) -> None:
212213
"""Shared front-door validation for ``vcov_type='conley'`` on the
213214
estimator entry points (``DifferenceInDifferences``, ``MultiPeriodDiD``,
214215
``TwoWayFixedEffects``).
215216
216217
Each estimator's ``fit()`` calls this BEFORE building Conley arrays or
217-
threading them into the variance computation. The seven checks below
218+
threading them into the variance computation. The eight checks below
218219
are the union of what each estimator needs; estimator-specific bits
219220
(e.g. building the array from a column name) remain inline at the
220221
caller. Centralizing this in one place prevents the validation-class
221-
drift that surfaced repeatedly across Wave A CI rounds (DiD's
222-
front-door guard for `unit` and `conley_coords` was missing on MPD
223-
/ TWFE).
222+
drift that surfaced repeatedly across Wave A CI rounds (front-door
223+
guards for `unit`, `conley_coords`, and `cluster` were each missing
224+
on at least one estimator surface and required separate fixes).
224225
225226
Parameters
226227
----------
@@ -229,8 +230,8 @@ def _validate_conley_estimator_inputs(
229230
``"DifferenceInDifferences"``).
230231
data : pandas.DataFrame
231232
The dataset passed to ``fit()``. Used to check column existence
232-
for ``unit`` and ``conley_coords``. Typed as ``Any`` here to
233-
avoid importing pandas at module load time.
233+
for ``unit``, ``conley_coords``, and ``cluster``. Typed as
234+
``Any`` here to avoid importing pandas at module load time.
234235
unit : str or None
235236
Name of the unit identifier column (required for Conley).
236237
conley_coords : tuple/list/None
@@ -243,13 +244,19 @@ def _validate_conley_estimator_inputs(
243244
``SurveyDesign`` instance or ``None``. Survey + Conley is deferred.
244245
inference : str
245246
Estimator inference mode. ``"wild_bootstrap"`` + Conley is rejected.
247+
cluster : str or None, default None
248+
Name of the cluster column if the user opted into the combined
249+
spatial + cluster product kernel (Wave A #119). When non-None,
250+
the column must exist in ``data``; otherwise the downstream
251+
``data[cluster]`` access would raise an opaque pandas
252+
``KeyError``. Pass ``None`` (the default) to skip this check.
246253
247254
Raises
248255
------
249256
ValueError
250257
Missing/malformed conley_coords, missing conley_cutoff_km,
251-
missing/unknown unit, missing conley_lag_cutoff,
252-
coord column not in ``data``.
258+
missing/unknown unit, missing conley_lag_cutoff, coord column
259+
not in ``data``, or ``cluster`` set to a name not in ``data``.
253260
NotImplementedError
254261
``survey_design`` is non-None, or ``inference == "wild_bootstrap"``.
255262
"""
@@ -285,6 +292,8 @@ def _validate_conley_estimator_inputs(
285292
"(non-negative int; 0 means spatial-within-period only, no serial "
286293
"component). See R conleyreg's `lag_cutoff` argument for the convention."
287294
)
295+
if cluster is not None and cluster not in data.columns:
296+
raise ValueError(f"Cluster column '{cluster}' not found in data")
288297
if survey_design is not None:
289298
raise NotImplementedError(
290299
f"{estimator_name}(vcov_type='conley') + survey_design is a "

diff_diff/estimators.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ def fit(
407407
conley_lag_cutoff=self.conley_lag_cutoff,
408408
survey_design=survey_design,
409409
inference=self.inference,
410+
cluster=self.cluster,
410411
)
411412

412413
if absorb:
@@ -1473,6 +1474,7 @@ def fit( # type: ignore[override]
14731474
conley_lag_cutoff=self.conley_lag_cutoff,
14741475
survey_design=survey_design,
14751476
inference=self.inference,
1477+
cluster=self.cluster,
14761478
)
14771479
# Pre-compute non_ref_periods (needed for absorb demeaning)
14781480
non_ref_periods = [p for p in all_periods if p != reference_period]

diff_diff/twfe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def fit( # type: ignore[override]
184184
conley_lag_cutoff=self.conley_lag_cutoff,
185185
survey_design=survey_design,
186186
inference=self.inference,
187+
cluster=self.cluster,
187188
)
188189

189190
# Check for staggered treatment timing and warn if detected

tests/test_conley_vcov.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,6 +1162,103 @@ def test_did_conley_unknown_coord_column_raises(self, two_period_panel):
11621162
unit="unit",
11631163
)
11641164

1165+
def test_did_conley_unknown_cluster_column_raises(self, two_period_panel):
1166+
"""DiD + Conley + cluster=<missing column> raises a clear estimator-
1167+
level ValueError before `data[self.cluster]` access (combined-kernel
1168+
path; codex CI R7 P1)."""
1169+
from diff_diff import DifferenceInDifferences
1170+
1171+
with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
1172+
DifferenceInDifferences(
1173+
vcov_type="conley",
1174+
cluster="missing_region",
1175+
conley_coords=("lat", "lon"),
1176+
conley_cutoff_km=2000.0,
1177+
conley_lag_cutoff=1,
1178+
).fit(
1179+
two_period_panel,
1180+
outcome="y",
1181+
treatment="treated",
1182+
time="time",
1183+
unit="unit",
1184+
)
1185+
1186+
def test_mpd_conley_unknown_cluster_column_raises(self):
1187+
"""MPD + Conley + cluster=<missing column> raises a clear estimator-
1188+
level ValueError before `data[self.cluster]` access (combined-kernel
1189+
path; codex CI R7 P1)."""
1190+
import pandas as _pd
1191+
1192+
from diff_diff import MultiPeriodDiD
1193+
1194+
rng = np.random.default_rng(seed=83)
1195+
rows = []
1196+
for u in range(8):
1197+
lat = rng.uniform(-30, 30)
1198+
lon = rng.uniform(-100, 100)
1199+
for t in range(3):
1200+
rows.append(
1201+
{
1202+
"unit": u,
1203+
"time": t,
1204+
"y": rng.standard_normal(),
1205+
"treated": int(u >= 4),
1206+
"lat": lat,
1207+
"lon": lon,
1208+
}
1209+
)
1210+
df = _pd.DataFrame(rows)
1211+
with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
1212+
MultiPeriodDiD(
1213+
vcov_type="conley",
1214+
cluster="missing_region",
1215+
conley_coords=("lat", "lon"),
1216+
conley_cutoff_km=2000.0,
1217+
conley_lag_cutoff=1,
1218+
).fit(
1219+
df,
1220+
outcome="y",
1221+
treatment="treated",
1222+
time="time",
1223+
unit="unit",
1224+
post_periods=[1, 2],
1225+
reference_period=0,
1226+
)
1227+
1228+
def test_twfe_conley_unknown_cluster_column_raises(self):
1229+
"""TWFE + Conley + cluster=<missing column> raises a clear estimator-
1230+
level ValueError before `data[self.cluster]` access (combined-kernel
1231+
path; codex CI R7 P1)."""
1232+
import pandas as _pd
1233+
1234+
from diff_diff import TwoWayFixedEffects
1235+
1236+
rng = np.random.default_rng(seed=89)
1237+
rows = []
1238+
for u in range(8):
1239+
lat = rng.uniform(-5, 5)
1240+
lon = rng.uniform(-5, 5)
1241+
for t in range(2):
1242+
rows.append(
1243+
{
1244+
"unit": u,
1245+
"time": t,
1246+
"y": rng.standard_normal(),
1247+
"treated": int(u >= 4),
1248+
"lat": lat,
1249+
"lon": lon,
1250+
}
1251+
)
1252+
df = _pd.DataFrame(rows)
1253+
with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
1254+
TwoWayFixedEffects(
1255+
vcov_type="conley",
1256+
cluster="missing_region",
1257+
conley_coords=("lat", "lon"),
1258+
conley_cutoff_km=2000.0,
1259+
conley_lag_cutoff=1,
1260+
).fit(df, outcome="y", treatment="treated", time="time", unit="unit")
1261+
11651262
def test_did_conley_malformed_coord_tuple_raises(self, two_period_panel):
11661263
"""vcov_type='conley' with a malformed conley_coords (wrong arity or
11671264
non-string elements) raises ValueError before downstream access.

0 commit comments

Comments
 (0)