Codex CI R7 P1: extend shared validator to check cluster column existence

igerber · claude · igerber · commit 04a5fa1e3b98 · 2026-05-14T15:23:20.000-04:00
P1 (Code Quality) [new in R7] — `cluster=<col>` is load-bearing on the Conley combined-kernel paths (Wave A #119), but none of DiD / MPD / TWFE validated that the named cluster column exists in `data` before the downstream `data[self.cluster]` access. A typo like `cluster="missing_region"` fell through to a raw pandas KeyError instead of the estimator-level ValueError pattern the rest of the Conley validation surface now uses. This is the same class of gap as R1's unit-column guard and R2/R6's conley_coords guard, so this commit extends the shared `_validate_conley_estimator_inputs` helper added in R6 with an 8th check: `if cluster is not None and cluster not in data.columns: raise ValueError("Cluster column ... not found in data")`. The three call sites in DiD/MPD/TWFE now pass `cluster=self.cluster` through and pick up the new guard via a one-line opt-in. Future Conley surfaces that add cluster support get the validator's behavior for free. Tests: regression tests on all three estimator surfaces (DiD/MPD/TWFE) assert that `cluster="missing_region"` raises the estimator-level ValueError before any pandas-level error. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/conley.py b/diff_diff/conley.py
@@ -208,19 +208,20 @@ def _validate_conley_estimator_inputs(
     conley_lag_cutoff: Optional[int],
     survey_design: object,
     inference: str,
+    cluster: Optional[str] = None,
 ) -> None:
     """Shared front-door validation for ``vcov_type='conley'`` on the
     estimator entry points (``DifferenceInDifferences``, ``MultiPeriodDiD``,
     ``TwoWayFixedEffects``).
 
     Each estimator's ``fit()`` calls this BEFORE building Conley arrays or
-    threading them into the variance computation. The seven checks below
+    threading them into the variance computation. The eight checks below
     are the union of what each estimator needs; estimator-specific bits
     (e.g. building the array from a column name) remain inline at the
     caller. Centralizing this in one place prevents the validation-class
-    drift that surfaced repeatedly across Wave A CI rounds (DiD's
-    front-door guard for `unit` and `conley_coords` was missing on MPD
-    / TWFE).
+    drift that surfaced repeatedly across Wave A CI rounds (front-door
+    guards for `unit`, `conley_coords`, and `cluster` were each missing
+    on at least one estimator surface and required separate fixes).
 
     Parameters
     ----------
@@ -229,8 +230,8 @@ def _validate_conley_estimator_inputs(
         ``"DifferenceInDifferences"``).
     data : pandas.DataFrame
         The dataset passed to ``fit()``. Used to check column existence
-        for ``unit`` and ``conley_coords``. Typed as ``Any`` here to
-        avoid importing pandas at module load time.
+        for ``unit``, ``conley_coords``, and ``cluster``. Typed as
+        ``Any`` here to avoid importing pandas at module load time.
     unit : str or None
         Name of the unit identifier column (required for Conley).
     conley_coords : tuple/list/None
@@ -243,13 +244,19 @@ def _validate_conley_estimator_inputs(
         ``SurveyDesign`` instance or ``None``. Survey + Conley is deferred.
     inference : str
         Estimator inference mode. ``"wild_bootstrap"`` + Conley is rejected.
+    cluster : str or None, default None
+        Name of the cluster column if the user opted into the combined
+        spatial + cluster product kernel (Wave A #119). When non-None,
+        the column must exist in ``data``; otherwise the downstream
+        ``data[cluster]`` access would raise an opaque pandas
+        ``KeyError``. Pass ``None`` (the default) to skip this check.
 
     Raises
     ------
     ValueError
         Missing/malformed conley_coords, missing conley_cutoff_km,
-        missing/unknown unit, missing conley_lag_cutoff,
-        coord column not in ``data``.
+        missing/unknown unit, missing conley_lag_cutoff, coord column
+        not in ``data``, or ``cluster`` set to a name not in ``data``.
     NotImplementedError
         ``survey_design`` is non-None, or ``inference == "wild_bootstrap"``.
     """
@@ -285,6 +292,8 @@ def _validate_conley_estimator_inputs(
             "(non-negative int; 0 means spatial-within-period only, no serial "
             "component). See R conleyreg's `lag_cutoff` argument for the convention."
         )
+    if cluster is not None and cluster not in data.columns:
+        raise ValueError(f"Cluster column '{cluster}' not found in data")
     if survey_design is not None:
         raise NotImplementedError(
             f"{estimator_name}(vcov_type='conley') + survey_design is a "
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -407,6 +407,7 @@ def fit(
                 conley_lag_cutoff=self.conley_lag_cutoff,
                 survey_design=survey_design,
                 inference=self.inference,
+                cluster=self.cluster,
             )
 
         if absorb:
@@ -1473,6 +1474,7 @@ def fit(  # type: ignore[override]
                 conley_lag_cutoff=self.conley_lag_cutoff,
                 survey_design=survey_design,
                 inference=self.inference,
+                cluster=self.cluster,
             )
         # Pre-compute non_ref_periods (needed for absorb demeaning)
         non_ref_periods = [p for p in all_periods if p != reference_period]
diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py
@@ -184,6 +184,7 @@ def fit(  # type: ignore[override]
                 conley_lag_cutoff=self.conley_lag_cutoff,
                 survey_design=survey_design,
                 inference=self.inference,
+                cluster=self.cluster,
             )
 
         # Check for staggered treatment timing and warn if detected
diff --git a/tests/test_conley_vcov.py b/tests/test_conley_vcov.py
@@ -1162,6 +1162,103 @@ def test_did_conley_unknown_coord_column_raises(self, two_period_panel):
                 unit="unit",
             )
 
+    def test_did_conley_unknown_cluster_column_raises(self, two_period_panel):
+        """DiD + Conley + cluster=<missing column> raises a clear estimator-
+        level ValueError before `data[self.cluster]` access (combined-kernel
+        path; codex CI R7 P1)."""
+        from diff_diff import DifferenceInDifferences
+
+        with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
+            DifferenceInDifferences(
+                vcov_type="conley",
+                cluster="missing_region",
+                conley_coords=("lat", "lon"),
+                conley_cutoff_km=2000.0,
+                conley_lag_cutoff=1,
+            ).fit(
+                two_period_panel,
+                outcome="y",
+                treatment="treated",
+                time="time",
+                unit="unit",
+            )
+
+    def test_mpd_conley_unknown_cluster_column_raises(self):
+        """MPD + Conley + cluster=<missing column> raises a clear estimator-
+        level ValueError before `data[self.cluster]` access (combined-kernel
+        path; codex CI R7 P1)."""
+        import pandas as _pd
+
+        from diff_diff import MultiPeriodDiD
+
+        rng = np.random.default_rng(seed=83)
+        rows = []
+        for u in range(8):
+            lat = rng.uniform(-30, 30)
+            lon = rng.uniform(-100, 100)
+            for t in range(3):
+                rows.append(
+                    {
+                        "unit": u,
+                        "time": t,
+                        "y": rng.standard_normal(),
+                        "treated": int(u >= 4),
+                        "lat": lat,
+                        "lon": lon,
+                    }
+                )
+        df = _pd.DataFrame(rows)
+        with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
+            MultiPeriodDiD(
+                vcov_type="conley",
+                cluster="missing_region",
+                conley_coords=("lat", "lon"),
+                conley_cutoff_km=2000.0,
+                conley_lag_cutoff=1,
+            ).fit(
+                df,
+                outcome="y",
+                treatment="treated",
+                time="time",
+                unit="unit",
+                post_periods=[1, 2],
+                reference_period=0,
+            )
+
+    def test_twfe_conley_unknown_cluster_column_raises(self):
+        """TWFE + Conley + cluster=<missing column> raises a clear estimator-
+        level ValueError before `data[self.cluster]` access (combined-kernel
+        path; codex CI R7 P1)."""
+        import pandas as _pd
+
+        from diff_diff import TwoWayFixedEffects
+
+        rng = np.random.default_rng(seed=89)
+        rows = []
+        for u in range(8):
+            lat = rng.uniform(-5, 5)
+            lon = rng.uniform(-5, 5)
+            for t in range(2):
+                rows.append(
+                    {
+                        "unit": u,
+                        "time": t,
+                        "y": rng.standard_normal(),
+                        "treated": int(u >= 4),
+                        "lat": lat,
+                        "lon": lon,
+                    }
+                )
+        df = _pd.DataFrame(rows)
+        with pytest.raises(ValueError, match="Cluster column 'missing_region' not found"):
+            TwoWayFixedEffects(
+                vcov_type="conley",
+                cluster="missing_region",
+                conley_coords=("lat", "lon"),
+                conley_cutoff_km=2000.0,
+                conley_lag_cutoff=1,
+            ).fit(df, outcome="y", treatment="treated", time="time", unit="unit")
+
     def test_did_conley_malformed_coord_tuple_raises(self, two_period_panel):
         """vcov_type='conley' with a malformed conley_coords (wrong arity or
         non-string elements) raises ValueError before downstream access.

Original file line number	Diff line number	Diff line change
`@@ -407,6 +407,7 @@ def fit(`
`407`	`407`	`conley_lag_cutoff=self.conley_lag_cutoff,`
`408`	`408`	`survey_design=survey_design,`
`409`	`409`	`inference=self.inference,`
	`410`	`+ cluster=self.cluster,`
`410`	`411`	`)`
`411`	`412`
`412`	`413`	`if absorb:`
`@@ -1473,6 +1474,7 @@ def fit( # type: ignore[override]`
`1473`	`1474`	`conley_lag_cutoff=self.conley_lag_cutoff,`
`1474`	`1475`	`survey_design=survey_design,`
`1475`	`1476`	`inference=self.inference,`
	`1477`	`+ cluster=self.cluster,`
`1476`	`1478`	`)`
`1477`	`1479`	`# Pre-compute non_ref_periods (needed for absorb demeaning)`
`1478`	`1480`	`non_ref_periods = [p for p in all_periods if p != reference_period]`
Original file line number	Diff line number	Diff line change
`@@ -184,6 +184,7 @@ def fit( # type: ignore[override]`
`184`	`184`	`conley_lag_cutoff=self.conley_lag_cutoff,`
`185`	`185`	`survey_design=survey_design,`
`186`	`186`	`inference=self.inference,`
	`187`	`+ cluster=self.cluster,`
`187`	`188`	`)`
`188`	`189`
`189`	`190`	`# Check for staggered treatment timing and warn if detected`