Rewrite Hausman pretest to ES(e) per Theorem A.1, guard cluster+survey

igerber · claude · igerber · commit bf427af3a37d · 2026-03-22T12:48:55.000-04:00
Address rerun-3 review findings:

P1 fixes:
- Hausman pretest now aggregates to post-treatment event-study ES(e)
  before computing test statistic, matching Theorem A.1 (was using raw
  (g,t) cells including pre-treatment placebos)
- Raise NotImplementedError when both cluster and survey_design are set
  (cluster was silently ignored under survey TSL dispatch)
- Add clustered bootstrap+aggregate='all' test covering bootstrap-updated
  event-study and group effect SEs

P2: Update REGISTRY.md Hausman note to describe ES(e) aggregation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -348,6 +348,12 @@ def fit(
         """
         self._validate_params()
 
+        if self.cluster is not None and survey_design is not None:
+            raise NotImplementedError(
+                "cluster and survey_design cannot both be set. "
+                "Use survey_design with PSU/strata for cluster-robust inference."
+            )
+
         # Resolve survey design if provided
         from diff_diff.survey import _resolve_survey_for_fit
 
@@ -1475,74 +1481,92 @@ def _nan_result() -> HausmanPretestResult:
         if not common_gts:
             return _nan_result()
 
-        # Filter out (g,t) cells with non-finite effect estimates
-        common_gts = [
-            gt
-            for gt in common_gts
-            if np.isfinite(result_all.group_time_effects[gt]["effect"])
-            and np.isfinite(result_post.group_time_effects[gt]["effect"])
-        ]
-        if not common_gts:
-            return _nan_result()
-
-        k = len(common_gts)
-
-        # Build EIF matrices for common (g,t) pairs: (n_units, k)
         eif_all = result_all.influence_functions
         eif_post = result_post.influence_functions
         assert eif_all is not None and eif_post is not None
         n_units = len(next(iter(eif_all.values())))
 
-        eif_all_mat = np.column_stack([eif_all[gt] for gt in common_gts])
-        eif_post_mat = np.column_stack([eif_post[gt] for gt in common_gts])
-
-        # Filter out (g,t) pairs with non-finite EIF values
-        finite_mask = np.all(np.isfinite(eif_all_mat), axis=0) & np.all(
-            np.isfinite(eif_post_mat), axis=0
+        # --- Aggregate to post-treatment ES(e) per Theorem A.1 ---
+        # Derive cohort fractions from data for proper weights
+        all_units_list = sorted(data[unit].unique())
+        unit_cohorts = (
+            data.groupby(unit)[first_treat].first().reindex(all_units_list).values.astype(float)
         )
-        if not np.all(finite_mask):
-            n_dropped = int(np.sum(~finite_mask))
-            common_gts = [gt for gt, m in zip(common_gts, finite_mask) if m]
-            eif_all_mat = eif_all_mat[:, finite_mask]
-            eif_post_mat = eif_post_mat[:, finite_mask]
-            k = len(common_gts)
-            if k == 0:
-                return _nan_result()
-            warnings.warn(
-                f"Dropped {n_dropped} (g,t) pair(s) with non-finite EIF values "
-                "from Hausman test.",
-                UserWarning,
-                stacklevel=2,
-            )
-
-        # Recompute delta after filtering
-        delta = np.array(
-            [
-                result_post.group_time_effects[gt]["effect"]
-                - result_all.group_time_effects[gt]["effect"]
-                for gt in common_gts
-            ]
+        cohort_fractions: Dict[float, float] = {}
+        for g in set(result_all.groups) | set(result_post.groups):
+            cohort_fractions[g] = float(np.sum(unit_cohorts == g)) / n_units
+
+        def _aggregate_es(
+            gt_effects: Dict, eif_dict: Dict, groups: List, ant: int
+        ) -> Dict[int, Tuple[float, np.ndarray]]:
+            """Aggregate (g,t) effects to post-treatment ES(e) with cohort weights."""
+            by_e: Dict[int, List[Tuple[float, float, np.ndarray]]] = {}
+            for (g, t), d in gt_effects.items():
+                e = int(t - g)
+                if e < -ant:  # pre-treatment beyond anticipation window
+                    continue
+                if not np.isfinite(d["effect"]):
+                    continue
+                if (g, t) not in eif_dict:
+                    continue
+                eif_vec = eif_dict[(g, t)]
+                if not np.all(np.isfinite(eif_vec)):
+                    continue
+                pg = cohort_fractions.get(g, 0.0)
+                if e not in by_e:
+                    by_e[e] = []
+                by_e[e].append((d["effect"], pg, eif_vec))
+
+            result: Dict[int, Tuple[float, np.ndarray]] = {}
+            for e, items in by_e.items():
+                if e < 0:  # restrict to post-treatment (e >= 0)
+                    continue
+                effs = np.array([x[0] for x in items])
+                pgs = np.array([x[1] for x in items])
+                eifs = [x[2] for x in items]
+                total_pg = pgs.sum()
+                w = pgs / total_pg if total_pg > 0 else np.ones(len(pgs)) / len(pgs)
+                es_eff = float(np.sum(w * effs))
+                es_eif = np.zeros(n_units)
+                for k_idx in range(len(eifs)):
+                    es_eif += w[k_idx] * eifs[k_idx]
+                result[e] = (es_eff, es_eif)
+            return result
+
+        es_all = _aggregate_es(
+            result_all.group_time_effects, eif_all, result_all.groups, anticipation
+        )
+        es_post = _aggregate_es(
+            result_post.group_time_effects, eif_post, result_post.groups, anticipation
         )
 
-        # Also filter units with non-finite EIF values (row-wise)
+        # Find common post-treatment horizons
+        common_e = sorted(set(es_all.keys()) & set(es_post.keys()))
+        if not common_e:
+            return _nan_result()
+
+        delta = np.array([es_post[e][0] - es_all[e][0] for e in common_e])
+
+        # Build ES(e)-level EIF matrices
+        eif_all_mat = np.column_stack([es_all[e][1] for e in common_e])
+        eif_post_mat = np.column_stack([es_post[e][1] for e in common_e])
+
+        # Filter units with non-finite EIF values
         row_finite = np.all(np.isfinite(eif_all_mat), axis=1) & np.all(
             np.isfinite(eif_post_mat), axis=1
         )
-        # Build cluster mapping for covariance if needed
         cl_idx: Optional[np.ndarray] = None
         n_cl: Optional[int] = None
         if cluster is not None:
-            all_units = sorted(data[unit].unique())
-            cl_idx, n_cl = _validate_and_build_cluster_mapping(data, unit, cluster, all_units)
-
+            cl_idx, n_cl = _validate_and_build_cluster_mapping(data, unit, cluster, all_units_list)
         if not np.all(row_finite):
             eif_all_mat = eif_all_mat[row_finite]
             eif_post_mat = eif_post_mat[row_finite]
             n_units = int(np.sum(row_finite))
             if cl_idx is not None:
                 cl_idx = cl_idx[row_finite]
 
-        # Compute full covariance matrices using shared _cluster_aggregate
+        # Compute full covariance matrices
         if cl_idx is not None and n_cl is not None:
 
             def _eif_cov(eif_mat: np.ndarray) -> np.ndarray:
@@ -1559,7 +1583,6 @@ def _eif_cov(eif_mat: np.ndarray) -> np.ndarray:
 
         V = cov_post - cov_all
 
-        # If covariance has NaN/Inf, test is unreliable
         if not np.all(np.isfinite(V)):
             warnings.warn(
                 "Hausman covariance matrix contains non-finite values. " "The test is unreliable.",
@@ -1583,29 +1606,23 @@ def _eif_cov(eif_mat: np.ndarray) -> np.ndarray:
                 stacklevel=2,
             )
 
-        # Effective rank = number of positive eigenvalues
         effective_rank = int(np.sum(eigvals > tol))
         if effective_rank == 0:
             return _nan_result()
 
-        # Compute H = delta' @ pinv(V) @ delta
         V_pinv = np.linalg.pinv(V, rcond=tol / max_eigval if max_eigval > 0 else 1e-10)
         H = float(delta @ V_pinv @ delta)
-        H = max(H, 0.0)  # numerical floor
+        H = max(H, 0.0)
 
         p_value = float(chi2.sf(H, df=effective_rank))
         reject = p_value < alpha
 
-        # Build per-(g,t) details DataFrame
-        gt_details = pd.DataFrame(
+        es_details = pd.DataFrame(
             {
-                "group": [gt[0] for gt in common_gts],
-                "time": [gt[1] for gt in common_gts],
-                "att_all": [result_all.group_time_effects[gt]["effect"] for gt in common_gts],
-                "att_post": [result_post.group_time_effects[gt]["effect"] for gt in common_gts],
+                "relative_period": common_e,
+                "es_all": [es_all[e][0] for e in common_e],
+                "es_post": [es_post[e][0] for e in common_e],
                 "delta": delta,
-                "se_all": [result_all.group_time_effects[gt]["se"] for gt in common_gts],
-                "se_post": [result_post.group_time_effects[gt]["se"] for gt in common_gts],
             }
         )
 
@@ -1618,5 +1635,5 @@ def _eif_cov(eif_mat: np.ndarray) -> np.ndarray:
             att_all=result_all.overall_att,
             att_post=result_post.overall_att,
             recommendation="pt_post" if reject else "pt_all",
-            gt_details=gt_details,
+            gt_details=es_details,
         )
diff --git a/diff_diff/efficient_did_results.py b/diff_diff/efficient_did_results.py
@@ -42,7 +42,7 @@ class HausmanPretestResult:
     recommendation: str
     """``"pt_all"`` if fail to reject, ``"pt_post"`` if reject."""
     gt_details: Optional[pd.DataFrame] = None
-    """Per-(g,t) details: ATT_all, ATT_post, delta, SE_all, SE_post."""
+    """Per-event-study-horizon details: relative_period, es_all, es_post, delta."""
 
     def __repr__(self) -> str:
         return (
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -680,7 +680,7 @@ where `q_{g,e} = pi_g / sum_{g' in G_{trt,e}} pi_{g'}`.
 - **Note:** EfficientDiD bootstrap with survey weights deferred to Phase 5
 - **Note:** EfficientDiD covariates (DR path) with survey weights deferred — the doubly robust nuisance estimation does not yet thread survey weights through sieve/kernel steps
 - **Note:** Cluster-robust SEs use the standard Liang-Zeger clustered sandwich estimator applied to EIF values: aggregate EIF within clusters, center, and compute variance with G/(G-1) small-sample correction. Cluster bootstrap generates multiplier weights at the cluster level (all units in a cluster share the same weight). Analytical clustered SEs are the default when `cluster` is set; cluster bootstrap is opt-in via `n_bootstrap > 0`.
-- **Note:** Hausman pretest uses the full cross-(g,t) covariance matrix from EIF values (Theorem A.1), not a diagonal approximation. The variance-difference matrix V = Cov(ATT_post) - Cov(ATT_all) is inverted via Moore-Penrose pseudoinverse to handle finite-sample non-positive-definiteness. Effective rank of V (number of positive eigenvalues) is used as degrees of freedom. Substantially negative eigenvalues trigger a warning.
+- **Note:** Hausman pretest operates on the post-treatment event-study vector ES(e) per Theorem A.1. Both PT-All and PT-Post fits are aggregated to ES(e) using cohort-size weights before computing the test statistic H = delta' V^{-1} delta where delta = ES_post - ES_all and V = Cov(ES_post) - Cov(ES_all). Covariance is computed from aggregated ES(e)-level EIF values. The variance-difference matrix V is inverted via the Moore-Penrose pseudoinverse to handle finite-sample non-positive-definiteness. Effective rank of V (number of positive eigenvalues) is used as degrees of freedom. Substantially negative eigenvalues trigger a warning.
 - **Note:** Last-cohort-as-control (`control_group="last_cohort"`) reclassifies the latest treatment cohort as pseudo-never-treated and drops time periods at/after that cohort's treatment start. This is distinct from CallawaySantAnna's `not_yet_treated` option which dynamically selects not-yet-treated units per (g,t) pair.
 
 ---
diff --git a/tests/test_efficient_did.py b/tests/test_efficient_did.py
@@ -638,13 +638,15 @@ def test_hausman_differential_trends_detects(self):
             assert pretest.statistic >= 0
         assert pretest.recommendation in ("pt_all", "pt_post", "inconclusive")
 
-    def test_hausman_gt_details(self):
-        """gt_details should have expected columns."""
+    def test_hausman_es_details(self):
+        """gt_details should have event-study columns per Theorem A.1."""
         df = _make_staggered_panel(n_per_group=80, n_control=100)
         pretest = EfficientDiD.hausman_pretest(df, "y", "unit", "time", "first_treat")
         assert pretest.gt_details is not None
-        expected_cols = {"group", "time", "att_all", "att_post", "delta", "se_all", "se_post"}
+        expected_cols = {"relative_period", "es_all", "es_post", "delta"}
         assert set(pretest.gt_details.columns) == expected_cols
+        # All relative periods should be post-treatment (>= 0)
+        assert all(e >= 0 for e in pretest.gt_details["relative_period"])
 
     def test_hausman_recommendation_field(self):
         """recommendation should be pt_all or pt_post."""
@@ -893,6 +895,30 @@ def test_single_cluster_raises(self):
         with pytest.raises(ValueError, match="at least 2 clusters"):
             EfficientDiD(cluster="cluster_id").fit(df, "y", "unit", "time", "first_treat")
 
+    def test_cluster_plus_survey_raises(self):
+        """cluster + survey_design should raise NotImplementedError."""
+        df = _make_staggered_panel(n_per_group=60, n_control=80)
+        df["cluster_id"] = df["unit"] % 5
+        df["w"] = 1.0
+        with pytest.raises(NotImplementedError, match="cluster and survey_design"):
+            EfficientDiD(cluster="cluster_id").fit(
+                df, "y", "unit", "time", "first_treat", survey_design="w"
+            )
+
+    def test_clustered_bootstrap_aggregate_all(self, ci_params):
+        """Clustered bootstrap with aggregate='all' should produce finite results."""
+        n_boot = ci_params.bootstrap(99)
+        df = self._make_clustered_panel(n_clusters=60, units_per_cluster=3)
+        result = EfficientDiD(cluster="cluster_id", n_bootstrap=n_boot, seed=42).fit(
+            df, "y", "unit", "time", "first_treat", aggregate="all"
+        )
+        assert result.event_study_effects is not None
+        assert result.group_effects is not None
+        for e, d in result.event_study_effects.items():
+            assert np.isfinite(d["se"])
+        for g, d in result.group_effects.items():
+            assert np.isfinite(d["se"])
+
 
 class TestSmallCohortWarning:
     """Small cohort warnings for numerical stability."""