Address P2 review findings: remove mutable state, deduplicate cluster covariance, update docstrings

igerber · claude · igerber · commit 5bead50cd0dc · 2026-03-22T09:36:26.000-04:00
- Remove the _cluster_indices, _n_clusters, and _store_eif instance attributes;
  pass cluster info explicitly through the aggregation methods and make
  store_eif a fit() parameter
- Extract shared _cluster_aggregate() helper used by both _compute_se_from_eif
  (scalar SE) and hausman_pretest (covariance matrix)
- Update class docstring to document cluster, control_group parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -45,6 +45,39 @@
 __all__ = ["EfficientDiD", "EfficientDiDResults", "EDiDBootstrapResults"]
 
 
+def _cluster_aggregate(
+    eif_mat: np.ndarray,
+    cluster_indices: np.ndarray,
+    n_clusters: int,
+) -> np.ndarray:
+    """Sum EIF values within clusters, then center the sums across clusters.
+
+    Parameters
+    ----------
+    eif_mat : ndarray, shape (n_units,) or (n_units, k)
+        EIF values — 1-D for a single estimand, 2-D with one column each.
+    cluster_indices : ndarray, shape (n_units,)
+        Non-negative integer cluster index per unit (fed to ``np.bincount``).
+    n_clusters : int
+        Passed as ``minlength``; an index with no units keeps a zero sum.
+
+    Returns
+    -------
+    ndarray, shape (n_clusters,) or (n_clusters, k)
+        Cluster sums minus their mean over axis 0 (per column when 2-D).
+    """
+    if eif_mat.ndim == 1:
+        sums = np.bincount(cluster_indices, weights=eif_mat, minlength=n_clusters).astype(float)
+    else:
+        sums = np.column_stack(
+            [
+                np.bincount(cluster_indices, weights=eif_mat[:, j], minlength=n_clusters)
+                for j in range(eif_mat.shape[1])  # one weighted bincount per column
+            ]
+        ).astype(float)
+    return sums - sums.mean(axis=0)  # subtract the across-cluster mean
+
+
 def _compute_se_from_eif(
     eif: np.ndarray,
     n_units: int,
@@ -58,10 +91,9 @@ def _compute_se_from_eif(
     center, and apply G/(G-1) small-sample correction.
     """
     if cluster_indices is not None and n_clusters is not None:
-        cluster_sums = np.bincount(cluster_indices, weights=eif, minlength=n_clusters)
-        cluster_mean = np.mean(cluster_sums)
+        centered = _cluster_aggregate(eif, cluster_indices, n_clusters)
         correction = n_clusters / (n_clusters - 1) if n_clusters > 1 else 1.0
-        var = correction * np.mean((cluster_sums - cluster_mean) ** 2) / n_units
+        var = correction * np.mean(centered**2) / n_units
         return float(np.sqrt(max(var, 0.0)))
     return float(np.sqrt(np.mean(eif**2) / n_units))
 
@@ -91,8 +123,17 @@ class EfficientDiD(EfficientDiDBootstrapMixin):
     alpha : float, default 0.05
         Significance level.
     cluster : str or None
-        Column name for cluster-robust SEs (not yet implemented —
-        currently only unit-level inference).
+        Column name for cluster-robust SEs.  When set, analytical SEs
+        use the Liang-Zeger clustered sandwich estimator on EIF values.
+        With ``n_bootstrap > 0``, bootstrap weights are generated at the
+        cluster level (all units in a cluster share the same weight).
+    control_group : str, default ``"never_treated"``
+        Which units serve as the comparison group:
+        ``"never_treated"`` requires a never-treated cohort (raises if
+        none exist); ``"last_cohort"`` reclassifies the latest treatment
+        cohort as pseudo-never-treated and drops post-treatment periods
+        for that cohort.  Distinct from CallawaySantAnna's
+        ``"not_yet_treated"`` — see REGISTRY.md for details.
     n_bootstrap : int, default 0
         Number of multiplier bootstrap iterations (0 = analytical only).
     bootstrap_weights : str, default ``"rademacher"``
@@ -151,7 +192,6 @@ def __init__(
         self.kernel_bandwidth = kernel_bandwidth
         self.is_fitted_ = False
         self.results_: Optional[EfficientDiDResults] = None
-        self._store_eif = False
         self._validate_params()
 
     def _validate_params(self) -> None:
@@ -229,6 +269,7 @@ def fit(
         covariates: Optional[List[str]] = None,
         aggregate: Optional[str] = None,
         balance_e: Optional[int] = None,
+        store_eif: bool = False,
     ) -> EfficientDiDResults:
         """Fit the Efficient DiD estimator.
 
@@ -397,8 +438,6 @@ def fit(
         else:
             unit_cluster_indices = None
             n_clusters = None
-        self._cluster_indices = unit_cluster_indices
-        self._n_clusters = n_clusters
 
         period_to_col = {p: i for i, p in enumerate(time_periods)}
         period_1 = time_periods[0]
@@ -709,9 +748,7 @@ def fit(
                     eif_by_gt[(g, t)] = eif_vals
 
                 # Analytical SE = sqrt(mean(EIF^2) / n)  [paper p.21]
-                se_gt = _compute_se_from_eif(
-                    eif_vals, n_units, self._cluster_indices, self._n_clusters
-                )
+                se_gt = _compute_se_from_eif(eif_vals, n_units, unit_cluster_indices, n_clusters)
 
                 t_stat, p_val, ci = safe_inference(att_gt, se_gt, alpha=self.alpha)
 
@@ -733,7 +770,13 @@ def fit(
 
         # ----- Aggregation -----
         overall_att, overall_se = self._aggregate_overall(
-            group_time_effects, eif_by_gt, n_units, cohort_fractions, unit_cohorts
+            group_time_effects,
+            eif_by_gt,
+            n_units,
+            cohort_fractions,
+            unit_cohorts,
+            cluster_indices=unit_cluster_indices,
+            n_clusters=n_clusters,
         )
         overall_t, overall_p, overall_ci = safe_inference(overall_att, overall_se, alpha=self.alpha)
 
@@ -750,6 +793,8 @@ def fit(
                 time_periods,
                 balance_e,
                 unit_cohorts=unit_cohorts,
+                cluster_indices=unit_cluster_indices,
+                n_clusters=n_clusters,
             )
         if aggregate in ("group", "all"):
             group_effects = self._aggregate_by_group(
@@ -759,6 +804,8 @@ def fit(
                 cohort_fractions,
                 treatment_groups,
                 unit_cohorts=unit_cohorts,
+                cluster_indices=unit_cluster_indices,
+                n_clusters=n_clusters,
             )
 
         # ----- Bootstrap -----
@@ -772,8 +819,8 @@ def fit(
                 balance_e=balance_e,
                 treatment_groups=treatment_groups,
                 cohort_fractions=cohort_fractions,
-                cluster_indices=self._cluster_indices,
-                n_clusters=self._n_clusters,
+                cluster_indices=unit_cluster_indices,
+                n_clusters=n_clusters,
             )
             # Update estimates with bootstrap inference
             overall_se = bootstrap_results.overall_att_se
@@ -850,7 +897,7 @@ def fit(
             efficient_weights=stored_weights if stored_weights else None,
             omega_condition_numbers=stored_cond if stored_cond else None,
             control_group=self.control_group,
-            influence_functions=eif_by_gt if self._store_eif else None,
+            influence_functions=eif_by_gt if store_eif else None,
             bootstrap_results=bootstrap_results,
             estimation_path="dr" if use_covariates else "nocov",
             sieve_k_max=self.sieve_k_max,
@@ -918,6 +965,8 @@ def _aggregate_overall(
         n_units: int,
         cohort_fractions: Dict[float, float],
         unit_cohorts: np.ndarray,
+        cluster_indices: Optional[np.ndarray] = None,
+        n_clusters: Optional[int] = None,
     ) -> Tuple[float, float]:
         """Compute overall ATT with WIF-adjusted SE.
 
@@ -965,7 +1014,7 @@ def _aggregate_overall(
         agg_eif_total = agg_eif + wif  # both O(1) scale
 
         # SE = sqrt(mean(EIF^2) / n) — standard IF-based SE
-        se = _compute_se_from_eif(agg_eif_total, n_units, self._cluster_indices, self._n_clusters)
+        se = _compute_se_from_eif(agg_eif_total, n_units, cluster_indices, n_clusters)
 
         return overall_att, se
 
@@ -979,6 +1028,8 @@ def _aggregate_event_study(
         time_periods: List[Any],
         balance_e: Optional[int] = None,
         unit_cohorts: Optional[np.ndarray] = None,
+        cluster_indices: Optional[np.ndarray] = None,
+        n_clusters: Optional[int] = None,
     ) -> Dict[int, Dict[str, Any]]:
         """Aggregate ATT(g,t) by relative time e = t - g.
 
@@ -1057,7 +1108,7 @@ def _aggregate_event_study(
                 )
                 agg_eif = agg_eif + wif
 
-            agg_se = _compute_se_from_eif(agg_eif, n_units, self._cluster_indices, self._n_clusters)
+            agg_se = _compute_se_from_eif(agg_eif, n_units, cluster_indices, n_clusters)
 
             t_stat, p_val, ci = safe_inference(agg_eff, agg_se, alpha=self.alpha)
             result[e] = {
@@ -1079,6 +1130,8 @@ def _aggregate_by_group(
         cohort_fractions: Dict[float, float],
         treatment_groups: List[Any],
         unit_cohorts: Optional[np.ndarray] = None,
+        cluster_indices: Optional[np.ndarray] = None,
+        n_clusters: Optional[int] = None,
     ) -> Dict[Any, Dict[str, Any]]:
         """Aggregate ATT(g,t) by treatment cohort.
 
@@ -1117,7 +1170,7 @@ def _aggregate_by_group(
             agg_eif = np.zeros(n_units)
             for k, gt in enumerate(g_gts):
                 agg_eif += w[k] * eif_by_gt[gt]
-            agg_se = _compute_se_from_eif(agg_eif, n_units, self._cluster_indices, self._n_clusters)
+            agg_se = _compute_se_from_eif(agg_eif, n_units, cluster_indices, n_clusters)
 
             t_stat, p_val, ci = safe_inference(agg_eff, agg_se, alpha=self.alpha)
             result[g] = {
@@ -1206,12 +1259,10 @@ def hausman_pretest(
         )
 
         edid_all = cls(pt_assumption="all", alpha=alpha, **common_kwargs)
-        edid_all._store_eif = True
-        result_all = edid_all.fit(**fit_kwargs)
+        result_all = edid_all.fit(**fit_kwargs, store_eif=True)
 
         edid_post = cls(pt_assumption="post", alpha=alpha, **common_kwargs)
-        edid_post._store_eif = True
-        result_post = edid_post.fit(**fit_kwargs)
+        result_post = edid_post.fit(**fit_kwargs, store_eif=True)
 
         # Find common (g,t) pairs — PT-Post pairs are a subset of PT-All
         common_gts = sorted(
@@ -1277,31 +1328,35 @@ def _nan_result(recommendation: str = "pt_post") -> HausmanPretestResult:
         row_finite = np.all(np.isfinite(eif_all_mat), axis=1) & np.all(
             np.isfinite(eif_post_mat), axis=1
         )
-        cl_idx = edid_all._cluster_indices
+        # Build cluster mapping for covariance if needed
+        cl_idx: Optional[np.ndarray] = None
+        n_cl: Optional[int] = None
+        if cluster is not None:
+            all_units = sorted(data[unit].unique())
+            cluster_col = data.groupby(unit)[cluster].first()
+            cluster_ids = cluster_col.reindex(all_units).values
+            unique_clusters = np.unique(cluster_ids)
+            n_cl = len(unique_clusters)
+            cluster_to_idx = {c: i for i, c in enumerate(unique_clusters)}
+            cl_idx = np.array([cluster_to_idx[c] for c in cluster_ids])
+
         if not np.all(row_finite):
             eif_all_mat = eif_all_mat[row_finite]
             eif_post_mat = eif_post_mat[row_finite]
             n_units = int(np.sum(row_finite))
             if cl_idx is not None:
                 cl_idx = cl_idx[row_finite]
 
-        # Compute full covariance matrices
-        if cl_idx is not None:
-            n_cl = edid_all._n_clusters
+        # Compute full covariance matrices using shared _cluster_aggregate
+        if cl_idx is not None and n_cl is not None:
 
-            def _cluster_cov(eif_mat: np.ndarray) -> np.ndarray:
-                s_mat = np.column_stack(
-                    [
-                        np.bincount(cl_idx, weights=eif_mat[:, j], minlength=n_cl)
-                        for j in range(eif_mat.shape[1])
-                    ]
-                )
-                s_centered = s_mat - s_mat.mean(axis=0)
+            def _eif_cov(eif_mat: np.ndarray) -> np.ndarray:
+                centered = _cluster_aggregate(eif_mat, cl_idx, n_cl)
                 correction = n_cl / (n_cl - 1) if n_cl > 1 else 1.0
-                return correction * (s_centered.T @ s_centered) / (n_units**2)
+                return correction * (centered.T @ centered) / (n_units**2)
 
-            cov_all = _cluster_cov(eif_all_mat)
-            cov_post = _cluster_cov(eif_post_mat)
+            cov_all = _eif_cov(eif_all_mat)
+            cov_post = _eif_cov(eif_post_mat)
         else:
             with np.errstate(over="ignore", invalid="ignore"):
                 cov_all = (eif_all_mat.T @ eif_all_mat) / (n_units**2)