Fix non-finite handling to align with Methodology Registry

igerber · claude · igerber · commit f3c9f059a142 · 2026-01-21T08:25:23.000-05:00
Address PR #95 reviewer feedback:

- Analytic SE: return NaN instead of zeroing to signal invalid inference
- Bootstrap: drop invalid samples and warn, preserving valid distribution
- Update test to verify methodology-aligned behavior (finite or NaN, not biased)

Per docs/methodology/REGISTRY.md: "Missing group-time cells: ATT(g,t) set to NaN"

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/staggered_aggregation.py b/diff_diff/staggered_aggregation.py
@@ -292,11 +292,11 @@ def _compute_aggregated_se_with_wif(
             warnings.warn(
                 f"Non-finite values ({n_nonfinite}/{len(wif_contrib)}) in weight influence "
                 "function computation. This may occur with very small samples or extreme "
-                "weights. SE estimates may be unreliable.",
+                "weights. Returning NaN for SE to signal invalid inference.",
                 RuntimeWarning,
                 stacklevel=2
             )
-            wif_contrib = np.where(np.isfinite(wif_contrib), wif_contrib, 0.0)
+            return np.nan  # Signal invalid inference instead of biased SE
 
         # Scale by 1/n_units to match R's getSE formula: sqrt(mean(IF^2)/n)
         psi_wif = wif_contrib / n_units
diff --git a/diff_diff/staggered_bootstrap.py b/diff_diff/staggered_bootstrap.py
@@ -379,22 +379,17 @@ def _run_multiplier_bootstrap(
                     control_weights @ control_inf
                 )
 
-            perturbations = self._check_and_fix_nonfinite(
-                perturbations, f"bootstrap perturbations for ATT(g,t) {gt_pairs[j]}"
-            )
+            # Let non-finite values propagate - they will be handled at statistics computation
             bootstrap_atts_gt[:, j] = original_atts[j] + perturbations
 
         # Vectorized overall ATT: matrix-vector multiply
         # Shape: (n_bootstrap,)
-        # Suppress RuntimeWarnings for edge cases
+        # Suppress RuntimeWarnings for edge cases - non-finite values handled at statistics computation
         with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
             bootstrap_overall = bootstrap_atts_gt @ overall_weights
 
-        bootstrap_overall = self._check_and_fix_nonfinite(
-            bootstrap_overall, "bootstrap overall ATT aggregation"
-        )
-
         # Vectorized event study aggregation
+        # Non-finite values handled at statistics computation stage
         rel_periods: List[int] = []
         bootstrap_event_study: Optional[Dict[int, np.ndarray]] = None
         if event_study_info is not None:
@@ -409,11 +404,8 @@ def _run_multiplier_bootstrap(
                 with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                     bootstrap_event_study[e] = bootstrap_atts_gt[:, gt_indices] @ weights
 
-                bootstrap_event_study[e] = self._check_and_fix_nonfinite(
-                    bootstrap_event_study[e], f"bootstrap event study aggregation (e={e})"
-                )
-
         # Vectorized group aggregation
+        # Non-finite values handled at statistics computation stage
         group_list: List[Any] = []
         bootstrap_group: Optional[Dict[Any, np.ndarray]] = None
         if group_agg_info is not None:
@@ -427,26 +419,24 @@ def _run_multiplier_bootstrap(
                 with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                     bootstrap_group[g] = bootstrap_atts_gt[:, gt_indices] @ weights
 
-                bootstrap_group[g] = self._check_and_fix_nonfinite(
-                    bootstrap_group[g], f"bootstrap group aggregation (g={g})"
-                )
-
         # Compute bootstrap statistics for ATT(g,t)
         gt_ses = {}
         gt_cis = {}
         gt_p_values = {}
 
         for j, gt in enumerate(gt_pairs):
             se, ci, p_value = self._compute_effect_bootstrap_stats(
-                original_atts[j], bootstrap_atts_gt[:, j]
+                original_atts[j], bootstrap_atts_gt[:, j],
+                context=f"ATT(g={gt[0]}, t={gt[1]})"
             )
             gt_ses[gt] = se
             gt_cis[gt] = ci
             gt_p_values[gt] = p_value
 
         # Compute bootstrap statistics for overall ATT
         overall_se, overall_ci, overall_p_value = self._compute_effect_bootstrap_stats(
-            original_overall, bootstrap_overall
+            original_overall, bootstrap_overall,
+            context="overall ATT"
         )
 
         # Compute bootstrap statistics for event study effects
@@ -461,7 +451,8 @@ def _run_multiplier_bootstrap(
 
             for e in rel_periods:
                 se, ci, p_value = self._compute_effect_bootstrap_stats(
-                    event_study_info[e]['effect'], bootstrap_event_study[e]
+                    event_study_info[e]['effect'], bootstrap_event_study[e],
+                    context=f"event study (e={e})"
                 )
                 event_study_ses[e] = se
                 event_study_cis[e] = ci
@@ -479,7 +470,8 @@ def _run_multiplier_bootstrap(
 
             for g in group_list:
                 se, ci, p_value = self._compute_effect_bootstrap_stats(
-                    group_agg_info[g]['effect'], bootstrap_group[g]
+                    group_agg_info[g]['effect'], bootstrap_group[g],
+                    context=f"group effect (g={g})"
                 )
                 group_effect_ses[g] = se
                 group_effect_cis[g] = ci
@@ -640,16 +632,23 @@ def _compute_effect_bootstrap_stats(
         self,
         original_effect: float,
         boot_dist: np.ndarray,
+        context: str = "bootstrap distribution",
     ) -> Tuple[float, Tuple[float, float], float]:
         """
         Compute bootstrap statistics for a single effect.
 
+        Non-finite bootstrap samples are dropped and a warning is issued if any
+        are present. If too few valid samples remain (<50%), returns NaN for all
+        statistics to signal invalid inference.
+
         Parameters
         ----------
         original_effect : float
             Original point estimate.
         boot_dist : np.ndarray
             Bootstrap distribution of the effect.
+        context : str, optional
+            Description for warning messages, by default "bootstrap distribution".
 
         Returns
         -------
@@ -660,35 +659,65 @@ def _compute_effect_bootstrap_stats(
         p_value : float
             Bootstrap p-value.
         """
-        se = float(np.std(boot_dist, ddof=1))
-        ci = self._compute_percentile_ci(boot_dist, self.alpha)
-        p_value = self._compute_bootstrap_pvalue(original_effect, boot_dist)
+        # Filter out non-finite values
+        finite_mask = np.isfinite(boot_dist)
+        n_valid = np.sum(finite_mask)
+        n_total = len(boot_dist)
+
+        if n_valid < n_total:
+            import warnings
+            n_nonfinite = n_total - n_valid
+            warnings.warn(
+                f"Dropping {n_nonfinite}/{n_total} non-finite bootstrap samples in {context}. "
+                "This may occur with very small samples or extreme weights. "
+                "Bootstrap estimates based on remaining valid samples.",
+                RuntimeWarning,
+                stacklevel=3
+            )
+
+        # Check if we have enough valid samples
+        if n_valid < n_total * 0.5:
+            import warnings
+            warnings.warn(
+                f"Too few valid bootstrap samples ({n_valid}/{n_total}) in {context}. "
+                "Returning NaN for SE/CI/p-value to signal invalid inference.",
+                RuntimeWarning,
+                stacklevel=3
+            )
+            return np.nan, (np.nan, np.nan), np.nan
+
+        # Use only valid samples
+        valid_dist = boot_dist[finite_mask]
+
+        se = float(np.std(valid_dist, ddof=1))
+        ci = self._compute_percentile_ci(valid_dist, self.alpha)
+        p_value = self._compute_bootstrap_pvalue(original_effect, valid_dist)
         return se, ci, p_value
 
-    def _check_and_fix_nonfinite(self, arr: np.ndarray, context: str) -> np.ndarray:
-        """Check for non-finite values and warn if found.
+    def _mask_nonfinite_samples(self, arr: np.ndarray, context: str) -> np.ndarray:
+        """Return boolean mask of finite samples, warning if any dropped.
 
         Parameters
         ----------
         arr : np.ndarray
-            Array to check.
+            Array to check (1D bootstrap distribution).
         context : str
             Description of where this check is happening (for warning message).
 
         Returns
         -------
         np.ndarray
-            Array with non-finite values replaced by 0.0.
+            Boolean mask where True indicates finite (valid) samples.
         """
-        if not np.all(np.isfinite(arr)):
+        finite_mask = np.isfinite(arr)
+        if not np.all(finite_mask):
             import warnings
-            n_nonfinite = np.sum(~np.isfinite(arr))
+            n_nonfinite = np.sum(~finite_mask)
             warnings.warn(
-                f"Non-finite values ({n_nonfinite}/{arr.size}) in {context}. "
+                f"Dropping {n_nonfinite}/{arr.size} non-finite bootstrap samples in {context}. "
                 "This may occur with very small samples or extreme weights. "
-                "Bootstrap estimates may be unreliable.",
+                "Bootstrap estimates based on remaining valid samples.",
                 RuntimeWarning,
                 stacklevel=3
             )
-            return np.where(np.isfinite(arr), arr, 0.0)
-        return arr
+        return finite_mask
diff --git a/tests/test_staggered.py b/tests/test_staggered.py
@@ -682,7 +682,14 @@ def test_extreme_propensity_scores(self):
         assert results.overall_se > 0, "SE should be positive"
 
     def test_extreme_weights_warning(self):
-        """Test that extreme weights produce warnings, not silent failures."""
+        """Test that extreme weights produce warnings and methodology-aligned behavior.
+
+        Per the Methodology Registry (docs/methodology/REGISTRY.md):
+        - Missing group-time cells: ATT(g,t) set to NaN
+        - Analytic SE: returns NaN to signal invalid inference (not biased via zeroing)
+        - Bootstrap: drops invalid samples and warns, preserving valid distribution
+        """
+        import warnings
         np.random.seed(42)
 
         # Minimal dataset: very small sample with unbalanced groups
@@ -705,7 +712,7 @@ def test_extreme_weights_warning(self):
             'first_treat': first_treat_expanded.astype(int),
         })
 
-        # Test without bootstrap first
+        # Test without bootstrap - ATT should be finite, SE may be NaN for edge cases
         cs = CallawaySantAnna()
         results = cs.fit(
             data,
@@ -715,24 +722,31 @@ def test_extreme_weights_warning(self):
             first_treat='first_treat'
         )
 
-        # Results should be finite even in edge cases
+        # ATT point estimate should be finite
         assert np.isfinite(results.overall_att), "ATT should be finite"
-        assert np.isfinite(results.overall_se), "SE should be finite"
+        # SE is either finite (valid) or NaN (signals invalid inference) - not biased
+        assert np.isfinite(results.overall_se) or np.isnan(results.overall_se), \
+            "SE should be finite or NaN (not inf)"
 
-        # Test with bootstrap enabled
-        cs_boot = CallawaySantAnna(n_bootstrap=50, seed=42)
-        boot_results = cs_boot.fit(
-            data,
-            outcome='outcome',
-            unit='unit',
-            time='time',
-            first_treat='first_treat'
-        )
+        # Test with bootstrap - should drop invalid samples with warning
+        cs_boot = CallawaySantAnna(n_bootstrap=100, seed=42)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            boot_results = cs_boot.fit(
+                data,
+                outcome='outcome',
+                unit='unit',
+                time='time',
+                first_treat='first_treat'
+            )
 
-        # Bootstrap should also produce finite results
+        # ATT should be finite
         assert np.isfinite(boot_results.overall_att), "ATT should be finite"
+        # Bootstrap SE based on valid samples - may be finite or NaN
         assert boot_results.bootstrap_results is not None, "Bootstrap results should exist"
-        assert np.isfinite(boot_results.overall_se), "Bootstrap SE should be finite"
+        assert np.isfinite(boot_results.overall_se) or np.isnan(boot_results.overall_se), \
+            "Bootstrap SE should be finite or NaN (not inf)"
 
     def test_near_collinear_covariates(self):
         """Test that near-collinear covariates are handled gracefully."""