Merge pull request #326 from igerber/fix/power-analysis-simulation-failure-rate

igerber · web-flow · commit 61aea00ccd09 · 2026-04-19T06:06:30.000-04:00
Surface PowerAnalysis simulation-failure count and narrow except clause
diff --git a/diff_diff/power.py b/diff_diff/power.py
@@ -995,7 +995,13 @@ class SimulationPowerResults:
     coverage : float
         Proportion of CIs containing true effect.
     n_simulations : int
-        Number of simulations performed.
+        Number of simulations performed (successful count; see
+        ``n_simulation_failures`` for failed-replicate count).
+    n_simulation_failures : int
+        Number of simulations at the primary effect size whose `estimator.fit`
+        (or result extraction) raised an exception and was skipped. Lets
+        callers programmatically detect fragile DGP/estimator pairings; a
+        proportional warning is also emitted above a 10% failure rate.
     effect_sizes : List[float]
         Effect sizes tested (if multiple).
     powers : List[float]
@@ -1032,6 +1038,7 @@ class SimulationPowerResults:
     survey_config: Optional[Any] = field(default=None, repr=False)
     mean_deff: Optional[float] = None
     mean_icc_realized: Optional[float] = None
+    n_simulation_failures: int = 0
 
     def __post_init__(self):
         """Compute derived statistics."""
@@ -1062,6 +1069,7 @@ def summary(self) -> str:
             "",
             f"{'Estimator:':<35} {self.estimator_name}",
             f"{'Number of simulations:':<35} {self.n_simulations}",
+            f"{'Simulation failures:':<35} {self.n_simulation_failures}",
             f"{'True treatment effect:':<35} {self.true_effect:.4f}",
             f"{'Significance level (alpha):':<35} {self.alpha:.3f}",
             "",
@@ -1130,6 +1138,7 @@ def to_dict(self) -> Dict[str, Any]:
             "mean_se": self.mean_se,
             "coverage": self.coverage,
             "n_simulations": self.n_simulations,
+            "n_simulation_failures": self.n_simulation_failures,
             "true_effect": self.true_effect,
             "alpha": self.alpha,
             "estimator_name": self.estimator_name,
@@ -2097,6 +2106,7 @@ def simulate_power(
     primary_p_values: List[float] = []
     primary_rejections: List[bool] = []
     primary_ci_contains: List[bool] = []
+    primary_n_failures = 0
 
     # Survey DGP truth accumulation (DEFF/ICC are DGP properties,
     # independent of effect size, so averaging across all sims is correct)
@@ -2238,7 +2248,13 @@ def simulate_power(
                 rejections.append(rejected)
                 ci_contains_true.append(ci[0] <= effect <= ci[1])
 
-            except Exception as e:
+            except (
+                ValueError,
+                np.linalg.LinAlgError,
+                KeyError,
+                RuntimeError,
+                ZeroDivisionError,
+            ) as e:
                 n_failures += 1
                 if progress:
                     print(f"  Warning: Simulation {sim} failed: {e}")
@@ -2266,6 +2282,7 @@ def simulate_power(
             primary_p_values = p_values
             primary_rejections = rejections
             primary_ci_contains = ci_contains_true
+            primary_n_failures = n_failures
 
     # Compute confidence interval for power (primary effect)
     power_val = all_powers[primary_idx]
@@ -2292,6 +2309,7 @@ def simulate_power(
         mean_se=mean_se,
         coverage=coverage,
         n_simulations=n_valid,
+        n_simulation_failures=primary_n_failures,
         effect_sizes=effect_sizes,
         powers=all_powers,
         true_effect=primary_effect,
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -2353,6 +2353,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 - **Note:** The simulation-based power registry (`simulate_power`, `simulate_mde`, `simulate_sample_size`) uses a single-cohort staggered DGP by default. Estimators configured with `control_group="not_yet_treated"`, `clean_control="strict"`, or `anticipation>0` will receive a `UserWarning` because the default DGP does not match their identification strategy. Users must supply `data_generator_kwargs` (e.g., `cohort_periods=[2, 4]`, `never_treated_frac=0.0`) or a custom `data_generator` to match the estimator design.
 - **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
 - **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
+- **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
 - **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations. `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units` which varies during bisection).
 
 **Reference implementation(s):**
diff --git a/tests/test_power.py b/tests/test_power.py
@@ -556,6 +556,162 @@ class Result:
             # Should have completed successfully without warning
             assert len([x for x in w if "simulations" in str(x.message)]) == 0
 
+    def test_simulation_failure_counter_on_result(self):
+        """`n_simulation_failures` on the result object surfaces the internal counter."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            """Raises ValueError on every other call — ~50% failure rate."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        estimator = AlternatingFailingEstimator()
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=estimator,
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 10
+        assert results.n_simulations == 10
+        assert results.n_simulation_failures + results.n_simulations == 20
+
+    def test_simulation_failure_counter_zero_on_clean_run(self):
+        """Clean run: counter is exactly 0, not omitted or None."""
+        did = DifferenceInDifferences()
+        results = simulate_power(
+            estimator=did,
+            n_units=50,
+            n_periods=4,
+            treatment_effect=5.0,
+            sigma=2.0,
+            n_simulations=15,
+            seed=42,
+            progress=False,
+        )
+        assert results.n_simulation_failures == 0
+
+    def test_simulation_does_not_swallow_programming_errors(self):
+        """`TypeError` (programming error) must propagate, not be absorbed as a failure."""
+        from diff_diff.prep import generate_did_data
+
+        class TypeErrorEstimator:
+            """Raises TypeError — a programming bug signal, not a DGP failure."""
+
+            def fit(self, data, **kwargs):
+                raise TypeError("programming bug — must propagate")
+
+        with pytest.raises(TypeError, match="programming bug"):
+            simulate_power(
+                estimator=TypeErrorEstimator(),
+                n_simulations=5,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+    def test_simulation_all_failed_raises_runtime_error(self):
+        """All simulations failing: narrow-except path still raises RuntimeError."""
+        from diff_diff.prep import generate_did_data
+
+        class AlwaysFailingEstimator:
+            def fit(self, data, **kwargs):
+                raise ValueError("every replicate fails")
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            with pytest.raises(RuntimeError, match="All simulations failed"):
+                simulate_power(
+                    estimator=AlwaysFailingEstimator(),
+                    n_simulations=5,
+                    progress=False,
+                    data_generator=generate_did_data,
+                )
+
+    def test_simulation_failure_counter_survives_serialization(self):
+        """`n_simulation_failures` round-trips through to_dict/to_dataframe."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=AlternatingFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        serialized = results.to_dict()
+        assert "n_simulation_failures" in serialized
+        assert serialized["n_simulation_failures"] == results.n_simulation_failures == 10
+
+    def test_simulation_failure_rate_warning_above_threshold(self):
+        """10% threshold: >10% failure still warns with the per-effect-size message."""
+        from diff_diff.prep import generate_did_data
+
+        class MostlyFailingEstimator:
+            """Fails 16/20 calls (80% failure rate) — triggers warning."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 5 != 0:
+                    raise ValueError("forced failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with pytest.warns(UserWarning, match=r"simulations .* failed for effect_size="):
+            results = simulate_power(
+                estimator=MostlyFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 16
+        assert results.n_simulations == 4
+
 
 class TestVisualization:
     """Tests for power curve visualization."""