Skip to content

Commit 61aea00

Browse files
authored
Merge pull request #326 from igerber/fix/power-analysis-simulation-failure-rate
Surface PowerAnalysis simulation-failure count and narrow except clause
2 parents 5798d09 + a1a522d commit 61aea00

3 files changed

Lines changed: 177 additions & 2 deletions

File tree

diff_diff/power.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,13 @@ class SimulationPowerResults:
995995
coverage : float
996996
Proportion of CIs containing true effect.
997997
n_simulations : int
998-
Number of simulations performed.
998+
Number of simulations performed (successful count; see
999+
``n_simulation_failures`` for failed-replicate count).
1000+
n_simulation_failures : int
1001+
Number of simulations at the primary effect size whose `estimator.fit`
1002+
(or result extraction) raised an exception and was skipped. Lets
1003+
callers programmatically detect fragile DGP/estimator pairings; a
1004+
proportional warning is also emitted above a 10% failure rate.
9991005
effect_sizes : List[float]
10001006
Effect sizes tested (if multiple).
10011007
powers : List[float]
@@ -1032,6 +1038,7 @@ class SimulationPowerResults:
10321038
survey_config: Optional[Any] = field(default=None, repr=False)
10331039
mean_deff: Optional[float] = None
10341040
mean_icc_realized: Optional[float] = None
1041+
n_simulation_failures: int = 0
10351042

10361043
def __post_init__(self):
10371044
"""Compute derived statistics."""
@@ -1062,6 +1069,7 @@ def summary(self) -> str:
10621069
"",
10631070
f"{'Estimator:':<35} {self.estimator_name}",
10641071
f"{'Number of simulations:':<35} {self.n_simulations}",
1072+
f"{'Simulation failures:':<35} {self.n_simulation_failures}",
10651073
f"{'True treatment effect:':<35} {self.true_effect:.4f}",
10661074
f"{'Significance level (alpha):':<35} {self.alpha:.3f}",
10671075
"",
@@ -1130,6 +1138,7 @@ def to_dict(self) -> Dict[str, Any]:
11301138
"mean_se": self.mean_se,
11311139
"coverage": self.coverage,
11321140
"n_simulations": self.n_simulations,
1141+
"n_simulation_failures": self.n_simulation_failures,
11331142
"true_effect": self.true_effect,
11341143
"alpha": self.alpha,
11351144
"estimator_name": self.estimator_name,
@@ -2097,6 +2106,7 @@ def simulate_power(
20972106
primary_p_values: List[float] = []
20982107
primary_rejections: List[bool] = []
20992108
primary_ci_contains: List[bool] = []
2109+
primary_n_failures = 0
21002110

21012111
# Survey DGP truth accumulation (DEFF/ICC are DGP properties,
21022112
# independent of effect size, so averaging across all sims is correct)
@@ -2238,7 +2248,13 @@ def simulate_power(
22382248
rejections.append(rejected)
22392249
ci_contains_true.append(ci[0] <= effect <= ci[1])
22402250

2241-
except Exception as e:
2251+
except (
2252+
ValueError,
2253+
np.linalg.LinAlgError,
2254+
KeyError,
2255+
RuntimeError,
2256+
ZeroDivisionError,
2257+
) as e:
22422258
n_failures += 1
22432259
if progress:
22442260
print(f" Warning: Simulation {sim} failed: {e}")
@@ -2266,6 +2282,7 @@ def simulate_power(
22662282
primary_p_values = p_values
22672283
primary_rejections = rejections
22682284
primary_ci_contains = ci_contains_true
2285+
primary_n_failures = n_failures
22692286

22702287
# Compute confidence interval for power (primary effect)
22712288
power_val = all_powers[primary_idx]
@@ -2292,6 +2309,7 @@ def simulate_power(
22922309
mean_se=mean_se,
22932310
coverage=coverage,
22942311
n_simulations=n_valid,
2312+
n_simulation_failures=primary_n_failures,
22952313
effect_sizes=effect_sizes,
22962314
powers=all_powers,
22972315
true_effect=primary_effect,

docs/methodology/REGISTRY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2353,6 +2353,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
23532353
- **Note:** The simulation-based power registry (`simulate_power`, `simulate_mde`, `simulate_sample_size`) uses a single-cohort staggered DGP by default. Estimators configured with `control_group="not_yet_treated"`, `clean_control="strict"`, or `anticipation>0` will receive a `UserWarning` because the default DGP does not match their identification strategy. Users must supply `data_generator_kwargs` (e.g., `cohort_periods=[2, 4]`, `never_treated_frac=0.0`) or a custom `data_generator` to match the estimator design.
23542354
- **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
23552355
- **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
2356+
- **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
23562357
- **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations. `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units` which varies during bisection).
23572358

23582359
**Reference implementation(s):**

tests/test_power.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,162 @@ class Result:
556556
# Should have completed successfully without warning
557557
assert len([x for x in w if "simulations" in str(x.message)]) == 0
558558

559+
def test_simulation_failure_counter_on_result(self):
560+
"""`n_simulation_failures` on the result object surfaces the internal counter."""
561+
from diff_diff.prep import generate_did_data
562+
563+
class AlternatingFailingEstimator:
564+
"""Raises ValueError on every other call — ~50% failure rate."""
565+
566+
def __init__(self):
567+
self.call_count = 0
568+
569+
def fit(self, data, **kwargs):
570+
self.call_count += 1
571+
if self.call_count % 2 == 0:
572+
raise ValueError("forced simulated failure")
573+
574+
class Result:
575+
att = 5.0
576+
se = 1.0
577+
p_value = 0.01
578+
conf_int = (3.0, 7.0)
579+
580+
return Result()
581+
582+
estimator = AlternatingFailingEstimator()
583+
with warnings.catch_warnings():
584+
warnings.simplefilter("ignore", UserWarning)
585+
results = simulate_power(
586+
estimator=estimator,
587+
n_simulations=20,
588+
progress=False,
589+
data_generator=generate_did_data,
590+
)
591+
592+
assert results.n_simulation_failures == 10
593+
assert results.n_simulations == 10
594+
assert results.n_simulation_failures + results.n_simulations == 20
595+
596+
def test_simulation_failure_counter_zero_on_clean_run(self):
597+
"""Clean run: counter is exactly 0, not omitted or None."""
598+
did = DifferenceInDifferences()
599+
results = simulate_power(
600+
estimator=did,
601+
n_units=50,
602+
n_periods=4,
603+
treatment_effect=5.0,
604+
sigma=2.0,
605+
n_simulations=15,
606+
seed=42,
607+
progress=False,
608+
)
609+
assert results.n_simulation_failures == 0
610+
611+
def test_simulation_does_not_swallow_programming_errors(self):
612+
"""`TypeError` (programming error) must propagate, not be absorbed as a failure."""
613+
from diff_diff.prep import generate_did_data
614+
615+
class TypeErrorEstimator:
616+
"""Raises TypeError — a programming bug signal, not a DGP failure."""
617+
618+
def fit(self, data, **kwargs):
619+
raise TypeError("programming bug — must propagate")
620+
621+
with pytest.raises(TypeError, match="programming bug"):
622+
simulate_power(
623+
estimator=TypeErrorEstimator(),
624+
n_simulations=5,
625+
progress=False,
626+
data_generator=generate_did_data,
627+
)
628+
629+
def test_simulation_all_failed_raises_runtime_error(self):
630+
"""All simulations failing: narrow-except path still raises RuntimeError."""
631+
from diff_diff.prep import generate_did_data
632+
633+
class AlwaysFailingEstimator:
634+
def fit(self, data, **kwargs):
635+
raise ValueError("every replicate fails")
636+
637+
with warnings.catch_warnings():
638+
warnings.simplefilter("ignore", UserWarning)
639+
with pytest.raises(RuntimeError, match="All simulations failed"):
640+
simulate_power(
641+
estimator=AlwaysFailingEstimator(),
642+
n_simulations=5,
643+
progress=False,
644+
data_generator=generate_did_data,
645+
)
646+
647+
def test_simulation_failure_counter_survives_serialization(self):
648+
"""`n_simulation_failures` round-trips through to_dict/to_dataframe."""
649+
from diff_diff.prep import generate_did_data
650+
651+
class AlternatingFailingEstimator:
652+
def __init__(self):
653+
self.call_count = 0
654+
655+
def fit(self, data, **kwargs):
656+
self.call_count += 1
657+
if self.call_count % 2 == 0:
658+
raise ValueError("forced simulated failure")
659+
660+
class Result:
661+
att = 5.0
662+
se = 1.0
663+
p_value = 0.01
664+
conf_int = (3.0, 7.0)
665+
666+
return Result()
667+
668+
with warnings.catch_warnings():
669+
warnings.simplefilter("ignore", UserWarning)
670+
results = simulate_power(
671+
estimator=AlternatingFailingEstimator(),
672+
n_simulations=20,
673+
progress=False,
674+
data_generator=generate_did_data,
675+
)
676+
677+
serialized = results.to_dict()
678+
assert "n_simulation_failures" in serialized
679+
assert serialized["n_simulation_failures"] == results.n_simulation_failures == 10
680+
681+
def test_simulation_failure_rate_warning_above_threshold(self):
682+
"""10% threshold: >10% failure still warns with the per-effect-size message."""
683+
from diff_diff.prep import generate_did_data
684+
685+
class MostlyFailingEstimator:
686+
"""Fails 16/20 calls (80% failure rate) — triggers warning."""
687+
688+
def __init__(self):
689+
self.call_count = 0
690+
691+
def fit(self, data, **kwargs):
692+
self.call_count += 1
693+
if self.call_count % 5 != 0:
694+
raise ValueError("forced failure")
695+
696+
class Result:
697+
att = 5.0
698+
se = 1.0
699+
p_value = 0.01
700+
conf_int = (3.0, 7.0)
701+
702+
return Result()
703+
704+
with pytest.warns(UserWarning, match=r"simulations .* failed for effect_size="):
705+
results = simulate_power(
706+
estimator=MostlyFailingEstimator(),
707+
n_simulations=20,
708+
progress=False,
709+
data_generator=generate_did_data,
710+
)
711+
712+
assert results.n_simulation_failures == 16
713+
assert results.n_simulations == 4
714+
559715

560716
class TestVisualization:
561717
"""Tests for power curve visualization."""

0 commit comments

Comments
 (0)