Close axis-C/J silent-failures audit: B-spline derivative + PA survey cache

igerber · claude · igerber · commit bac7a2bf40e1 · 2026-04-19T17:39:41.000-04:00
Bundles the two remaining S-complexity findings from the Phase 2 audit, closing Phase 3 execution. Finding #12 — ContinuousDiD B-spline degenerate knot (axis C, Minor, `continuous_did_bspline.py:153`): `bspline_derivative_design_matrix` silently swallowed `ValueError` from `scipy.interpolate.BSpline` in the per-basis derivative loop, leaving affected columns of the derivative design matrix as zero with no user-visible signal. Downstream ContinuousDiD analytical inference then fed a biased `dPsi` into SE computation. Fix aggregates failed-basis indices and emits ONE `UserWarning` naming them. The all-identical-knot degenerate case (single dose value, `knots[0] == knots[-1]`) remains silently handled — derivatives there are mathematically zero, well-defined, and always have been. Finding #28 — PowerAnalysis survey-design cache staleness (axis J, Major, `power.py:171-180`): `_build_survey_design()` populated `self._cached_survey_design` on first call and never invalidated. Mutating `config.survey_design` after `__init__` silently returned the stale cached design. Default construction is microseconds and user-provided designs are reference copies, so the cache never earned its cost. Fix drops the cache entirely; method now reflects live `self.survey_design` every call. Six new tests: - `tests/test_continuous_did.py::TestBSplineDerivativeDegenerateBasis` (3): single-dose silent contract, `ValueError`-forced aggregate warning, happy-path no-warning regression. - `tests/test_power.py::TestSurveyPowerConfigDesignStaleness` (3): mutate-survey_design-picks-up-new, clearing-falls-back-to-default, repeat-calls-equivalent regression. REGISTRY notes added under §ContinuousDiD (edge cases) and §PowerAnalysis (`survey_config` section). Audit state post-PR: all 28 actionable Phase-2 findings resolved (26 in prior PRs; #12 + #28 here). Three P1 follow-ups remain logged in `TODO.md` from PR #337's discovered divergences (FW/PGD algorithmic mismatch in `compute_synthetic_weights`, TROP grid-search on rank- deficient Y, TROP bootstrap RNG unification). Those are post-audit cleanup work, not Phase-3 scope. No behavioral changes on clean inputs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/continuous_did_bspline.py b/diff_diff/continuous_did_bspline.py
@@ -5,6 +5,8 @@
 the dose-response curve estimation in ContinuousDiD.
 """
 
+import warnings
+
 import numpy as np
 from scipy.interpolate import BSpline
 
@@ -140,9 +142,12 @@ def bspline_derivative_design_matrix(x, knots, degree, include_intercept=True):
 
     # Check if knot vector is degenerate (all identical, e.g. single dose)
     if knots[0] == knots[-1]:
-        # All knots identical: derivatives are all zero
+        # All knots identical: derivatives are all zero — this is a
+        # mathematically well-defined degenerate case (single dose value
+        # means no dose variation to differentiate), handled silently.
         pass
     else:
+        failed_basis_indices = []
         for j in range(n_basis):
             c = np.zeros(n_basis)
             c[j] = 1.0
@@ -151,8 +156,28 @@ def bspline_derivative_design_matrix(x, knots, degree, include_intercept=True):
                 deriv_j = spline_j.derivative()
                 dB[:, j] = deriv_j(x_clamped)
             except ValueError:
-                # Degenerate knot vector: derivative is zero
-                pass
+                # Finding #12 (axis C, silent-failures audit): silent pass
+                # on ValueError meant a malformed knot vector (too few
+                # knots for the degree, non-monotonic, etc.) quietly set
+                # whole columns of the derivative design matrix to zero.
+                # Downstream ContinuousDiD inference then used a silently
+                # biased dPsi matrix. Track affected basis indices so we
+                # can surface ONE aggregate warning.
+                failed_basis_indices.append(j)
+
+        if failed_basis_indices:
+            warnings.warn(
+                f"B-spline derivative construction failed for "
+                f"{len(failed_basis_indices)} of {n_basis} basis function(s) "
+                f"(indices {failed_basis_indices}); their derivative columns "
+                f"are zero. This typically indicates a malformed knot vector "
+                f"(too few knots for the chosen degree, non-monotonic, or "
+                f"repeated interior knots). ContinuousDiD inference may be "
+                f"biased; consider increasing the number of distinct doses "
+                f"or reducing the B-spline degree.",
+                UserWarning,
+                stacklevel=2,
+            )
 
     if include_intercept:
         # Drop first column (intercept derivative = 0), prepend zeros
diff --git a/diff_diff/power.py b/diff_diff/power.py
@@ -167,17 +167,24 @@ def __post_init__(self) -> None:
             )
 
     def _build_survey_design(self) -> Any:
-        """Return cached SurveyDesign (built once, reused across simulations)."""
-        if not hasattr(self, "_cached_survey_design"):
-            if self.survey_design is not None:
-                self._cached_survey_design = self.survey_design
-            else:
-                from diff_diff.survey import SurveyDesign
+        """Return a SurveyDesign for this config.
+
+        Reflects the live ``self.survey_design`` value every call (no
+        caching). Finding #28 (axis J, silent-failures audit): the
+        previous ``_cached_survey_design`` was populated on first call
+        and never invalidated on mutation, so ``config.survey_design =
+        other_design`` silently kept returning the original. Since the
+        default ``SurveyDesign(...)`` construction is microseconds and
+        user-provided designs are just reference copies, there's no cache
+        cost worth keeping.
+        """
+        if self.survey_design is not None:
+            return self.survey_design
+        from diff_diff.survey import SurveyDesign
 
-                self._cached_survey_design = SurveyDesign(
-                    weights="weight", strata="stratum", psu="psu", fpc="fpc"
-                )
-        return self._cached_survey_design
+        return SurveyDesign(
+            weights="weight", strata="stratum", psu="psu", fpc="fpc"
+        )
 
     @property
     def min_viable_n(self) -> int:
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -723,6 +723,7 @@ See `docs/methodology/continuous-did.md` Section 4 for full details.
   not-yet-treated controls. When `anticipation=0` (default), behavior is
   unchanged.
 - **Boundary knots**: Knots are built once from all treated doses (global, not per-cell) to ensure a common basis across (g,t) cells for aggregation. Evaluation grid is clamped to training-dose boundary knots (`range(dose)`). R's `contdid` v0.1.0 has an inconsistency where `splines2::bSpline(dvals)` uses `range(dvals)` instead of `range(dose)`, which can produce extrapolation artifacts at dose grid extremes. Our approach avoids extrapolation and is methodologically sound.
+- **Note:** `bspline_derivative_design_matrix` previously swallowed `ValueError` from `scipy.interpolate.BSpline` in the per-basis derivative loop, leaving affected columns of the derivative design matrix as zero with no user-facing signal. It now aggregates the failed basis indices and emits ONE `UserWarning` naming them so downstream ContinuousDiD inference doesn't silently use a biased `dPsi`. The all-identical-knot degenerate case (single dose value) remains silently handled — derivatives there are mathematically zero. Axis-C finding #12 in the Phase 2 silent-failures audit.
 
 ### Implementation Checklist
 
@@ -2582,6 +2583,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 - **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
 - **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
 - **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
+- **Note:** `SurveyPowerConfig._build_survey_design()` no longer caches its return value in `self._cached_survey_design`. Mutating `config.survey_design` (or any other config field) after the first call used to silently return the stale cached design; the method now returns the live `self.survey_design` (or the default construction) every call. Construction is microseconds — the cache never earned its complexity. Axis-J finding #28 in the Phase 2 silent-failures audit.
 - **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations. `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units` which varies during bisection).
 
 **Reference implementation(s):**
diff --git a/tests/test_continuous_did.py b/tests/test_continuous_did.py
@@ -102,6 +102,109 @@ def test_linear_basis(self):
         assert B.shape[1] == 2  # intercept + 1 basis fn
 
 
+# ---------------------------------------------------------------------------
+# Finding #12 (axis C, silent-failures audit). Previously
+# `bspline_derivative_design_matrix` silently swallowed ValueError in the
+# per-basis derivative loop, leaving affected columns of the derivative
+# design matrix as zero with no user-visible signal. ContinuousDiD's
+# analytical inference then fed a biased dPsi into downstream SE
+# computation. The fix aggregates failed-basis indices and emits ONE
+# UserWarning naming them.
+# ---------------------------------------------------------------------------
+
+
+class TestBSplineDerivativeDegenerateBasis:
+    def test_single_dose_is_silent(self):
+        """All-identical knots (single dose value) is a well-defined
+        degenerate case — derivatives are mathematically zero and the
+        function returns silently. Regression-guard the existing contract."""
+        x = np.array([3.0, 3.0, 3.0, 3.0])
+        knots = np.array([3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0])  # all identical
+        import warnings as _w
+
+        with _w.catch_warnings(record=True) as caught:
+            _w.simplefilter("always")
+            dB = bspline_derivative_design_matrix(x, knots, degree=3, include_intercept=True)
+        deriv_warnings = [
+            w for w in caught if "B-spline derivative construction failed" in str(w.message)
+        ]
+        assert deriv_warnings == [], (
+            "All-identical knots should be handled silently (mathematically "
+            "well-defined zero-derivative case); warning fired unexpectedly: "
+            f"{[str(w.message) for w in deriv_warnings]}"
+        )
+        np.testing.assert_array_equal(dB, np.zeros_like(dB))
+
+    def test_valueerror_from_bspline_emits_aggregate_warning(self):
+        """When BSpline construction raises ValueError for some basis
+        functions (malformed knot vector, etc.), the new aggregate
+        UserWarning must fire naming the affected indices."""
+        from unittest.mock import patch
+
+        import diff_diff.continuous_did_bspline as bspline_mod
+
+        dose = np.linspace(1, 5, 30)
+        knots, deg = build_bspline_basis(dose, degree=3, num_knots=1)
+        x = np.linspace(1.5, 4.5, 20)
+
+        # Force ValueError on basis indices 1 and 3 only; the rest run
+        # through normally. This is the partial-failure mode the audit
+        # called out.
+        real_bspline = bspline_mod.BSpline
+        call_counter = {"n": 0}
+
+        def flaky_bspline(knots, c, degree):
+            # c is a one-hot vector; the index set to 1 is the basis j
+            j = int(np.argmax(c))
+            call_counter["n"] += 1
+            if j in (1, 3):
+                raise ValueError(f"forced test failure for basis j={j}")
+            return real_bspline(knots, c, degree)
+
+        import warnings as _w
+
+        with patch.object(bspline_mod, "BSpline", side_effect=flaky_bspline):
+            with _w.catch_warnings(record=True) as caught:
+                _w.simplefilter("always")
+                dB = bspline_derivative_design_matrix(
+                    x, knots, degree=deg, include_intercept=True
+                )
+
+        deriv_warnings = [
+            w for w in caught if "B-spline derivative construction failed" in str(w.message)
+        ]
+        assert len(deriv_warnings) == 1, (
+            f"Expected exactly one aggregate warning, got {len(deriv_warnings)}: "
+            f"{[str(w.message) for w in deriv_warnings]}"
+        )
+        msg = str(deriv_warnings[0].message)
+        # Message must name the failed basis indices so the user can debug.
+        assert "[1, 3]" in msg, f"Expected indices [1, 3] in warning; got: {msg}"
+        assert "2 of" in msg, f"Expected failure count '2 of ...' in warning; got: {msg}"
+        # Affected columns should be zero; others should be non-zero-ish.
+        # With include_intercept=True, column 0 is always zero (intercept
+        # derivative) and basis index j is at dB column j (the drop-first
+        # then prepend-zeros logic keeps the same per-j mapping for j>=1).
+        np.testing.assert_array_equal(dB[:, 1], np.zeros(len(x)))  # failed basis j=1
+        np.testing.assert_array_equal(dB[:, 3], np.zeros(len(x)))  # failed basis j=3
+
+    def test_clean_knots_emit_no_warning(self):
+        """Well-formed knot vector → no ValueError path taken → no
+        warning. Regression-guard the happy path."""
+        dose = np.linspace(1, 5, 50)
+        knots, deg = build_bspline_basis(dose, degree=3, num_knots=2)
+        x = np.linspace(1.5, 4.5, 30)
+        import warnings as _w
+
+        with _w.catch_warnings(record=True) as caught:
+            _w.simplefilter("always")
+            bspline_derivative_design_matrix(x, knots, deg, include_intercept=True)
+        deriv_warnings = [
+            w for w in caught if "B-spline derivative construction failed" in str(w.message)
+        ]
+        assert deriv_warnings == []
+
+
 class TestDoseGrid:
     """Test dose grid computation."""
 
diff --git a/tests/test_power.py b/tests/test_power.py
@@ -2827,3 +2827,69 @@ def test_survey_config_validation_psu_re_sd_nan(self):
     def test_survey_config_validation_fpc_nan(self):
         with pytest.raises(ValueError, match="fpc_per_stratum must be finite"):
             SurveyPowerConfig(fpc_per_stratum=np.inf)
+
+
+# ---------------------------------------------------------------------------
+# Finding #28 (axis J, silent-failures audit). `_build_survey_design`
+# previously cached the resolved design in ``self._cached_survey_design``
+# on first call and never invalidated; mutating ``config.survey_design``
+# after ``__init__`` silently returned the stale cache. The fix drops the
+# cache — construction is microseconds — so every call reflects live
+# state.
+# ---------------------------------------------------------------------------
+
+
+class TestSurveyPowerConfigDesignStaleness:
+    def test_mutating_survey_design_after_first_call_picks_up_new(self):
+        """Reassigning config.survey_design after initial _build_survey_design
+        must be reflected on the next call."""
+        from diff_diff.survey import SurveyDesign
+
+        cfg = SurveyPowerConfig()
+        first = cfg._build_survey_design()
+        # Sanity: default is the expected column-name convention.
+        assert first.weights == "weight"
+        assert first.strata == "stratum"
+
+        replacement = SurveyDesign(
+            weights="my_weight", strata="my_stratum", psu="my_psu", fpc="my_fpc"
+        )
+        cfg.survey_design = replacement
+
+        second = cfg._build_survey_design()
+        assert second is replacement, (
+            "After mutating config.survey_design, _build_survey_design must "
+            "return the new design, not the cached default."
+        )
+        assert second.weights == "my_weight"
+
+    def test_clearing_survey_design_falls_back_to_default(self):
+        """Reassigning config.survey_design back to None after a non-None
+        initialization must fall back to the default construction."""
+        from diff_diff.survey import SurveyDesign
+
+        initial = SurveyDesign(
+            weights="w0", strata="s0", psu="p0", fpc="f0"
+        )
+        cfg = SurveyPowerConfig(survey_design=initial)
+        first = cfg._build_survey_design()
+        assert first is initial
+
+        cfg.survey_design = None
+        second = cfg._build_survey_design()
+        assert second is not initial
+        assert second.weights == "weight"
+        assert second.strata == "stratum"
+
+    def test_repeat_calls_produce_equivalent_output(self):
+        """Regression guard: no-mutation case must still work — two
+        consecutive calls on an untouched config return consistent
+        SurveyDesign column names (identity equality is not guaranteed
+        since we dropped the cache; equivalence is what matters)."""
+        cfg = SurveyPowerConfig()
+        first = cfg._build_survey_design()
+        second = cfg._build_survey_design()
+        assert first.weights == second.weights
+        assert first.strata == second.strata
+        assert first.psu == second.psu
+        assert first.fpc == second.fpc