Merge pull request #430 from igerber/fix-audit-412-r2

igerber · web-flow · commit 93ff96de30bc · 2026-05-14T08:55:17.000-04:00
Address #412 holistic re-audit residuals (R2)
diff --git a/TODO.md b/TODO.md
@@ -61,6 +61,7 @@ Deferred items from PR reviews that were not addressed before merge.
 | dCDH: Parity test SE/CI assertions only cover pure-direction scenarios; mixed-direction SE comparison is structurally apples-to-oranges (cell-count vs obs-count weighting). | `test_chaisemartin_dhaultfoeuille_parity.py` | #294 | Low |
 | dCDH by_path: negative-baseline path regression (e.g. `(-1, 0, 0, 0)`) is not yet exercised. The existing negative-D test (`test_negative_integer_D_supported`) only covers paths with negative values in non-baseline positions like `(0, -1, -1, -1)`, which does not trigger the R `substr(path, 1, 1)` bug regime (the bug needs a multi-character baseline). Add a switcher fixture with `D_{g,1} = -1` and assert the resulting path tuple key. | `tests/test_chaisemartin_dhaultfoeuille.py` | #419 | Low |
 | dCDH by_path: per-path placebo heterogeneity (`predict_het` rows for negative horizons) is currently NaN-filled in `to_dataframe(level="by_path")` `het_*` columns and unpopulated in `path_heterogeneity_effects`. R `did_multiplegt_dyn(..., by_path, predict_het)` forwards `predict_het` into each per-path `did_multiplegt_main` call alongside `placebo`, so R likely emits placebo het rows we do not yet mirror. Validate R's actual placebo predict_het output, then either implement parity or document the deviation explicitly. | `diff_diff/chaisemartin_dhaultfoeuille.py`, `diff_diff/chaisemartin_dhaultfoeuille_results.py`, `tests/test_chaisemartin_dhaultfoeuille_parity.py` | #422 | Medium |
+| dCDH heterogeneity: `_compute_heterogeneity_test` passes `df=None` to `safe_inference`, so Python uses the normal Z critical value (~1.96) for `t_stat`-derived `p_value` and `conf_int`. R `did_multiplegt_dyn(..., predict_het)` uses the t-distribution with df = n - k from the OLS regression, producing ~0.1-2% rtol gaps on CIs and p-values vs Python. Documented as a deviation in the heterogeneity R-parity Note; parity tests pin only `beta`, `se`, `t_stat`, and `n_obs`. Either thread the OLS df into `safe_inference` to match R, or formalize a separate inference-tolerance constant for the heterogeneity surface. | `diff_diff/chaisemartin_dhaultfoeuille.py`, `tests/test_chaisemartin_dhaultfoeuille_parity.py` | pilot-412 | Low |
 | CallawaySantAnna: consider materializing NaN entries for non-estimable (g,t) cells in group_time_effects dict (currently omitted with consolidated warning); would require updating downstream consumers (event study, balance_e, aggregation) | `staggered.py` | #256 | Low |
 | ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails) |
 | Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium |
diff --git a/diff_diff/chaisemartin_dhaultfoeuille_results.py b/diff_diff/chaisemartin_dhaultfoeuille_results.py
@@ -430,8 +430,9 @@ class ChaisemartinDHaultfoeuilleResults:
         ``paths_of_interest=[(...), ...]``) is set. Inner dict keyed by
         horizon directly (no ``"horizons"`` wrapper); each entry holds
         ``{"beta", "se", "t_stat", "p_value", "conf_int", "n_obs"}``,
-        where ``beta`` is the WLS coefficient on the heterogeneity
-        covariate on the path-restricted switcher subsample. Cohort
+        where ``beta`` is the heterogeneity coefficient estimated on the
+        path-restricted switcher subsample: plain OLS when no
+        ``survey_design`` is given, WLS on pweights otherwise. Cohort
         dummies in the design matrix absorb baseline by construction.
         Empty-state contract mirrors ``path_effects``: ``None`` when not
         requested; ``{}`` when requested but no path has eligible
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py
@@ -2867,6 +2867,44 @@ def test_heterogeneity_multi_horizon(self):
         assert 1 in r.heterogeneity_effects
         assert 2 in r.heterogeneity_effects
 
+    def test_heterogeneity_inference_matches_safe_inference(self):
+        """Local invariant: non-survey heterogeneity `t_stat` / `p_value` /
+        `conf_int` must equal ``safe_inference(beta, se, df=None)`` on every
+        populated horizon. R parity for these fields is intentionally
+        skipped (Python uses normal Z, R uses finite-df t — documented in
+        REGISTRY); without this local invariant a regression isolated to
+        the inference extraction or `_refresh_path_inference` ordering
+        could silently drop / mis-route the SE-derived fields while
+        beta / se still pass R parity.
+        """
+        from diff_diff.utils import safe_inference
+
+        df = self._make_panel_with_het()
+        r = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df, "outcome", "group", "period", "treatment",
+            L_max=2, heterogeneity="het_x",
+        )
+        assert r.heterogeneity_effects is not None
+        checked = 0
+        for l_h, het in r.heterogeneity_effects.items():
+            if not (np.isfinite(het["beta"]) and np.isfinite(het["se"])):
+                continue
+            expected_t, expected_p, expected_ci = safe_inference(
+                het["beta"], het["se"], df=None
+            )
+            assert het["t_stat"] == pytest.approx(expected_t, rel=1e-12), (
+                f"l={l_h} t_stat: stored={het['t_stat']} vs "
+                f"safe_inference={expected_t}"
+            )
+            assert het["p_value"] == pytest.approx(expected_p, rel=1e-12), (
+                f"l={l_h} p_value: stored={het['p_value']} vs "
+                f"safe_inference={expected_p}"
+            )
+            assert het["conf_int"][0] == pytest.approx(expected_ci[0], rel=1e-12)
+            assert het["conf_int"][1] == pytest.approx(expected_ci[1], rel=1e-12)
+            checked += 1
+        assert checked >= 1, "Expected at least one populated heterogeneity horizon"
+
     def test_heterogeneity_missing_column(self):
         df = self._make_panel_with_het()
         with pytest.raises(ValueError, match="not found"):
@@ -10079,6 +10117,56 @@ def test_per_path_heterogeneity_finite_under_known_signal(self):
                 f"(DGP: 5 + 3*het_x), got {horizons[1]['beta']}"
             )
 
+    def test_per_path_heterogeneity_inference_matches_safe_inference(self):
+        """Local invariant: non-survey per-path heterogeneity `t_stat` /
+        `p_value` / `conf_int` must equal ``safe_inference(beta, se,
+        df=None)`` on every populated (path, horizon) entry. R parity
+        for these fields is intentionally skipped (Python uses normal Z,
+        R uses finite-df t — documented in REGISTRY); without this local
+        invariant a regression isolated to the inference extraction or
+        `_refresh_path_inference` ordering could silently drop or
+        mis-route the SE-derived fields while beta / se still pass R
+        parity. Per-path counterpart of TestHeterogeneityTesting's
+        `test_heterogeneity_inference_matches_safe_inference`.
+        """
+        from diff_diff.utils import safe_inference
+
+        df = _by_path_het_data()
+        est = ChaisemartinDHaultfoeuille(drop_larger_lower=False, by_path=3)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            res = est.fit(
+                df, outcome="outcome", group="group", time="period",
+                treatment="treatment", L_max=3, heterogeneity="het_x",
+            )
+        assert res.path_heterogeneity_effects
+        checked = 0
+        for path, horizons in res.path_heterogeneity_effects.items():
+            for l_h, het in horizons.items():
+                if not (np.isfinite(het["beta"]) and np.isfinite(het["se"])):
+                    continue
+                expected_t, expected_p, expected_ci = safe_inference(
+                    het["beta"], het["se"], df=None
+                )
+                assert het["t_stat"] == pytest.approx(expected_t, rel=1e-12), (
+                    f"path={path} l={l_h} t_stat: stored={het['t_stat']} vs "
+                    f"safe_inference={expected_t}"
+                )
+                assert het["p_value"] == pytest.approx(expected_p, rel=1e-12), (
+                    f"path={path} l={l_h} p_value: stored={het['p_value']} vs "
+                    f"safe_inference={expected_p}"
+                )
+                assert het["conf_int"][0] == pytest.approx(
+                    expected_ci[0], rel=1e-12
+                )
+                assert het["conf_int"][1] == pytest.approx(
+                    expected_ci[1], rel=1e-12
+                )
+                checked += 1
+        assert checked >= 1, (
+            "Expected at least one populated (path, horizon) heterogeneity entry"
+        )
+
     def test_per_path_heterogeneity_telescope_to_global_on_single_path(self):
         """On a single-path panel, per-path == global heterogeneity.
         Plain OLS path: bit-exact via path_groups identity."""
diff --git a/tests/test_chaisemartin_dhaultfoeuille_parity.py b/tests/test_chaisemartin_dhaultfoeuille_parity.py
@@ -1382,6 +1382,23 @@ def test_parity_multi_path_reversible_predict_het(self, golden_values):
             assert py_h["se"] == pytest.approx(r_h["se"], rel=self.SE_RTOL), (
                 f"h={h} se: py={py_h['se']:.6f} vs r={r_h['se']:.6f}"
             )
+            # `t_stat = beta / se` is invariant to the Wald-test
+            # critical-value distribution; pin it at SE_RTOL so a
+            # regression in beta or se surfaces here too.
+            assert py_h["t_stat"] == pytest.approx(r_h["t"], rel=self.SE_RTOL), (
+                f"h={h} t_stat: py={py_h['t_stat']:.6f} vs r={r_h['t']:.6f}"
+            )
+            assert int(py_h["n_obs"]) == int(r_h["n_obs"]), (
+                f"h={h} n_obs: py={py_h['n_obs']} vs r={r_h['n_obs']}"
+            )
+            # NOTE: `p_value` and `conf_int` are NOT pinned to R here. Python's
+            # `safe_inference(..., df=None)` uses the normal Z critical value
+            # (~1.96 at alpha=0.05); R `did_multiplegt_dyn(..., predict_het)`
+            # uses the t-distribution with df = n - k from the OLS regression.
+            # That structural deviation produces ~0.1-2% rtol gaps on CI
+            # bounds and p-values - tracked separately rather than masked by
+            # a loose parity tolerance. See REGISTRY Phase 3 heterogeneity
+            # Note "Deviation from R (heterogeneity inference critical value)".
 
 
 class TestDCDHDynRParityByPathHeterogeneity:
@@ -1471,3 +1488,18 @@ def test_parity_multi_path_reversible_by_path_predict_het(
                     f"path={path_key} h={h} se: "
                     f"py={py_h['se']:.6f} vs r={r_h['se']:.6f}"
                 )
+                # `t_stat = beta / se` is invariant to the Wald-test
+                # critical-value distribution; pin it at SE_RTOL so a
+                # regression in beta or se surfaces here too. p_value
+                # and conf_int are not pinned - see the global parity
+                # class for the Z-vs-t deviation note.
+                assert py_h["t_stat"] == pytest.approx(
+                    r_h["t"], rel=self.SE_RTOL
+                ), (
+                    f"path={path_key} h={h} t_stat: "
+                    f"py={py_h['t_stat']:.6f} vs r={r_h['t']:.6f}"
+                )
+                assert int(py_h["n_obs"]) == int(r_h["n_obs"]), (
+                    f"path={path_key} h={h} n_obs: "
+                    f"py={py_h['n_obs']} vs r={r_h['n_obs']}"
+                )