@@ -2103,6 +2103,357 @@ def test_full_sdid_rust_vs_python(self):
21032103 )
21042104
21052105
2106+ # ---------------------------------------------------------------------------
2107+ # Silent-failure audit axis-G coverage (findings #21, #22, #23).
2108+ # The motivating incident (CS covariate scaling, early 2026) was a silent
2109+ # Rust-vs-Python divergence on rank-deficient / mixed-scale designs that
2110+ # sailed through the existing happy-path parity tests. These three classes
2111+ # extend backend-parity coverage to the edge cases the Phase-2 audit flagged.
2112+ # ---------------------------------------------------------------------------
2113+
2114+
2115+ @pytest .mark .skipif (not HAS_RUST_BACKEND , reason = "Rust backend not available" )
2116+ class TestSolveOLSSkipRankCheckParity :
2117+ """Finding #21: `solve_ols(..., skip_rank_check=True)` parity.
2118+
2119+ Both backends skip the pivoted-QR rank check and trust the caller's
2120+ full-rank assertion. Rust uses SVD; Python uses scipy.linalg.lstsq
2121+ with ``cond=1e-7``. Parity is expected in well-conditioned cases, but
2122+ near-singular and mixed-scale inputs could divergence on singular-
2123+ value truncation ordering between LAPACK backends.
2124+
2125+ Rust dispatch is gated by ``vcov_type='hc1'`` and ``weights is None``
2126+ (linalg.py:621-634), so tests scope to that intersection — the only
2127+ path where Rust runs under ``skip_rank_check=True``.
2128+
2129+ Assertions target fitted values (X @ beta), not beta directly: for
2130+ rank-deficient designs, kept-column beta values depend on pivot order
2131+ while fitted values are backend-invariant.
2132+ """
2133+
2134+ def _run_both_backends (self , X , y ):
2135+ """Call solve_ols with skip_rank_check=True under Rust and under
2136+ forced-Python, return (coef_rust, coef_py)."""
2137+ import sys
2138+ from unittest .mock import patch
2139+
2140+ from diff_diff .linalg import solve_ols
2141+
2142+ coef_rust , resid_rust , _ = solve_ols (
2143+ X , y , skip_rank_check = True , vcov_type = "hc1" , return_vcov = False
2144+ )
2145+
2146+ linalg_module = sys .modules ["diff_diff.linalg" ]
2147+ with patch .object (linalg_module , "HAS_RUST_BACKEND" , False ), \
2148+ patch .object (linalg_module , "_rust_solve_ols" , None ):
2149+ coef_py , resid_py , _ = solve_ols (
2150+ X , y , skip_rank_check = True , vcov_type = "hc1" , return_vcov = False
2151+ )
2152+ return coef_rust , coef_py
2153+
2154+ def test_mixed_scale_full_rank (self ):
2155+ """Full-rank X with column-norm ratio > 1e6. Both SVD backends
2156+ should truncate the same singular values."""
2157+ rng = np .random .default_rng (11 )
2158+ n , k = 80 , 3
2159+ X = np .column_stack ([
2160+ rng .normal (0 , 1 , n ), # unit scale
2161+ rng .normal (0 , 1e6 , n ), # 1e6 scale
2162+ rng .normal (0 , 1 , n ), # unit scale
2163+ ])
2164+ y = 1.0 + 0.5 * X [:, 0 ] + 1e-7 * X [:, 1 ] + 0.3 * X [:, 2 ] + rng .normal (0 , 0.1 , n )
2165+
2166+ coef_rust , coef_py = self ._run_both_backends (X , y )
2167+
2168+ fitted_rust = X @ coef_rust
2169+ fitted_py = X @ coef_py
2170+ np .testing .assert_allclose (
2171+ fitted_rust , fitted_py , rtol = 1e-6 , atol = 1e-8 ,
2172+ err_msg = "Rust vs Python fitted-value divergence on mixed-scale X" ,
2173+ )
2174+
2175+ def test_near_singular_full_rank (self ):
2176+ """Near-collinear full-rank X (cond(X'X) > 1e10). Backends use
2177+ the same SVD threshold (cond=1e-7), so should agree on truncation."""
2178+ rng = np .random .default_rng (22 )
2179+ n = 80
2180+ x1 = rng .normal (0 , 1 , n )
2181+ x2 = x1 + rng .normal (0 , 1e-6 , n ) # nearly parallel to x1
2182+ x3 = rng .normal (0 , 1 , n )
2183+ X = np .column_stack ([x1 , x2 , x3 ])
2184+ y = 1.0 + 0.5 * x1 - 0.3 * x3 + rng .normal (0 , 0.1 , n )
2185+
2186+ coef_rust , coef_py = self ._run_both_backends (X , y )
2187+
2188+ fitted_rust = X @ coef_rust
2189+ fitted_py = X @ coef_py
2190+ np .testing .assert_allclose (
2191+ fitted_rust , fitted_py , rtol = 1e-6 , atol = 1e-8 ,
2192+ err_msg = "Rust vs Python fitted-value divergence on near-singular X" ,
2193+ )
2194+
2195+ def test_rank_deficient_collinear (self ):
2196+ """Perfectly collinear columns. ``skip_rank_check=True`` bypasses
2197+ the QR detector; both backends must still produce matching fitted
2198+ values via minimum-norm SVD solve, even if individual coefficients
2199+ differ on dropped columns."""
2200+ rng = np .random .default_rng (33 )
2201+ n = 80
2202+ x1 = rng .normal (0 , 1 , n )
2203+ X = np .column_stack ([x1 , 2.0 * x1 , rng .normal (0 , 1 , n )]) # x2 ≡ 2*x1
2204+ y = 1.0 + 0.5 * x1 + 0.3 * X [:, 2 ] + rng .normal (0 , 0.1 , n )
2205+
2206+ coef_rust , coef_py = self ._run_both_backends (X , y )
2207+
2208+ fitted_rust = X @ coef_rust
2209+ fitted_py = X @ coef_py
2210+ # Fitted values are the backend-invariant object under rank deficiency.
2211+ np .testing .assert_allclose (
2212+ fitted_rust , fitted_py , rtol = 1e-6 , atol = 1e-8 ,
2213+ err_msg = "Rust vs Python fitted-value divergence on rank-deficient X" ,
2214+ )
2215+
2216+
2217+ @pytest .mark .skipif (not HAS_RUST_BACKEND , reason = "Rust backend not available" )
2218+ class TestSyntheticWeightsBackendParity :
2219+ """Finding #22: `compute_synthetic_weights` Rust vs Python parity.
2220+
2221+ Both backends solve the same constrained QP (FW + simplex projection)
2222+ with ``_OPTIMIZATION_TOL=1e-8``. PR #312 normalized Y inside the SDID
2223+ fit path; the shared utility at ``utils.py:1134-1199`` is unchanged
2224+ and is called from ``prep.py:990`` (DGP) only.
2225+
2226+ Tolerance: ``atol=1e-7, rtol=1e-6`` on weights — loose enough to
2227+ absorb the FW convergence tolerance (1e-8) plus BLAS platform drift
2228+ (~1e-12 to 1e-14 on float64), tight enough to catch real algorithmic
2229+ divergence.
2230+ """
2231+
2232+ def _run_both_backends (self , Y_control , Y_treated , lambda_reg = 0.0 ):
2233+ import sys
2234+ from unittest .mock import patch
2235+
2236+ from diff_diff .utils import compute_synthetic_weights
2237+
2238+ w_rust = compute_synthetic_weights (Y_control , Y_treated , lambda_reg = lambda_reg )
2239+
2240+ utils_module = sys .modules ["diff_diff.utils" ]
2241+ with patch .object (utils_module , "HAS_RUST_BACKEND" , False ):
2242+ w_py = compute_synthetic_weights (Y_control , Y_treated , lambda_reg = lambda_reg )
2243+ return w_rust , w_py
2244+
2245+ def test_near_singular_Y_prime_Y (self ):
2246+ """Highly collinear control units make Y'Y near-singular. Both
2247+ backends should converge to the same (near-uniform or zeroed-out)
2248+ weight vector."""
2249+ rng = np .random .default_rng (7 )
2250+ n_pre , n_control = 5 , 4
2251+ base = rng .normal (0 , 1 , n_pre )
2252+ # Three of four control units are nearly parallel to `base`
2253+ Y_control = np .column_stack ([
2254+ base ,
2255+ base + 1e-10 * rng .normal (0 , 1 , n_pre ),
2256+ base + 1e-10 * rng .normal (0 , 1 , n_pre ),
2257+ rng .normal (0 , 1 , n_pre ),
2258+ ])
2259+ Y_treated = base + 1e-8 * rng .normal (0 , 1 , n_pre )
2260+
2261+ w_rust , w_py = self ._run_both_backends (Y_control , Y_treated )
2262+
2263+ np .testing .assert_allclose (
2264+ w_rust , w_py , atol = 1e-7 , rtol = 1e-6 ,
2265+ err_msg = "Near-singular Y'Y: weight divergence between Rust and Python" ,
2266+ )
2267+
2268+ @pytest .mark .xfail (
2269+ strict = True ,
2270+ reason = "Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is "
2271+ "projected gradient descent. PGD and FW can converge to different "
2272+ "vertices under near-degenerate / extreme-scale inputs even though "
2273+ "both solve the same constrained QP. Algorithmic unification is a "
2274+ "larger fix (P1 follow-up in TODO.md); this xfail baselines the gap "
2275+ "so we notice if/when the algorithms align." ,
2276+ )
2277+ def test_extreme_Y_scale (self ):
2278+ """Y at 1e9 scale, well-conditioned panel. Known Rust-vs-Python
2279+ divergence: Rust/FW concentrates on one unit, Python/PGD returns
2280+ near-uniform. See xfail reason."""
2281+ rng = np .random .default_rng (44 )
2282+ n_pre , n_control = 8 , 5
2283+ Y_control = rng .normal (0 , 1 , (n_pre , n_control )) + 1e9
2284+ Y_treated = rng .normal (0 , 1 , n_pre ) + 1e9
2285+
2286+ w_rust , w_py = self ._run_both_backends (Y_control , Y_treated )
2287+
2288+ np .testing .assert_allclose (
2289+ w_rust , w_py , atol = 1e-7 , rtol = 1e-6 ,
2290+ err_msg = "Extreme Y scale: weight divergence "
2291+ "(known FW-vs-PGD algorithmic mismatch; see xfail reason)" ,
2292+ )
2293+
2294+ @pytest .mark .xfail (
2295+ strict = True ,
2296+ reason = "Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is "
2297+ "projected gradient descent. Same algorithmic mismatch as "
2298+ "test_extreme_Y_scale — backends reach different simplex vertices "
2299+ "on moderate-scale random data. P1 follow-up in TODO.md." ,
2300+ )
2301+ def test_regularization_lambda_variations (self ):
2302+ """Parity across lambda_reg=0 (unregularized) and lambda_reg>0
2303+ (shrink toward uniform). Known divergence; see xfail reason."""
2304+ rng = np .random .default_rng (55 )
2305+ n_pre , n_control = 8 , 5
2306+ Y_control = rng .normal (0 , 1 , (n_pre , n_control ))
2307+ Y_treated = rng .normal (0 , 1 , n_pre )
2308+
2309+ for lr in (0.0 , 0.01 , 1.0 ):
2310+ w_rust , w_py = self ._run_both_backends (Y_control , Y_treated , lambda_reg = lr )
2311+ np .testing .assert_allclose (
2312+ w_rust , w_py , atol = 1e-7 , rtol = 1e-6 ,
2313+ err_msg = f"lambda_reg={ lr } : weight divergence between Rust and Python" ,
2314+ )
2315+
2316+
2317+ @pytest .mark .slow
2318+ @pytest .mark .skipif (not HAS_RUST_BACKEND , reason = "Rust backend not available" )
2319+ class TestTROPRustEdgeCaseParity :
2320+ """Finding #23: TROP Rust grid-search + bootstrap parity on edge cases.
2321+
2322+ Existing parity tests (this file, ~line 1687 / 1757) compare ATT and
2323+ effect dictionaries on well-conditioned random data at ``atol=1e-6``.
2324+ Gap: rank-deficient Y on the grid-search path, seed reproducibility
2325+ on the bootstrap path.
2326+
2327+ Sizing kept minimal (n_units=6, n_periods=5–6) per the
2328+ `feedback_trop_heavy_tests` memory.
2329+ """
2330+
2331+ @staticmethod
2332+ def _make_correlated_panel (n_units = 6 , n_periods = 5 , n_treated = 2 ):
2333+ """Panel with two control units nearly parallel to each other,
2334+ making the pre-period Y matrix near rank-deficient."""
2335+ import pandas as pd
2336+ rng = np .random .default_rng (13 )
2337+ data = []
2338+ shared_path = rng .normal (0 , 1 , n_periods )
2339+ for i in range (n_units ):
2340+ is_treated = i < n_treated
2341+ if i in (n_treated , n_treated + 1 ):
2342+ # Two control units share a near-identical trajectory
2343+ base = shared_path + 1e-10 * rng .normal (0 , 1 , n_periods )
2344+ else :
2345+ base = rng .normal (0 , 1 , n_periods )
2346+ for t in range (n_periods ):
2347+ y = 5.0 + i * 0.3 + base [t ]
2348+ treated = 1 if (is_treated and t >= n_periods - 2 ) else 0
2349+ if treated :
2350+ y += 1.5
2351+ data .append ({"unit" : i , "time" : t , "outcome" : y , "treated" : treated })
2352+ return pd .DataFrame (data )
2353+
2354+ @pytest .mark .xfail (
2355+ strict = True ,
2356+ reason = "TROP Rust grid-search at trop_global.py:688 and Python fallback "
2357+ "at trop_global.py:753 diverge on rank-deficient Y: empirical ATT "
2358+ "gap ~6% on two near-parallel control units. Either (a) grid-winner "
2359+ "ties break differently between backends, or (b) the per-λ solver "
2360+ "itself reaches different stationary points under rank deficiency. "
2361+ "Finding #23 flagged this exact surface as the Phase-2 gap. Rust "
2362+ "vs Python unification is a P1 follow-up (TODO.md). This xfail "
2363+ "baselines the divergence so we notice when/if the backends align." ,
2364+ )
2365+ def test_grid_search_rank_deficient_Y (self ):
2366+ """Grid-search ATT parity on rank-deficient Y.
2367+
2368+ Known to fail: two near-parallel control units produce ~6% ATT
2369+ divergence between Rust and Python LOOCV grid-search. See xfail
2370+ reason for follow-up plan.
2371+ """
2372+ import sys
2373+ from unittest .mock import patch
2374+
2375+ from diff_diff import TROP
2376+
2377+ df = self ._make_correlated_panel ()
2378+ # n_bootstrap>=2 is required by TROP.__init__; we set the minimum.
2379+ # Bootstrap SE is NOT asserted here (see separate test below +
2380+ # xfail baseline for the RNG-algorithm mismatch between backends).
2381+ trop_params = dict (
2382+ method = "global" ,
2383+ lambda_time_grid = [0.1 , 1.0 , 10.0 ],
2384+ lambda_unit_grid = [0.1 , 1.0 , 10.0 ],
2385+ lambda_nn_grid = [np .inf ],
2386+ n_bootstrap = 2 ,
2387+ seed = 42 ,
2388+ )
2389+
2390+ trop_rust = TROP (** trop_params )
2391+ res_rust = trop_rust .fit (df .copy (), "outcome" , "treated" , "unit" , "time" )
2392+
2393+ trop_global_module = sys .modules ["diff_diff.trop_global" ]
2394+ with patch .object (trop_global_module , "HAS_RUST_BACKEND" , False ), \
2395+ patch .object (trop_global_module , "_rust_loocv_grid_search_global" , None ), \
2396+ patch .object (trop_global_module , "_rust_bootstrap_trop_variance_global" , None ):
2397+ trop_py = TROP (** trop_params )
2398+ res_py = trop_py .fit (df .copy (), "outcome" , "treated" , "unit" , "time" )
2399+
2400+ # Primary assertion: the ATT point estimate at the chosen λ matches.
2401+ # This catches both (a) same λ chosen and (b) tied λ producing same fit.
2402+ np .testing .assert_allclose (
2403+ res_rust .att , res_py .att , atol = 1e-6 ,
2404+ err_msg = "Grid-search ATT divergence on rank-deficient Y: "
2405+ f"Rust={ res_rust .att :.8f} , Python={ res_py .att :.8f} " ,
2406+ )
2407+
2408+ @pytest .mark .xfail (
2409+ strict = True ,
2410+ reason = "Rust bootstrap uses its own RNG (rand crate) while Python uses "
2411+ "numpy's default_rng; same `seed=42` produces different bootstrap "
2412+ "replicate indices. Empirical SE divergence ~28%. Unifying the RNG "
2413+ "across backends is a P1 follow-up (TODO.md). This xfail baselines "
2414+ "the gap so we notice if/when the backends share an RNG seed-to-"
2415+ "bytestream mapping. Grid-search ATT parity is validated in the "
2416+ "previous test (rank-deficient Y path)." ,
2417+ )
2418+ def test_bootstrap_seed_reproducibility (self ):
2419+ """Bootstrap SE parity under a fixed seed.
2420+
2421+ Known to fail: Rust and Python use different RNG backends, so
2422+ identical ``seed=42`` produces different bootstrap replicates.
2423+ See xfail reason for follow-up plan.
2424+ """
2425+ import sys
2426+ from unittest .mock import patch
2427+
2428+ from diff_diff import TROP
2429+
2430+ df = self ._make_correlated_panel (n_units = 6 , n_periods = 6 , n_treated = 2 )
2431+ trop_params = dict (
2432+ method = "global" ,
2433+ lambda_time_grid = [1.0 ],
2434+ lambda_unit_grid = [1.0 ],
2435+ lambda_nn_grid = [np .inf ],
2436+ n_bootstrap = 10 ,
2437+ seed = 42 ,
2438+ )
2439+
2440+ trop_rust = TROP (** trop_params )
2441+ res_rust = trop_rust .fit (df .copy (), "outcome" , "treated" , "unit" , "time" )
2442+
2443+ trop_global_module = sys .modules ["diff_diff.trop_global" ]
2444+ with patch .object (trop_global_module , "HAS_RUST_BACKEND" , False ), \
2445+ patch .object (trop_global_module , "_rust_loocv_grid_search_global" , None ), \
2446+ patch .object (trop_global_module , "_rust_bootstrap_trop_variance_global" , None ):
2447+ trop_py = TROP (** trop_params )
2448+ res_py = trop_py .fit (df .copy (), "outcome" , "treated" , "unit" , "time" )
2449+
2450+ np .testing .assert_allclose (
2451+ res_rust .se , res_py .se , atol = 1e-8 ,
2452+ err_msg = f"Bootstrap SE divergence under seed=42: "
2453+ f"Rust={ res_rust .se :.10f} , Python={ res_py .se :.10f} " ,
2454+ )
2455+
2456+
21062457class TestFallbackWhenNoRust :
21072458 """Test that pure Python fallback works when Rust is unavailable."""
21082459
0 commit comments