|
| 1 | +# Generate cross-language weighted-OLS parity fixture for HAD Phase 4.5. |
| 2 | +# |
| 3 | +# Purpose: no public weighted-CCF reference exists for bias-corrected |
| 4 | +# local-linear (nprobust::lprobust has no weight argument; np::npreg uses |
| 5 | +# its own internal local-linear algorithm that does not reduce to a |
| 6 | +# straightforward weighted OLS at the intercept). To validate the weighted |
| 7 | +# kernel-composition machinery in diff_diff._nprobust_port cross-language, |
| 8 | +# we record the intercept from an R implementation of the SAME formula |
| 9 | +# (weighted-OLS with one-sided Epanechnikov kernel) that the Python port |
| 10 | +# implements. Bit-parity at atol=1e-14 locks in numerical consistency |
| 11 | +# across BLAS reductions. |
| 12 | +# |
| 13 | +# This is NOT third-party validation of the weighted-CCF methodology. It is |
| 14 | +# a regression lock against R↔Python drift on the weighted-OLS formula |
| 15 | +# itself. Methodology confidence under informative weights comes from: |
| 16 | +# 1. Analytic derivation in docs/methodology/REGISTRY.md |
| 17 | +# 2. Uniform-weights bit-parity: weights=np.ones ≡ unweighted at 1e-14 |
| 18 | +# 3. Monte Carlo oracle consistency (tests/test_had_mc.py) |
| 19 | +# |
| 20 | +# Usage: |
| 21 | +# Rscript benchmarks/R/generate_np_npreg_weighted_golden.R |
| 22 | +# |
| 23 | +# Output: |
| 24 | +# benchmarks/data/np_npreg_weighted_golden.json |
| 25 | +# |
| 26 | +# Phase 4.5 of HeterogeneousAdoptionDiD (de Chaisemartin et al. 2026). |
| 27 | +# Python test loader: tests/test_np_npreg_weighted_parity.py. |
| 28 | + |
| 29 | +library(jsonlite) |
| 30 | + |
| 31 | +# ------------------------------------------------------------------------- |
| 32 | +# Weighted local-linear at a boundary: manual weighted OLS with Epa kernel. |
| 33 | +# Matches diff_diff/local_linear.py::local_linear_fit exactly. |
| 34 | +# ------------------------------------------------------------------------- |
| 35 | + |
| 36 | +weighted_local_linear <- function(d, y, weights, eval_point = 0.0, h = 0.3) { |
| 37 | + # One-sided epanechnikov on [0, 1]: k(u) = (3/4)(1 - u^2), zero elsewhere. |
| 38 | + u <- (d - eval_point) / h |
| 39 | + kw <- ifelse(u >= 0 & u <= 1, 0.75 * (1 - u^2), 0) |
| 40 | + # Combined weights: user weights * kernel weights. |
| 41 | + combined <- kw * weights |
| 42 | + # Active window (non-zero combined weight). |
| 43 | + active <- combined > 0 |
| 44 | + if (sum(active) < 2) { |
| 45 | + stop("Active window has fewer than 2 observations.") |
| 46 | + } |
| 47 | + # Weighted OLS of y ~ 1 + (d - eval_point), intercept is mu_hat at |
| 48 | + # eval_point. |
| 49 | + fit <- lm(y[active] ~ I(d[active] - eval_point), weights = combined[active]) |
| 50 | + mu_hat <- as.numeric(coef(fit)[1]) |
| 51 | + slope_hat <- as.numeric(coef(fit)[2]) |
| 52 | + list( |
| 53 | + mu_hat = mu_hat, |
| 54 | + slope = slope_hat, |
| 55 | + n_active = as.integer(sum(active)), |
| 56 | + h = h, |
| 57 | + eval_point = eval_point |
| 58 | + ) |
| 59 | +} |
| 60 | + |
| 61 | +# ------------------------------------------------------------------------- |
| 62 | +# DGPs: deterministic seeds for reproducibility. |
| 63 | +# ------------------------------------------------------------------------- |
| 64 | + |
| 65 | +set.seed(20260424) |
| 66 | + |
| 67 | +dgp1 <- local({ |
| 68 | + G <- 500 |
| 69 | + d <- runif(G, 0, 1) |
| 70 | + y <- 2 * d + 0.3 * d^2 + rnorm(G, sd = 0.25) |
| 71 | + w <- rep(1.0, G) |
| 72 | + list(d = d, y = y, w = w, eval_point = 0.0, h = 0.30, |
| 73 | + description = "Uniform weights, G=500, boundary=0") |
| 74 | +}) |
| 75 | + |
| 76 | +dgp2 <- local({ |
| 77 | + G <- 400 |
| 78 | + d <- runif(G, 0, 1) |
| 79 | + y <- 2 * d + 0.3 * d^2 + rnorm(G, sd = 0.25) |
| 80 | + w <- exp(-d * 2.0) |
| 81 | + list(d = d, y = y, w = w, eval_point = 0.0, h = 0.25, |
| 82 | + description = "Informative weights (exp decay from boundary), G=400") |
| 83 | +}) |
| 84 | + |
| 85 | +dgp3 <- local({ |
| 86 | + G <- 200 |
| 87 | + d <- runif(G, 0, 1) |
| 88 | + y <- 3 * d - d^2 + 0.5 * d^3 + rnorm(G, sd = 0.30) |
| 89 | + w <- pmax(0.1, runif(G, 0.5, 1.5)) |
| 90 | + list(d = d, y = y, w = w, eval_point = 0.0, h = 0.35, |
| 91 | + description = "Small G=200, nonlinear m(d), bounded heterogeneous weights") |
| 92 | +}) |
| 93 | + |
| 94 | +dgp4 <- local({ |
| 95 | + G <- 400 |
| 96 | + d_lower <- 0.1 |
| 97 | + d <- runif(G, d_lower, 1) |
| 98 | + y <- 2 * (d - d_lower) + 0.3 * (d - d_lower)^2 + rnorm(G, sd = 0.25) |
| 99 | + w <- rep(1.0, G) |
| 100 | + list(d = d - d_lower, y = y, w = w, eval_point = 0.0, h = 0.30, |
| 101 | + description = "G=400, d_lower=0.1 shifted boundary=0 (Design 1 near-d_lower)", |
| 102 | + d_lower = d_lower) |
| 103 | +}) |
| 104 | + |
| 105 | +# ------------------------------------------------------------------------- |
| 106 | +# Run each DGP through the weighted local-linear reference. |
| 107 | +# ------------------------------------------------------------------------- |
| 108 | + |
| 109 | +run_one <- function(name, dgp) { |
| 110 | + cat(sprintf("Running %s: %s\n", name, dgp$description)) |
| 111 | + res <- weighted_local_linear( |
| 112 | + d = dgp$d, y = dgp$y, weights = dgp$w, |
| 113 | + eval_point = dgp$eval_point, h = dgp$h |
| 114 | + ) |
| 115 | + list( |
| 116 | + description = dgp$description, |
| 117 | + n = length(dgp$d), |
| 118 | + d = as.numeric(dgp$d), |
| 119 | + y = as.numeric(dgp$y), |
| 120 | + weights = as.numeric(dgp$w), |
| 121 | + eval_point = as.numeric(res$eval_point), |
| 122 | + h = as.numeric(res$h), |
| 123 | + kernel = "epanechnikov", |
| 124 | + n_active = res$n_active, |
| 125 | + mu_hat = as.numeric(res$mu_hat), |
| 126 | + slope = as.numeric(res$slope) |
| 127 | + ) |
| 128 | +} |
| 129 | + |
| 130 | +out <- list( |
| 131 | + metadata = list( |
| 132 | + r_version = paste(R.Version()$major, R.Version()$minor, sep = "."), |
| 133 | + seed = 20260424L, |
| 134 | + generator = "generate_np_npreg_weighted_golden.R", |
| 135 | + algorithm = "manual weighted OLS with one-sided Epanechnikov kernel", |
| 136 | + purpose = "HAD Phase 4.5 cross-language weighted-LL parity", |
| 137 | + note = paste( |
| 138 | + "Regression lock on the weighted kernel + weighted OLS formula", |
| 139 | + "implemented in diff_diff.local_linear.local_linear_fit. Not a", |
| 140 | + "third-party validation of weighted-CCF methodology; see REGISTRY", |
| 141 | + "'Weighted extension (Phase 4.5)' for the parity-gap acknowledgement." |
| 142 | + ) |
| 143 | + ), |
| 144 | + dgp1 = run_one("dgp1", dgp1), |
| 145 | + dgp2 = run_one("dgp2", dgp2), |
| 146 | + dgp3 = run_one("dgp3", dgp3), |
| 147 | + dgp4 = run_one("dgp4", dgp4) |
| 148 | +) |
| 149 | + |
| 150 | +out_path <- "benchmarks/data/np_npreg_weighted_golden.json" |
| 151 | +dir.create(dirname(out_path), recursive = TRUE, showWarnings = FALSE) |
| 152 | + |
| 153 | +write_json(out, out_path, auto_unbox = TRUE, pretty = TRUE, digits = 14) |
| 154 | +cat(sprintf("Wrote %s\n", out_path)) |
0 commit comments