|
| 1 | +# Generate nprobust mse-dpi golden values for the Phase 1b parity suite. |
| 2 | +# |
| 3 | +# This script re-implements the mse-dpi algorithm from |
| 4 | +# nprobust::lpbwselect.mse.dpi (nprobust 0.5.0, SHA 36e4e53, source at |
| 5 | +# npfunctions.R:498-607 of github.com/nppackages/nprobust) so that every |
| 6 | +# intermediate quantity the Python port needs to parity-check is exposed. |
| 7 | +# Calling lpbwselect() directly would only surface h and b; we also need |
| 8 | +# c.bw, bw.mp2, bw.mp3, and the per-stage (V, B1, B2, R) diagnostics. |
| 9 | +# We therefore use nprobust::: internals via getFromNamespace(). |
| 10 | +# |
| 11 | +# Usage: |
| 12 | +# Rscript benchmarks/R/generate_nprobust_golden.R |
| 13 | +# |
| 14 | +# Requirements: |
| 15 | +# nprobust (CRAN), jsonlite |
| 16 | +# |
| 17 | +# Output: |
| 18 | +# benchmarks/data/nprobust_mse_dpi_golden.json |
| 19 | +# |
| 20 | +# Phase 1b of the HeterogeneousAdoptionDiD implementation (de Chaisemartin, |
| 21 | +# Ciccia, D'Haultfoeuille & Knau 2026, arXiv:2405.04465v6). Python tests at |
| 22 | +# tests/test_bandwidth_selector.py and tests/test_nprobust_port.py load |
| 23 | +# this JSON and check agreement to 1% relative tolerance. |
| 24 | + |
| 25 | +library(nprobust) |
| 26 | +library(jsonlite) |
| 27 | + |
| 28 | +stopifnot(packageVersion("nprobust") == "0.5.0") |
| 29 | + |
| 30 | +# Internal helper re-implementing lpbwselect.mse.dpi while returning every |
| 31 | +# stage output. Mirrors npfunctions.R:498-607 line-by-line (with the even |
| 32 | +# / interior branches that apply for p=1, deriv=0, boundary eval). |
| 33 | +lprobust_bw <- getFromNamespace("lprobust.bw", "nprobust") |
| 34 | + |
| 35 | +extract_mse_dpi_stages <- function(d, y, kernel = "epa", eval_point = 0.0, |
| 36 | + bwcheck = 21, bwregul = 1, vce = "nn") { |
| 37 | + p <- 1L; deriv <- 0L; q <- p + 1L |
| 38 | + # For HAD (p=1, deriv=0), (p-deriv) %% 2 == 1, so `even` is FALSE. |
| 39 | + # nprobust's conditional: if (even==FALSE | interior==TRUE) -> use |
| 40 | + # closed-form lprobust.bw$bw; else -> optimize. For HAD this means |
| 41 | + # every stage bandwidth comes from the closed form, not optimize. |
| 42 | + even <- (p - deriv) %% 2 == 0 |
| 43 | + interior <- FALSE |
| 44 | + |
| 45 | + N <- length(d) |
| 46 | + x_iq <- quantile(d, 0.75) - quantile(d, 0.25) |
| 47 | + x_min <- min(d); x_max <- max(d) |
| 48 | + range_ <- x_max - x_min |
| 49 | + |
| 50 | + C_c <- switch(kernel, "epa" = 2.34, "uni" = 1.843, |
| 51 | + "tri" = 2.576, "gau" = 1.06, |
| 52 | + stop("unknown kernel: ", kernel)) |
| 53 | + |
| 54 | + c.bw <- C_c * min(sd(d), x_iq / 1.349) * N^(-1/5) |
| 55 | + bw.max <- max(abs(eval_point - x_min), abs(eval_point - x_max)) |
| 56 | + c.bw <- min(c.bw, bw.max) |
| 57 | + |
| 58 | + # Nearest-neighbor precomputation when vce="nn" (npfunctions.R:518-529). |
| 59 | + dups <- dupsid <- NULL |
| 60 | + if (vce == "nn") { |
| 61 | + order_x <- order(d) |
| 62 | + d <- d[order_x]; y <- y[order_x] |
| 63 | + dups <- integer(N) |
| 64 | + for (j in 1:N) dups[j] <- sum(d == d[j]) |
| 65 | + dupsid <- integer(N); j <- 1L |
| 66 | + while (j <= N) { |
| 67 | + dupsid[j:(j + dups[j] - 1L)] <- 1:dups[j] |
| 68 | + j <- j + dups[j] |
| 69 | + } |
| 70 | + } |
| 71 | + |
| 72 | + bw.min <- NULL |
| 73 | + if (!is.null(bwcheck)) { |
| 74 | + bw.min <- sort(abs(d - eval_point))[bwcheck] |
| 75 | + c.bw <- max(c.bw, bw.min) |
| 76 | + } |
| 77 | + |
| 78 | + # Helper: dispatch between closed-form and optimize exactly as R does. |
| 79 | + select_bw <- function(C, exp_bias, exp_var, scale, range_) { |
| 80 | + if (!even || interior) { |
| 81 | + return(C$bw) |
| 82 | + } |
| 83 | + fn <- function(H) { |
| 84 | + abs(H^exp_bias * (C$B1 + H * C$B2 + scale * C$R)^2 + |
| 85 | + C$V / (N * H^exp_var)) |
| 86 | + } |
| 87 | + optimize(fn, interval = c(.Machine$double.eps, range_))$minimum |
| 88 | + } |
| 89 | + |
| 90 | + # Stage 2: C.d1 -> bw.mp2 (npfunctions.R:539-546) |
| 91 | + C_d1 <- lprobust_bw(y, d, NULL, eval_point, o = q + 1L, nu = q + 1L, |
| 92 | + o.B = q + 2L, h.V = c.bw, h.B1 = range_, |
| 93 | + h.B2 = range_, scale = 0, vce = vce, nnmatch = 3L, |
| 94 | + kernel = kernel, dups = dups, dupsid = dupsid) |
| 95 | + bw.mp2 <- select_bw(C_d1, |
| 96 | + exp_bias = 2 * (q + 1) + 2 - 2 * (q + 1), |
| 97 | + exp_var = 1 + 2 * (q + 1), |
| 98 | + scale = 0, range_ = range_) |
| 99 | + |
| 100 | + # Stage 2: C.d2 -> bw.mp3 (npfunctions.R:549-556) |
| 101 | + C_d2 <- lprobust_bw(y, d, NULL, eval_point, o = q + 2L, nu = q + 2L, |
| 102 | + o.B = q + 3L, h.V = c.bw, h.B1 = range_, |
| 103 | + h.B2 = range_, scale = 0, vce = vce, nnmatch = 3L, |
| 104 | + kernel = kernel, dups = dups, dupsid = dupsid) |
| 105 | + bw.mp3 <- select_bw(C_d2, |
| 106 | + exp_bias = 2 * (q + 2) + 2 - 2 * (q + 2), |
| 107 | + exp_var = 1 + 2 * (q + 2), |
| 108 | + scale = 0, range_ = range_) |
| 109 | + |
| 110 | + # Apply clipping (npfunctions.R:559-565) |
| 111 | + bw.mp2 <- min(bw.mp2, bw.max) |
| 112 | + bw.mp3 <- min(bw.mp3, bw.max) |
| 113 | + if (!is.null(bw.min)) { |
| 114 | + bw.mp2 <- max(bw.mp2, bw.min) |
| 115 | + bw.mp3 <- max(bw.mp3, bw.min) |
| 116 | + } |
| 117 | + |
| 118 | + # Stage 3: C.b -> b.mse.dpi (npfunctions.R:569-580) |
| 119 | + C_b <- lprobust_bw(y, d, NULL, eval_point, o = q, nu = p + 1L, |
| 120 | + o.B = q + 1L, h.V = c.bw, h.B1 = bw.mp2, |
| 121 | + h.B2 = bw.mp3, scale = bwregul, vce = vce, |
| 122 | + nnmatch = 3L, kernel = kernel, |
| 123 | + dups = dups, dupsid = dupsid) |
| 124 | + b.mse.dpi <- select_bw(C_b, |
| 125 | + exp_bias = 2 * q + 2 - 2 * (p + 1), |
| 126 | + exp_var = 1 + 2 * (p + 1), |
| 127 | + scale = bwregul, range_ = range_) |
| 128 | + b.mse.dpi <- min(b.mse.dpi, bw.max) |
| 129 | + if (!is.null(bw.min)) b.mse.dpi <- max(b.mse.dpi, bw.min) |
| 130 | + |
| 131 | + # Stage 3 final: C.h -> h.mse.dpi (npfunctions.R:585-595) |
| 132 | + C_h <- lprobust_bw(y, d, NULL, eval_point, o = p, nu = deriv, |
| 133 | + o.B = q, h.V = c.bw, h.B1 = b.mse.dpi, |
| 134 | + h.B2 = bw.mp2, scale = bwregul, vce = vce, |
| 135 | + nnmatch = 3L, kernel = kernel, |
| 136 | + dups = dups, dupsid = dupsid) |
| 137 | + h.mse.dpi <- select_bw(C_h, |
| 138 | + exp_bias = 2 * p + 2 - 2 * deriv, |
| 139 | + exp_var = 1 + 2 * deriv, |
| 140 | + scale = bwregul, range_ = range_) |
| 141 | + h.mse.dpi <- min(h.mse.dpi, bw.max) |
| 142 | + if (!is.null(bw.min)) h.mse.dpi <- max(h.mse.dpi, bw.min) |
| 143 | + |
| 144 | + stage_record <- function(C) { |
| 145 | + list(V = as.numeric(C$V), B1 = as.numeric(C$B1), |
| 146 | + B2 = as.numeric(C$B2), R = as.numeric(C$R), |
| 147 | + bw = as.numeric(C$bw)) |
| 148 | + } |
| 149 | + |
| 150 | + list( |
| 151 | + c_bw = as.numeric(c.bw), |
| 152 | + bw_mp2 = as.numeric(bw.mp2), |
| 153 | + bw_mp3 = as.numeric(bw.mp3), |
| 154 | + b_mse_dpi = as.numeric(b.mse.dpi), |
| 155 | + h_mse_dpi = as.numeric(h.mse.dpi), |
| 156 | + bw_min = if (is.null(bw.min)) NA_real_ else as.numeric(bw.min), |
| 157 | + bw_max = as.numeric(bw.max), |
| 158 | + stage_d1 = stage_record(C_d1), |
| 159 | + stage_d2 = stage_record(C_d2), |
| 160 | + stage_b = stage_record(C_b), |
| 161 | + stage_h = stage_record(C_h) |
| 162 | + ) |
| 163 | +} |
| 164 | + |
| 165 | +set.seed(20260419) |
| 166 | + |
| 167 | +# DGP 1: d ~ Uniform(0, 1), y = d + d^2 + N(0, 0.5) |
| 168 | +G <- 2000L |
| 169 | +d1 <- runif(G, 0, 1) |
| 170 | +y1 <- d1 + d1^2 + rnorm(G, 0, 0.5) |
| 171 | + |
| 172 | +# DGP 2: d ~ Beta(2, 2), y = d + d^2 + N(0, 0.5) (f(0) vanishes at boundary) |
| 173 | +d2 <- rbeta(G, 2, 2) |
| 174 | +y2 <- d2 + d2^2 + rnorm(G, 0, 0.5) |
| 175 | + |
| 176 | +# DGP 3: Half-normal d, y = 0.5 * d^2 + N(0, 1) |
| 177 | +d3 <- abs(rnorm(G, 0, 1)) |
| 178 | +y3 <- 0.5 * d3^2 + rnorm(G, 0, 1) |
| 179 | + |
| 180 | +golden <- list( |
| 181 | + metadata = list( |
| 182 | + nprobust_version = as.character(packageVersion("nprobust")), |
| 183 | + nprobust_sha = "36e4e532d2f7d23d4dc6e162575cca79e0927cda", |
| 184 | + seed = 20260419L, |
| 185 | + generator = "benchmarks/R/generate_nprobust_golden.R", |
| 186 | + algorithm = paste("Port of nprobust::lpbwselect.mse.dpi with all five", |
| 187 | + "stage bandwidths plus per-stage (V, B1, B2, R).", |
| 188 | + "Evaluation at boundary eval=0 for HAD use case", |
| 189 | + "(p=1, deriv=0, interior=FALSE).") |
| 190 | + ), |
| 191 | + dgp1 = c(list(n = G, d = d1, y = y1, kernel = "epa", |
| 192 | + description = "Uniform(0,1), polynomial m(d) = d + d^2"), |
| 193 | + extract_mse_dpi_stages(d1, y1, kernel = "epa")), |
| 194 | + dgp2 = c(list(n = G, d = d2, y = y2, kernel = "epa", |
| 195 | + description = "Beta(2,2) - boundary density vanishes at 0"), |
| 196 | + extract_mse_dpi_stages(d2, y2, kernel = "epa")), |
| 197 | + dgp3 = c(list(n = G, d = d3, y = y3, kernel = "epa", |
| 198 | + description = "Half-normal d, quadratic m(d) with unit noise"), |
| 199 | + extract_mse_dpi_stages(d3, y3, kernel = "epa")) |
| 200 | +) |
| 201 | + |
| 202 | +out_path <- "benchmarks/data/nprobust_mse_dpi_golden.json" |
| 203 | +dir.create("benchmarks/data", recursive = TRUE, showWarnings = FALSE) |
| 204 | +write_json(golden, out_path, auto_unbox = TRUE, pretty = TRUE, digits = 14) |
| 205 | +cat("Golden values written to", out_path, "\n") |
| 206 | +cat("DGP 1 h.mse.dpi:", golden$dgp1$h_mse_dpi, "\n") |
| 207 | +cat("DGP 2 h.mse.dpi:", golden$dgp2$h_mse_dpi, "\n") |
| 208 | +cat("DGP 3 h.mse.dpi:", golden$dgp3$h_mse_dpi, "\n") |
0 commit comments