|
| 1 | +# Generate nprobust lprobust golden values for the Phase 1c parity suite. |
| 2 | +# |
| 3 | +# This script calls nprobust::lprobust() at a single eval point with |
| 4 | +# bwselect="mse-dpi" on five deterministic DGPs and records: |
| 5 | +# - tau.cl, tau.bc (point estimates) |
| 6 | +# - se.cl, se.rb (standard errors) |
| 7 | +# - h, b (bandwidths chosen by the mse-dpi selector) |
| 8 | +# - N (observations in the selected kernel window) |
| 9 | +# - z = qnorm(1 - alpha/2) |
| 10 | +# - ci_low, ci_high = tau.bc +/- z * se.rb |
| 11 | +# |
| 12 | +# DGPs 1-3 reuse the same seed + shape as benchmarks/R/generate_nprobust_golden.R |
| 13 | +# so the selected (h, b) are identical; Phase 1c parity is therefore isolated |
| 14 | +# to the point-estimate + variance computation. DGP 4 adds cluster IDs for |
| 15 | +# cluster-robust SE parity; DGP 5 shifts the support to test a non-zero |
| 16 | +# boundary (Design 1 continuous-near-d_lower). |
| 17 | +# |
| 18 | +# Usage: |
| 19 | +# Rscript benchmarks/R/generate_nprobust_lprobust_golden.R |
| 20 | +# |
| 21 | +# Requirements: |
| 22 | +# nprobust (CRAN), jsonlite |
| 23 | +# |
| 24 | +# Output: |
| 25 | +# benchmarks/data/nprobust_lprobust_golden.json |
| 26 | +# |
| 27 | +# Phase 1c of the HeterogeneousAdoptionDiD implementation (de Chaisemartin, |
| 28 | +# Ciccia, D'Haultfoeuille & Knau 2026, arXiv:2405.04465v6). Python tests at |
| 29 | +# tests/test_bias_corrected_lprobust.py and tests/test_nprobust_port.py load |
| 30 | +# this JSON and check agreement to tiered tolerances (1e-14 on tau_cl/se_cl, |
| 31 | +# 1e-12 on tau_bc/se_rb, 1e-13 on CI bounds; see Phase 1c plan). |
| 32 | + |
| 33 | +library(nprobust) |
| 34 | +library(jsonlite) |
| 35 | + |
| 36 | +stopifnot(packageVersion("nprobust") == "0.5.0") |
| 37 | + |
| 38 | +extract_lprobust_single_eval <- function(d, y, eval_point = 0.0, |
| 39 | + kernel = "epa", vce = "nn", |
| 40 | + cluster = NULL, alpha = 0.05, |
| 41 | + h = NULL, b = NULL) { |
| 42 | + # If h (and optionally b) are passed, bypass the mse-dpi selector and |
| 43 | + # call lprobust() with those bandwidths directly. This is used for |
| 44 | + # clustered DGPs where nprobust's internal lpbwselect.mse.dpi hits a |
| 45 | + # singleton-cluster shape bug in lprobust.vce during the order-q+1/q+2 |
| 46 | + # pilot fits. For unclustered DGPs, h=NULL triggers bwselect="mse-dpi". |
| 47 | + if (is.null(h)) { |
| 48 | + fit <- lprobust(y = y, x = d, eval = eval_point, |
| 49 | + p = 1L, deriv = 0L, kernel = kernel, |
| 50 | + bwselect = "mse-dpi", vce = vce, cluster = cluster, |
| 51 | + bwcheck = 21L, bwregul = 1, nnmatch = 3L) |
| 52 | + } else { |
| 53 | + # When b is unspecified, nprobust defaults to b = h / rho with rho=1. |
| 54 | + fit <- lprobust(y = y, x = d, eval = eval_point, |
| 55 | + p = 1L, deriv = 0L, kernel = kernel, |
| 56 | + h = h, b = if (is.null(b)) h else b, |
| 57 | + vce = vce, cluster = cluster, |
| 58 | + bwcheck = 21L, nnmatch = 3L) |
| 59 | + } |
| 60 | + est <- fit$Estimate[1, ] |
| 61 | + z <- qnorm(1 - alpha / 2) |
| 62 | + ci_low <- as.numeric(est["tau.bc"] - z * est["se.rb"]) |
| 63 | + ci_high <- as.numeric(est["tau.bc"] + z * est["se.rb"]) |
| 64 | + |
| 65 | + list( |
| 66 | + eval_point = as.numeric(eval_point), |
| 67 | + h = as.numeric(est["h"]), |
| 68 | + b = as.numeric(est["b"]), |
| 69 | + n_used = as.integer(est["N"]), |
| 70 | + tau_cl = as.numeric(est["tau.us"]), |
| 71 | + tau_bc = as.numeric(est["tau.bc"]), |
| 72 | + se_cl = as.numeric(est["se.us"]), |
| 73 | + se_rb = as.numeric(est["se.rb"]), |
| 74 | + ci_low = ci_low, |
| 75 | + ci_high = ci_high, |
| 76 | + alpha = as.numeric(alpha), |
| 77 | + z = as.numeric(z) |
| 78 | + ) |
| 79 | +} |
| 80 | + |
| 81 | +set.seed(20260419) |
| 82 | + |
| 83 | +# DGP 1: d ~ Uniform(0, 1), y = d + d^2 + N(0, 0.5) |
| 84 | +G <- 2000L |
| 85 | +d1 <- runif(G, 0, 1) |
| 86 | +y1 <- d1 + d1^2 + rnorm(G, 0, 0.5) |
| 87 | + |
| 88 | +# DGP 2: d ~ Beta(2, 2), y = d + d^2 + N(0, 0.5) (f(0) vanishes at boundary) |
| 89 | +d2 <- rbeta(G, 2, 2) |
| 90 | +y2 <- d2 + d2^2 + rnorm(G, 0, 0.5) |
| 91 | + |
| 92 | +# DGP 3: Half-normal d, y = 0.5 * d^2 + N(0, 1) |
| 93 | +d3 <- abs(rnorm(G, 0, 1)) |
| 94 | +y3 <- 0.5 * d3^2 + rnorm(G, 0, 1) |
| 95 | + |
| 96 | +# DGP 4: Uniform(0, 1) with 50 clusters of 40 obs (cluster-robust SE parity). |
| 97 | +# Fewer, larger clusters avoid an nprobust-internal singleton-cluster shape |
| 98 | +# bug in lprobust.vce that fires if a kernel window retains only one obs per |
| 99 | +# cluster. 50 clusters x 40 obs => the mse-dpi pilot windows near the |
| 100 | +# boundary keep enough obs per cluster to stay well-conditioned. |
| 101 | +set.seed(20260420) |
| 102 | +G4 <- 2000L |
| 103 | +d4 <- runif(G4, 0, 1) |
| 104 | +cluster4 <- rep(1:50, each = 40)[1:G4] |
| 105 | +# Introduce within-cluster correlation in y via a cluster effect. |
| 106 | +cluster_effect <- rnorm(50, 0, 0.3)[cluster4] |
| 107 | +y4 <- d4 + d4^2 + cluster_effect + rnorm(G4, 0, 0.3) |
| 108 | + |
| 109 | +# DGP 5: Uniform(0.2, 1.0) — Design 1 continuous-near-d_lower at |
| 110 | +# boundary = d.min() > 0. Different seed to avoid aliasing DGP 1. |
| 111 | +set.seed(20260421) |
| 112 | +G5 <- 2000L |
| 113 | +d5 <- runif(G5, 0.2, 1.0) |
| 114 | +y5 <- (d5 - 0.2) + (d5 - 0.2)^2 + rnorm(G5, 0, 0.5) |
| 115 | +eval5 <- min(d5) # Design 1 continuous: evaluate at the realized minimum. |
| 116 | + |
| 117 | +golden <- list( |
| 118 | + metadata = list( |
| 119 | + nprobust_version = as.character(packageVersion("nprobust")), |
| 120 | + nprobust_sha = "36e4e532d2f7d23d4dc6e162575cca79e0927cda", |
| 121 | + seeds = list(dgp1 = 20260419L, dgp2 = 20260419L, dgp3 = 20260419L, |
| 122 | + dgp4 = 20260420L, dgp5 = 20260421L), |
| 123 | + generator = "benchmarks/R/generate_nprobust_lprobust_golden.R", |
| 124 | + algorithm = paste("nprobust::lprobust(..., bwselect='mse-dpi') at a single", |
| 125 | + "eval point, p=1, deriv=0, kernel='epa', vce='nn'", |
| 126 | + "unless noted. z = qnorm(1 - alpha/2) exported so the", |
| 127 | + "Python side consumes R's critical value directly.") |
| 128 | + ), |
| 129 | + dgp1 = c(list(n = G, d = d1, y = y1, kernel = "epa", vce = "nn", |
| 130 | + description = "Uniform(0,1), polynomial m(d) = d + d^2"), |
| 131 | + extract_lprobust_single_eval(d1, y1, kernel = "epa", vce = "nn")), |
| 132 | + dgp2 = c(list(n = G, d = d2, y = y2, kernel = "epa", vce = "nn", |
| 133 | + description = "Beta(2,2) - boundary density vanishes at 0"), |
| 134 | + extract_lprobust_single_eval(d2, y2, kernel = "epa", vce = "nn")), |
| 135 | + dgp3 = c(list(n = G, d = d3, y = y3, kernel = "epa", vce = "nn", |
| 136 | + description = "Half-normal d, quadratic m(d) with unit noise"), |
| 137 | + extract_lprobust_single_eval(d3, y3, kernel = "epa", vce = "nn")), |
| 138 | + dgp4 = c(list(n = G4, d = d4, y = y4, cluster = cluster4, |
| 139 | + kernel = "epa", vce = "nn", |
| 140 | + h_manual = 0.3, b_manual = 0.3, |
| 141 | + description = paste("Clustered (50 groups of 40) Uniform(0,1);", |
| 142 | + "manual h=b=0.3 to sidestep nprobust's", |
| 143 | + "singleton-cluster bug in the mse-dpi", |
| 144 | + "pilot fits.")), |
| 145 | + extract_lprobust_single_eval(d4, y4, kernel = "epa", vce = "nn", |
| 146 | + cluster = cluster4, |
| 147 | + h = 0.3, b = 0.3)), |
| 148 | + dgp5 = c(list(n = G5, d = d5, y = y5, eval_point_override = eval5, |
| 149 | + kernel = "epa", vce = "nn", |
| 150 | + description = "Uniform(0.2, 1.0), Design 1 boundary = d.min()"), |
| 151 | + extract_lprobust_single_eval(d5, y5, eval_point = eval5, |
| 152 | + kernel = "epa", vce = "nn")) |
| 153 | +) |
| 154 | + |
| 155 | +out_path <- "benchmarks/data/nprobust_lprobust_golden.json" |
| 156 | +dir.create("benchmarks/data", recursive = TRUE, showWarnings = FALSE) |
| 157 | +write_json(golden, out_path, auto_unbox = TRUE, pretty = TRUE, digits = 14) |
| 158 | +cat("Golden values written to", out_path, "\n") |
| 159 | +for (name in c("dgp1", "dgp2", "dgp3", "dgp4", "dgp5")) { |
| 160 | + cat(sprintf("%s: tau.bc = %.6f, se.rb = %.6f, h = %.6f, b = %.6f\n", |
| 161 | + name, golden[[name]]$tau_bc, golden[[name]]$se_rb, |
| 162 | + golden[[name]]$h, golden[[name]]$b)) |
| 163 | +} |
0 commit comments