Skip to content

Commit ed94e43

Browse files
authored
Merge pull request #335 from igerber/did-no-untreated
Phase 1b: MSE-optimal bandwidth selector (CCF 2018 DPI)
2 parents dab5771 + ee02021 commit ed94e43

9 files changed

Lines changed: 2653 additions & 14 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ target/
7272
.claude/settings.local.json
7373
.claude/reviews/
7474
.claude/paper-review/
75+
.claude/scheduled_tasks.lock
7576

7677
# MCP configuration (may contain tokens)
7778
.mcp.json
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
# Generate nprobust mse-dpi golden values for the Phase 1b parity suite.
2+
#
3+
# This script re-implements the mse-dpi algorithm from
4+
# nprobust::lpbwselect.mse.dpi (nprobust 0.5.0, SHA 36e4e53, source at
5+
# npfunctions.R:498-607 of github.com/nppackages/nprobust) so that every
6+
# intermediate quantity the Python port needs to parity-check is exposed.
7+
# Calling lpbwselect() directly would only surface h and b; we also need
8+
# c.bw, bw.mp2, bw.mp3, and the per-stage (V, B1, B2, R) diagnostics.
9+
# We therefore use nprobust::: internals via getFromNamespace().
10+
#
11+
# Usage:
12+
# Rscript benchmarks/R/generate_nprobust_golden.R
13+
#
14+
# Requirements:
15+
# nprobust (CRAN), jsonlite
16+
#
17+
# Output:
18+
# benchmarks/data/nprobust_mse_dpi_golden.json
19+
#
20+
# Phase 1b of the HeterogeneousAdoptionDiD implementation (de Chaisemartin,
21+
# Ciccia, D'Haultfoeuille & Knau 2026, arXiv:2405.04465v6). Python tests at
22+
# tests/test_bandwidth_selector.py and tests/test_nprobust_port.py load
23+
# this JSON and check agreement to 1% relative tolerance.
24+
25+
library(nprobust)
26+
library(jsonlite)
27+
28+
stopifnot(packageVersion("nprobust") == "0.5.0")
29+
30+
# Internal helper re-implementing lpbwselect.mse.dpi while returning every
31+
# stage output. Mirrors npfunctions.R:498-607 line-by-line (with the even
32+
# / interior branches that apply for p=1, deriv=0, boundary eval).
33+
lprobust_bw <- getFromNamespace("lprobust.bw", "nprobust")
34+
35+
extract_mse_dpi_stages <- function(d, y, kernel = "epa", eval_point = 0.0,
36+
bwcheck = 21, bwregul = 1, vce = "nn") {
37+
p <- 1L; deriv <- 0L; q <- p + 1L
38+
# For HAD (p=1, deriv=0), (p-deriv) %% 2 == 1, so `even` is FALSE.
39+
# nprobust's conditional: if (even==FALSE | interior==TRUE) -> use
40+
# closed-form lprobust.bw$bw; else -> optimize. For HAD this means
41+
# every stage bandwidth comes from the closed form, not optimize.
42+
even <- (p - deriv) %% 2 == 0
43+
interior <- FALSE
44+
45+
N <- length(d)
46+
x_iq <- quantile(d, 0.75) - quantile(d, 0.25)
47+
x_min <- min(d); x_max <- max(d)
48+
range_ <- x_max - x_min
49+
50+
C_c <- switch(kernel, "epa" = 2.34, "uni" = 1.843,
51+
"tri" = 2.576, "gau" = 1.06,
52+
stop("unknown kernel: ", kernel))
53+
54+
c.bw <- C_c * min(sd(d), x_iq / 1.349) * N^(-1/5)
55+
bw.max <- max(abs(eval_point - x_min), abs(eval_point - x_max))
56+
c.bw <- min(c.bw, bw.max)
57+
58+
# Nearest-neighbor precomputation when vce="nn" (npfunctions.R:518-529).
59+
dups <- dupsid <- NULL
60+
if (vce == "nn") {
61+
order_x <- order(d)
62+
d <- d[order_x]; y <- y[order_x]
63+
dups <- integer(N)
64+
for (j in 1:N) dups[j] <- sum(d == d[j])
65+
dupsid <- integer(N); j <- 1L
66+
while (j <= N) {
67+
dupsid[j:(j + dups[j] - 1L)] <- 1:dups[j]
68+
j <- j + dups[j]
69+
}
70+
}
71+
72+
bw.min <- NULL
73+
if (!is.null(bwcheck)) {
74+
bw.min <- sort(abs(d - eval_point))[bwcheck]
75+
c.bw <- max(c.bw, bw.min)
76+
}
77+
78+
# Helper: dispatch between closed-form and optimize exactly as R does.
79+
select_bw <- function(C, exp_bias, exp_var, scale, range_) {
80+
if (!even || interior) {
81+
return(C$bw)
82+
}
83+
fn <- function(H) {
84+
abs(H^exp_bias * (C$B1 + H * C$B2 + scale * C$R)^2 +
85+
C$V / (N * H^exp_var))
86+
}
87+
optimize(fn, interval = c(.Machine$double.eps, range_))$minimum
88+
}
89+
90+
# Stage 2: C.d1 -> bw.mp2 (npfunctions.R:539-546)
91+
C_d1 <- lprobust_bw(y, d, NULL, eval_point, o = q + 1L, nu = q + 1L,
92+
o.B = q + 2L, h.V = c.bw, h.B1 = range_,
93+
h.B2 = range_, scale = 0, vce = vce, nnmatch = 3L,
94+
kernel = kernel, dups = dups, dupsid = dupsid)
95+
bw.mp2 <- select_bw(C_d1,
96+
exp_bias = 2 * (q + 1) + 2 - 2 * (q + 1),
97+
exp_var = 1 + 2 * (q + 1),
98+
scale = 0, range_ = range_)
99+
100+
# Stage 2: C.d2 -> bw.mp3 (npfunctions.R:549-556)
101+
C_d2 <- lprobust_bw(y, d, NULL, eval_point, o = q + 2L, nu = q + 2L,
102+
o.B = q + 3L, h.V = c.bw, h.B1 = range_,
103+
h.B2 = range_, scale = 0, vce = vce, nnmatch = 3L,
104+
kernel = kernel, dups = dups, dupsid = dupsid)
105+
bw.mp3 <- select_bw(C_d2,
106+
exp_bias = 2 * (q + 2) + 2 - 2 * (q + 2),
107+
exp_var = 1 + 2 * (q + 2),
108+
scale = 0, range_ = range_)
109+
110+
# Apply clipping (npfunctions.R:559-565)
111+
bw.mp2 <- min(bw.mp2, bw.max)
112+
bw.mp3 <- min(bw.mp3, bw.max)
113+
if (!is.null(bw.min)) {
114+
bw.mp2 <- max(bw.mp2, bw.min)
115+
bw.mp3 <- max(bw.mp3, bw.min)
116+
}
117+
118+
# Stage 3: C.b -> b.mse.dpi (npfunctions.R:569-580)
119+
C_b <- lprobust_bw(y, d, NULL, eval_point, o = q, nu = p + 1L,
120+
o.B = q + 1L, h.V = c.bw, h.B1 = bw.mp2,
121+
h.B2 = bw.mp3, scale = bwregul, vce = vce,
122+
nnmatch = 3L, kernel = kernel,
123+
dups = dups, dupsid = dupsid)
124+
b.mse.dpi <- select_bw(C_b,
125+
exp_bias = 2 * q + 2 - 2 * (p + 1),
126+
exp_var = 1 + 2 * (p + 1),
127+
scale = bwregul, range_ = range_)
128+
b.mse.dpi <- min(b.mse.dpi, bw.max)
129+
if (!is.null(bw.min)) b.mse.dpi <- max(b.mse.dpi, bw.min)
130+
131+
# Stage 3 final: C.h -> h.mse.dpi (npfunctions.R:585-595)
132+
C_h <- lprobust_bw(y, d, NULL, eval_point, o = p, nu = deriv,
133+
o.B = q, h.V = c.bw, h.B1 = b.mse.dpi,
134+
h.B2 = bw.mp2, scale = bwregul, vce = vce,
135+
nnmatch = 3L, kernel = kernel,
136+
dups = dups, dupsid = dupsid)
137+
h.mse.dpi <- select_bw(C_h,
138+
exp_bias = 2 * p + 2 - 2 * deriv,
139+
exp_var = 1 + 2 * deriv,
140+
scale = bwregul, range_ = range_)
141+
h.mse.dpi <- min(h.mse.dpi, bw.max)
142+
if (!is.null(bw.min)) h.mse.dpi <- max(h.mse.dpi, bw.min)
143+
144+
stage_record <- function(C) {
145+
list(V = as.numeric(C$V), B1 = as.numeric(C$B1),
146+
B2 = as.numeric(C$B2), R = as.numeric(C$R),
147+
bw = as.numeric(C$bw))
148+
}
149+
150+
list(
151+
c_bw = as.numeric(c.bw),
152+
bw_mp2 = as.numeric(bw.mp2),
153+
bw_mp3 = as.numeric(bw.mp3),
154+
b_mse_dpi = as.numeric(b.mse.dpi),
155+
h_mse_dpi = as.numeric(h.mse.dpi),
156+
bw_min = if (is.null(bw.min)) NA_real_ else as.numeric(bw.min),
157+
bw_max = as.numeric(bw.max),
158+
stage_d1 = stage_record(C_d1),
159+
stage_d2 = stage_record(C_d2),
160+
stage_b = stage_record(C_b),
161+
stage_h = stage_record(C_h)
162+
)
163+
}
164+
165+
set.seed(20260419)
166+
167+
# DGP 1: d ~ Uniform(0, 1), y = d + d^2 + N(0, 0.5)
168+
G <- 2000L
169+
d1 <- runif(G, 0, 1)
170+
y1 <- d1 + d1^2 + rnorm(G, 0, 0.5)
171+
172+
# DGP 2: d ~ Beta(2, 2), y = d + d^2 + N(0, 0.5) (f(0) vanishes at boundary)
173+
d2 <- rbeta(G, 2, 2)
174+
y2 <- d2 + d2^2 + rnorm(G, 0, 0.5)
175+
176+
# DGP 3: Half-normal d, y = 0.5 * d^2 + N(0, 1)
177+
d3 <- abs(rnorm(G, 0, 1))
178+
y3 <- 0.5 * d3^2 + rnorm(G, 0, 1)
179+
180+
golden <- list(
181+
metadata = list(
182+
nprobust_version = as.character(packageVersion("nprobust")),
183+
nprobust_sha = "36e4e532d2f7d23d4dc6e162575cca79e0927cda",
184+
seed = 20260419L,
185+
generator = "benchmarks/R/generate_nprobust_golden.R",
186+
algorithm = paste("Port of nprobust::lpbwselect.mse.dpi with all five",
187+
"stage bandwidths plus per-stage (V, B1, B2, R).",
188+
"Evaluation at boundary eval=0 for HAD use case",
189+
"(p=1, deriv=0, interior=FALSE).")
190+
),
191+
dgp1 = c(list(n = G, d = d1, y = y1, kernel = "epa",
192+
description = "Uniform(0,1), polynomial m(d) = d + d^2"),
193+
extract_mse_dpi_stages(d1, y1, kernel = "epa")),
194+
dgp2 = c(list(n = G, d = d2, y = y2, kernel = "epa",
195+
description = "Beta(2,2) - boundary density vanishes at 0"),
196+
extract_mse_dpi_stages(d2, y2, kernel = "epa")),
197+
dgp3 = c(list(n = G, d = d3, y = y3, kernel = "epa",
198+
description = "Half-normal d, quadratic m(d) with unit noise"),
199+
extract_mse_dpi_stages(d3, y3, kernel = "epa"))
200+
)
201+
202+
out_path <- "benchmarks/data/nprobust_mse_dpi_golden.json"
203+
dir.create("benchmarks/data", recursive = TRUE, showWarnings = FALSE)
204+
write_json(golden, out_path, auto_unbox = TRUE, pretty = TRUE, digits = 14)
205+
cat("Golden values written to", out_path, "\n")
206+
cat("DGP 1 h.mse.dpi:", golden$dgp1$h_mse_dpi, "\n")
207+
cat("DGP 2 h.mse.dpi:", golden$dgp2$h_mse_dpi, "\n")
208+
cat("DGP 3 h.mse.dpi:", golden$dgp3$h_mse_dpi, "\n")

benchmarks/data/nprobust_mse_dpi_golden.json

Lines changed: 135 additions & 0 deletions
Large diffs are not rendered by default.

diff_diff/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,12 @@
4545
)
4646
from diff_diff.local_linear import (
4747
KERNELS,
48+
BandwidthResult,
4849
LocalLinearFit,
4950
epanechnikov_kernel,
5051
kernel_moments,
5152
local_linear_fit,
53+
mse_optimal_bandwidth,
5254
triangular_kernel,
5355
uniform_kernel,
5456
)
@@ -412,6 +414,9 @@
412414
"local_linear_fit",
413415
"triangular_kernel",
414416
"uniform_kernel",
417+
# MSE-optimal bandwidth selector (Phase 1b for HeterogeneousAdoptionDiD)
418+
"BandwidthResult",
419+
"mse_optimal_bandwidth",
415420
# Datasets
416421
"load_card_krueger",
417422
"load_castle_doctrine",

0 commit comments

Comments
 (0)