diff --git a/bench/.gitignore b/bench/.gitignore
new file mode 100644
index 00000000..4baf56cf
--- /dev/null
+++ b/bench/.gitignore
@@ -0,0 +1,2 @@
+# Benchmark run artifacts are never committed (Spec 065 CN-003).
+results/
diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 00000000..c99a3868
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,135 @@
+# mcpproxy benchmark harness
+
+The reproducible numbers behind mcpproxy's marketing claims — **token reduction**,
+**discovery accuracy**, and **latency** — comparing three ways an agent can be
+wired to upstream MCP tools.
+
+> Roadmap item #19 (MCP-42). In-repo (`bench/`), reproducible, intended to be
+> refreshed on release. Reports are **never committed** (Spec 065 CN-003); only
+> code, fixtures, and this methodology are versioned.
+
+## The three modes
+
+| Mode | What the agent sees in context | mcpproxy server |
+|------|--------------------------------|-----------------|
+| `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) |
+| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache` + `code_execution` + management tools; tools found on demand via BM25 | `callToolServer` |
+| `code_execution` | `code_execution` + `retrieve_tools` + management tools; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` |
+
+Both proxy modes also append the shared **management tool set** —
+`upstream_servers`, `quarantine_security`, `search_servers`, `list_registries`
+— that the live routing-mode servers expose. These count against the proxy
+context cost: omitting them undercounts that cost and inflates the savings.
+
+The per-mode catalog is **derived directly from the live tool builders**
+(`buildCallToolModeTools` / `buildCodeExecModeTools` in
+`internal/server/mcp_routing.go`, via `server.ProxyModeToolDefs`), so it can
+never drift from production.
+
+## What ships today (deterministic, offline)
+
+The **token-reduction** measurement is fully deterministic and runs with no
+network or LLM (run from the repository root; the default paths are relative):
+
+```bash
+go run ./bench/cmd/bench            # scores the committed Spec 065 corpus
+go test ./bench/                    # unit + invariant tests
+```
+
+It counts the context-token cost of each mode over a **frozen tool corpus** and
+reports the savings of each proxy mode versus the baseline. Output: a
+`report.json` and a self-contained `dashboard.html` in `bench/results/`
+(gitignored).
+
+### Current deterministic result
+
+Over the 45-tool Spec 065 reference corpus, counting **tool name + description
+only** (schemas excluded uniformly — see limitations), `cl100k_base`:
+
+| Mode | Context tools | Tokens | Savings vs. baseline |
+|------|---------------|--------|----------------------|
+| `baseline` | 45 | 1730 | — |
+| `retrieve_tools` | 10 | 1431 | **~17%** |
+| `code_execution` | 6 | 986 | **~43%** |
+
+These are deliberately modest: the proxy context here is the *full* per-mode
+tool set (discovery + call-tool variants + management tools), and the corpus is
+small. Savings approach 100% asymptotically as the upstream tool count rises (the
+baseline grows linearly while the proxy context stays fixed) — always quote the
+corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`.
+
+### Scoring rubric — token reduction
+
+- **Tool universe**: the frozen Spec 065 snapshot
+  `specs/065-evaluation-foundation/datasets/corpus_v1.tools.json` — 45 tools
+  across 7 no-auth reference servers. Frozen + versioned so scoring never runs
+  against a drifting corpus (CN-002).
+- **Tokenizer**: `tiktoken cl100k_base`, a widely-used reproducible BPE
+  (already a repo dependency). It is a **model-agnostic estimator**; exact
+  counts for a specific pinned model (e.g. Claude) will differ, but the
+  *relative* savings between modes are stable.
+- **Proxy-mode tools**: the *complete* per-mode catalog, derived from the live
+  server builders — discovery, the call-tool variants, `code_execution`, **and
+  the shared management tool set** (`upstream_servers`, `quarantine_security`,
+  `search_servers`, `list_registries`). Nothing the agent actually sees is
+  dropped from the proxy cost.
+- **Cost of a tool**: `name + "\n" + description`. JSON input schemas are
+  excluded **uniformly** across all modes (the committed corpus snapshot does
+  not carry schemas).
+- **Savings** for a mode `m`: `1 - tokens(m) / tokens(baseline)`.
+
+### Known limitations (read before quoting a number)
+
+- **Schemas excluded — direction is not clean.** Input schemas are dropped from
+  *both* sides. The 45 baseline tools lose their schemas, but so do the proxy
+  modes' management tools (e.g. `upstream_servers` carries a large multi-field
+  schema). So the name+description-only number is **not** unambiguously
+  conservative — it is its own well-defined metric. The planned live run (see
+  the follow-ups below) adds full schemas from `GET /api/v1/tools` for the exact
+  headline number; quote that for marketing, not this offline estimate.
+- **Savings scale with tool count.** The 45-tool reference corpus is small; real
+  deployments expose hundreds–thousands of tools, where the baseline grows
+  linearly and the proxy context stays fixed, so savings approach 100%.
+  Quote the corpus size alongside any percentage.
+- **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer
+  for the headline model is a follow-up (see "What is scoped but not yet built").
+
+## What is scoped but not yet built (follow-ups)
+
+These require decisions and/or other roles, so they are tracked as child issues
+rather than landed here:
+
+- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the
+  Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull
+  `GET /api/v1/tools` for exact schemas, and:
+  - **Accuracy**: replay the Spec 065 retrieval golden set
+    (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k /
+    MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer.
+  - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the
+    fixed cost of loading all tools.
+- **End-to-end task success with a pinned LLM** — requires a pinned model + an
+  LLM-call budget; this is the only part that costs spend.
+- **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane.
+
+## Dataset sources & provenance
+
+- Tool corpus + retrieval golden set: Spec 065 frozen datasets
+  (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively
+  reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch,
+  time, sequential-thinking).
+- Proxy + management tool definitions: derived at run time from the live server
+  tool builders (`internal/server/mcp_routing.go` →
+  `buildCallToolModeTools` / `buildCodeExecModeTools`, exposed via
+  `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the
+  benchmark cannot drift from the tools the proxy actually serves.
+
+## Reproducible live run (skeleton)
+
+`docker-compose.yml` boots mcpproxy over the frozen reference-server config so
+the corpus and live tool list are reproducible across machines. Wiring the live
+accuracy/latency scorers into it is the follow-up above.
+
+## Reviewer contact
+
+Methodology questions / disputes: open an issue in `smart-mcp-proxy/mcpproxy-go`
+and tag the maintainers, or comment on the roadmap benchmark ticket (MCP-42).
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go
new file mode 100644
index 00000000..a5e924b2
--- /dev/null
+++ b/bench/cmd/bench/main.go
@@ -0,0 +1,52 @@
+// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool
+// corpus and writes a JSON report plus a static HTML dashboard.
+//
+// Usage:
+//
+//	go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME]
+//
+// With no flags it scores the committed Spec 065 frozen corpus and writes the
+// reports to bench/results/ (gitignored — reports are never committed, per the
+// Spec 065 CN-003 repo rule).
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/bench"
+)
+
+func main() {
+	corpusFlag := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot")
+	outFlag := flag.String("out", "bench/results", "output directory for report.json and dashboard.html")
+	encodingFlag := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name")
+	flag.Parse()
+
+	tokenizer, err := bench.NewTokenizer(*encodingFlag)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	frozen, err := bench.LoadCorpus(*corpusFlag)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	rep := bench.ComputeReport(tokenizer, frozen)
+	jsonFile, htmlFile, err := rep.WriteReports(*outFlag)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	// Human-readable summary: a header line, one line per mode, then the paths.
+	stdout := os.Stdout
+	fmt.Fprintf(stdout, "mcpproxy token-reduction benchmark (corpus %s, %d tools, %s)\n", rep.CorpusVersion, rep.CorpusTools, rep.Encoding)
+	for _, res := range rep.Modes {
+		if res.Mode != bench.ModeBaseline {
+			fmt.Fprintf(stdout, "  %-16s %6d tokens (%d tools)  %.1f%% fewer tokens\n", res.Mode, res.Tokens, res.ContextTools, res.SavingsRatio*100)
+		} else {
+			fmt.Fprintf(stdout, "  %-16s %6d tokens (%d tools)  baseline\n", res.Mode, res.Tokens, res.ContextTools)
+		}
+	}
+	fmt.Fprintf(stdout, "wrote %s and %s\n", jsonFile, htmlFile)
+}
diff --git a/bench/docker-compose.yml b/bench/docker-compose.yml
new file mode 100644
index 00000000..7c420947
--- /dev/null
+++ b/bench/docker-compose.yml
@@ -0,0 +1,37 @@
+# Reproducible benchmark substrate (skeleton).
+#
+# Boots mcpproxy over the frozen Spec 065 reference-server config so the tool
+# corpus and live tool list are identical across machines. The live
+# accuracy/latency scorers (see bench/README.md "follow-ups") attach to this.
+#
+# Usage:
+#   docker compose -f bench/docker-compose.yml up --build
+#   # then, against the running proxy on 127.0.0.1:8092:
+#   #   GET /api/v1/tools     -> full tool defs (with schemas) for the live token run
+#   #   retrieve_tools        -> Recall@k accuracy over retrieval_golden_v1.json
+#
+# The committed corpus_v1 snapshot was frozen from exactly this config
+# (specs/065-evaluation-foundation/datasets/README.md), so a live snapshot here
+# reproduces it (modulo upstream-server version drift — pin images before
+# publishing headline numbers).
+services:
+  mcpproxy:
+    build:
+      context: ..
+      dockerfile: Dockerfile
+    command:
+      - serve
+      - --config=/data/snapshot-servers.config.json
+      - --data-dir=/data/state
+      - --listen=0.0.0.0:8092
+    environment:
+      MCPPROXY_API_KEY: eval-corpus-snapshot
+    ports:
+      - "127.0.0.1:8092:8092"
+    volumes:
+      # The frozen, servable reference-server config (7 no-auth servers).
+      - ../specs/065-evaluation-foundation/datasets/snapshot-servers.config.json:/data/snapshot-servers.config.json:ro
+      - bench-state:/data/state
+
+volumes:
+  bench-state:
diff --git a/bench/proxytools.go b/bench/proxytools.go
new file mode 100644
index 00000000..dda5edd4
--- /dev/null
+++ b/bench/proxytools.go
@@ -0,0 +1,40 @@
+package bench
+
+import (
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/server"
+)
+
+// ProxyToolsForMode returns the mcpproxy-provided tools (proxy + management)
+// that sit in the agent's context window for the given benchmark mode.
+//
+// Nothing is hand-maintained here: the definitions come from
+// internal/server.ProxyModeToolDefs, which is documented to wrap the live
+// builders in internal/server/mcp_routing.go (buildCallToolModeTools /
+// buildCodeExecModeTools). Both routing modes include the shared management
+// tools (upstream_servers, quarantine_security, search_servers,
+// list_registries), so they are counted in the proxy context cost rather than
+// silently dropped — the undercount behind MCP-3161. ModeBaseline and any
+// unrecognized mode yield nil; each returned ToolID is "mcpproxy:" + Name.
+func ProxyToolsForMode(mode string) []Tool {
+	var defs []server.BenchProxyToolDef
+	switch mode {
+	case ModeRetrieveTools:
+		defs = server.ProxyModeToolDefs(config.RoutingModeRetrieveTools)
+	case ModeCodeExecution:
+		defs = server.ProxyModeToolDefs(config.RoutingModeCodeExecution)
+	default:
+		return nil
+	}
+
+	// Project each server-side definition onto the benchmark's Tool shape.
+	tools := make([]Tool, len(defs))
+	for i, def := range defs {
+		tools[i] = Tool{
+			ToolID:      "mcpproxy:" + def.Name,
+			Name:        def.Name,
+			Description: def.Description,
+		}
+	}
+	return tools
+}
diff --git a/bench/report.go b/bench/report.go
new file mode 100644
index 00000000..0ccbee4f
--- /dev/null
+++ b/bench/report.go
@@ -0,0 +1,105 @@
+package bench
+
+import (
+	"encoding/json"
+	"fmt"
+	"html/template"
+	"os"
+	"path/filepath"
+)
+
+// WriteJSON marshals the report as indented JSON plus a trailing newline to path.
+func (r *Report) WriteJSON(path string) error {
+	payload, err := json.MarshalIndent(r, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal report: %w", err)
+	}
+	if werr := os.WriteFile(path, append(payload, '\n'), 0o644); werr != nil {
+		return fmt.Errorf("write %q: %w", path, werr)
+	}
+	return nil
+}
+
+// WriteHTML renders the report as a self-contained static dashboard: a single
+// file with no external assets, publishable as-is to a static host. It fails if
+// the template cannot be parsed or rendered, or the file created or closed.
+func (r *Report) WriteHTML(path string) error {
+	tmpl, err := template.New("dashboard").Funcs(template.FuncMap{
+		"pct": func(f float64) string { return fmt.Sprintf("%.1f%%", f*100) },
+	}).Parse(dashboardHTML)
+	if err != nil {
+		return fmt.Errorf("parse template: %w", err)
+	}
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("create %q: %w", path, err)
+	}
+	if err := tmpl.Execute(f, r); err != nil {
+		_ = f.Close() // best effort: the render error is the one worth reporting
+		return fmt.Errorf("render dashboard: %w", err)
+	}
+	return f.Close() // surfaces a failed flush; the *os.PathError already names op and path
+}
+
+// WriteReports ensures dir exists, then writes report.json and dashboard.html
+// into it. Both returned paths are empty whenever err is non-nil.
+func (r *Report) WriteReports(dir string) (jsonPath, htmlPath string, err error) {
+	if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
+		return "", "", fmt.Errorf("mkdir %q: %w", dir, mkErr)
+	}
+	jsonOut, htmlOut := filepath.Join(dir, "report.json"), filepath.Join(dir, "dashboard.html")
+	if writeErr := r.WriteJSON(jsonOut); writeErr != nil {
+		return "", "", writeErr
+	}
+	if writeErr := r.WriteHTML(htmlOut); writeErr != nil {
+		return "", "", writeErr
+	}
+	return jsonOut, htmlOut, nil
+}
+
+const dashboardHTML = `<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>mcpproxy benchmark — token reduction</title>
+<style>
+  :root { color-scheme: light dark; }
+  body { font: 16px/1.5 system-ui, sans-serif; max-width: 880px; margin: 2rem auto; padding: 0 1rem; }
+  h1 { margin-bottom: .25rem; }
+  .sub { opacity: .7; margin-top: 0; }
+  table { border-collapse: collapse; width: 100%; margin: 1.5rem 0; }
+  th, td { padding: .6rem .8rem; text-align: right; border-bottom: 1px solid #8884; }
+  th:first-child, td:first-child { text-align: left; }
+  .save { font-weight: 600; color: #1a8f3c; }
+  code { background: #8881; padding: .1rem .35rem; border-radius: 4px; }
+  .notes { font-size: .9rem; opacity: .8; }
+  .notes li { margin: .3rem 0; }
+</style>
+</head>
+<body>
+<h1>mcpproxy benchmark</h1>
+<p class="sub">Token cost of loading tools into an agent's context, by routing mode.</p>
+<p>Corpus <code>{{.CorpusVersion}}</code> &middot; {{.CorpusTools}} tools &middot; encoding <code>{{.Encoding}}</code></p>
+<table>
+  <thead>
+    <tr><th>Mode</th><th>Tools in context</th><th>Context tokens</th><th>Savings vs. baseline</th></tr>
+  </thead>
+  <tbody>
+  {{range .Modes}}
+    <tr>
+      <td><code>{{.Mode}}</code></td>
+      <td>{{.ContextTools}}</td>
+      <td>{{.Tokens}}</td>
+      <td class="save">{{if eq .Mode "baseline"}}&mdash;{{else}}{{pct .SavingsRatio}}{{end}}</td>
+    </tr>
+  {{end}}
+  </tbody>
+</table>
+<h2>Methodology notes</h2>
+<ul class="notes">
+{{range .Notes}}<li>{{.}}</li>{{end}}
+</ul>
+</body>
+</html>
+`
diff --git a/bench/tokens.go b/bench/tokens.go
new file mode 100644
index 00000000..e61b3ed4
--- /dev/null
+++ b/bench/tokens.go
@@ -0,0 +1,173 @@
+// Package bench is the mcpproxy benchmark harness (roadmap #19 / MCP-42).
+//
+// It produces the reproducible numbers behind mcpproxy's marketing claims —
+// token reduction, discovery accuracy, and latency — by comparing three ways
+// an agent can be wired to upstream MCP tools:
+//
+//   - baseline: every upstream tool definition is loaded directly into the
+//     agent's context (no proxy discovery).
+//   - retrieve_tools: mcpproxy's discovery + call_tool variants + management
+//     tools occupy the context; upstream tools are found on demand via BM25.
+//   - code_execution: code_execution + retrieve_tools + management tools occupy
+//     the context; many tools are orchestrated from sandboxed JS in one round-trip.
+//
+// The token-reduction measurement in this file is fully deterministic and
+// offline: it counts the context cost of each mode over a frozen tool corpus
+// using the tiktoken cl100k_base encoding (a reproducible, model-agnostic
+// estimator). It reuses the Spec 065 frozen corpus
+// (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json) as its tool
+// universe so the benchmark scores a versioned, non-drifting snapshot (CN-002).
+//
+// Methodology, limitations, and the live (docker-compose) run that adds full
+// JSON input schemas and end-to-end accuracy/latency are documented in
+// bench/README.md.
+package bench
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/pkoukk/tiktoken-go"
+)
+
+// DefaultEncoding is the tiktoken encoding used for token estimation. cl100k_base
+// is a widely-used, reproducible BPE; exact counts for a specific pinned model
+// (e.g. Claude) will differ, but the *relative* savings between modes are stable.
+const DefaultEncoding = "cl100k_base"
+
+// Routing modes the benchmark compares. The mode names mirror the mcpproxy
+// MCP servers in internal/server/mcp.go (codeExecServer, callToolServer).
+const (
+	ModeBaseline      = "baseline"
+	ModeRetrieveTools = "retrieve_tools"
+	ModeCodeExecution = "code_execution"
+)
+
+// Tool is a single tool definition the benchmark scores token cost over. Its
+// JSON tags follow the Spec 065 corpus snapshot read by LoadCorpus; the proxy
+// tools built by ProxyToolsForMode reuse the same struct with Server unset.
+type Tool struct {
+	ToolID      string `json:"tool_id"`
+	Server      string `json:"server"`
+	Name        string `json:"tool"`
+	Description string `json:"description"`
+}
+
+// Corpus is a frozen, versioned set of tool definitions.
+type Corpus struct {
+	Version string `json:"version"` // snapshot version; ComputeReport copies it into Report.CorpusVersion
+	Tools   []Tool `json:"tools"`   // tool universe scored as the baseline; LoadCorpus rejects an empty list
+}
+
+// LoadCorpus reads a frozen corpus snapshot (e.g. the Spec 065
+// corpus_v1.tools.json) from disk.
+func LoadCorpus(path string) (*Corpus, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read corpus %q: %w", path, err)
+	}
+	var c Corpus
+	if err := json.Unmarshal(data, &c); err != nil {
+		return nil, fmt.Errorf("parse corpus %q: %w", path, err)
+	}
+	if len(c.Tools) == 0 {
+		return nil, fmt.Errorf("corpus %q contains no tools", path)
+	}
+	return &c, nil
+}
+
+// Tokenizer wraps a tiktoken encoding for reproducible token estimation.
+type Tokenizer struct {
+	enc      *tiktoken.Tiktoken // encoder returned by tiktoken.GetEncoding in NewTokenizer
+	encoding string             // encoding name as passed to NewTokenizer; reported in Report.Encoding
+}
+
+// NewTokenizer looks up the named tiktoken encoding and wraps it in a Tokenizer.
+func NewTokenizer(encoding string) (*Tokenizer, error) {
+	bpe, err := tiktoken.GetEncoding(encoding)
+	if err != nil {
+		return nil, fmt.Errorf("load tiktoken encoding %q: %w", encoding, err)
+	}
+	return &Tokenizer{encoding: encoding, enc: bpe}, nil
+}
+
+// Count returns the number of tokens the wrapped encoding produces for text.
+func (t *Tokenizer) Count(text string) int {
+	return len(t.enc.Encode(text, nil, nil)) // nil, nil: no special-token allow/deny lists are passed
+}
+
+// CountTool returns the context-token cost of one tool definition, measured as
+// the tokens in its name and description joined by a newline.
+//
+// Input JSON schemas are left out for every mode, because the committed Spec 065
+// corpus snapshot does not carry them. That removes schemas from the baseline's
+// upstream tools AND from the proxy modes' management tools (upstream_servers,
+// for one, has a large multi-field schema), so the result is a well-defined
+// name+description-only metric rather than an unambiguously conservative one.
+// bench/README.md describes the live run that adds full schemas.
+func (t *Tokenizer) CountTool(tl Tool) int {
+	text := tl.Name + "\n" + tl.Description
+	return t.Count(text)
+}
+
+func (t *Tokenizer) countTools(tools []Tool) int {
+	sum := 0 // running total of CountTool over every tool; stays 0 for an empty slice
+	for i := range tools {
+		sum += t.CountTool(tools[i])
+	}
+	return sum
+}
+
+// ModeResult is the per-mode context-cost outcome.
+type ModeResult struct {
+	Mode         string  `json:"mode"`                // one of ModeBaseline, ModeRetrieveTools, ModeCodeExecution
+	ContextTools int     `json:"context_tools"`       // number of tool definitions counted for this mode
+	Tokens       int     `json:"tokens"`              // summed CountTool cost of those definitions
+	SavingsRatio float64 `json:"savings_vs_baseline"` // 1 - Tokens/baseline tokens; 0 for the baseline itself
+}
+
+// Report is the full token-reduction benchmark result.
+type Report struct {
+	Encoding      string       `json:"encoding"`       // tiktoken encoding name the counts were produced with
+	CorpusVersion string       `json:"corpus_version"` // Corpus.Version of the scored snapshot
+	CorpusTools   int          `json:"corpus_tools"`   // number of tools in the corpus (the baseline tool count)
+	Modes         []ModeResult `json:"modes"`          // baseline first, then retrieve_tools and code_execution
+	Notes         []string     `json:"notes"`          // methodology caveats, rendered in the HTML dashboard
+}
+
+// ComputeReport scores every mode over the corpus. The baseline loads all
+// corpus tools directly, while each proxy mode loads only the tools returned by
+// ProxyToolsForMode. A proxy mode's savings ratio is
+// 1 - tokens/baselineTokens, defined as 0 when the baseline has no tokens; the
+// baseline row always reports a ratio of 0.
+func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report {
+	baseTokens := tk.countTools(corpus.Tools)
+
+	// score builds the result row for one proxy mode relative to the baseline.
+	score := func(mode string) ModeResult {
+		tools := ProxyToolsForMode(mode)
+		row := ModeResult{Mode: mode, ContextTools: len(tools), Tokens: tk.countTools(tools)}
+		if baseTokens != 0 {
+			row.SavingsRatio = 1.0 - float64(row.Tokens)/float64(baseTokens)
+		}
+		return row
+	}
+
+	return &Report{
+		Encoding:      tk.encoding,
+		CorpusVersion: corpus.Version,
+		CorpusTools:   len(corpus.Tools),
+		Modes: []ModeResult{
+			{Mode: ModeBaseline, ContextTools: len(corpus.Tools), Tokens: baseTokens, SavingsRatio: 0},
+			score(ModeRetrieveTools),
+			score(ModeCodeExecution),
+		},
+		Notes: []string{
+			"Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.",
+			"Proxy-mode tools are the full per-mode catalog derived from the live server builders (internal/server.ProxyModeToolDefs), including the shared management tool set (upstream_servers, quarantine_security, search_servers, list_registries).",
+			"Counts tool name + description only; JSON input schemas are excluded uniformly from both the baseline and the proxy modes, so this is a name+description-only metric (not unambiguously conservative). See bench/README.md for the live run with full schemas.",
+			"Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.",
+		},
+	}
+}
diff --git a/bench/tokens_test.go b/bench/tokens_test.go
new file mode 100644
index 00000000..26296fe4
--- /dev/null
+++ b/bench/tokens_test.go
@@ -0,0 +1,218 @@
+package bench
+
+import (
+	"bytes"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// repoCorpus is the committed Spec 065 frozen corpus, reused here as the
+// benchmark's tool universe (45 tools, 7 no-auth reference servers).
+const repoCorpus = "../specs/065-evaluation-foundation/datasets/corpus_v1.tools.json"
+
+func newTestTokenizer(t *testing.T) *Tokenizer {
+	t.Helper() // report failures at the calling test's line
+	tokenizer, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		t.Fatalf("NewTokenizer: %v", err)
+	}
+	return tokenizer
+}
+
+func TestTokenizer_DeterministicAndPositive(t *testing.T) {
+	tk := newTestTokenizer(t)
+	text := "Fetches a URL from the internet and extracts its contents as markdown."
+	a := tk.Count(text)
+	b := tk.Count(text)
+	if a != b {
+		t.Fatalf("tokenizer not deterministic: %d != %d", a, b)
+	}
+	if a <= 0 {
+		t.Fatalf("expected positive token count, got %d", a)
+	}
+}
+
+func TestProxyToolsForMode(t *testing.T) {
+	rt := ProxyToolsForMode(ModeRetrieveTools)
+	if len(rt) == 0 {
+		t.Fatal("retrieve_tools mode exposes no proxy tools")
+	}
+	// retrieve_tools mode must expose the discovery tool + the call_tool variants.
+	want := map[string]bool{
+		"retrieve_tools":        false,
+		"call_tool_read":        false,
+		"call_tool_write":       false,
+		"call_tool_destructive": false,
+	}
+	for _, tl := range rt {
+		if _, ok := want[tl.Name]; ok {
+			want[tl.Name] = true
+		}
+	}
+	for name, found := range want {
+		if !found {
+			t.Errorf("retrieve_tools mode missing expected proxy tool %q", name)
+		}
+	}
+
+	ce := ProxyToolsForMode(ModeCodeExecution)
+	var hasCodeExec, hasRetrieve bool
+	for _, tl := range ce {
+		switch tl.Name {
+		case "code_execution":
+			hasCodeExec = true
+		case "retrieve_tools":
+			hasRetrieve = true
+		}
+	}
+	if !hasCodeExec || !hasRetrieve {
+		t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce))
+	}
+
+	// Both routing modes append the shared management tool set
+	// (internal/server/mcp_routing.go buildManagementTools). Omitting these
+	// undercounts the proxy-mode context cost and overstates the savings
+	// (MCP-3161 / Codex finding on PR #747). Assert they are present so the
+	// benchmark catalog can never silently drop them again.
+	mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"}
+	for _, mode := range []string{ModeRetrieveTools, ModeCodeExecution} {
+		got := map[string]bool{}
+		for _, tl := range ProxyToolsForMode(mode) {
+			got[tl.Name] = true
+			if tl.Description == "" {
+				t.Errorf("mode %s: tool %q has empty description", mode, tl.Name)
+			}
+		}
+		for _, name := range mgmt {
+			if !got[name] {
+				t.Errorf("mode %s: missing management tool %q (proxy context cost undercounted)", mode, name)
+			}
+		}
+	}
+}
+
+func TestComputeReport_SavingsAreReal(t *testing.T) {
+	tk := newTestTokenizer(t)
+	corpus, err := LoadCorpus(filepath.Clean(repoCorpus))
+	if err != nil {
+		t.Fatalf("LoadCorpus: %v", err)
+	}
+	if len(corpus.Tools) < 40 {
+		t.Fatalf("expected the frozen corpus to have ~45 tools, got %d", len(corpus.Tools))
+	}
+
+	rep := ComputeReport(tk, corpus)
+
+	modes := map[string]ModeResult{}
+	for _, m := range rep.Modes {
+		modes[m.Mode] = m
+	}
+
+	base, ok := modes[ModeBaseline]
+	if !ok {
+		t.Fatal("report missing baseline mode")
+	}
+	if base.SavingsRatio != 0 {
+		t.Errorf("baseline savings must be 0, got %v", base.SavingsRatio)
+	}
+	if base.Tokens <= 0 {
+		t.Fatalf("baseline tokens must be positive, got %d", base.Tokens)
+	}
+
+	// The product thesis: each proxy mode loads fewer tokens into context than
+	// the baseline, with a savings ratio in (0,1) that matches the arithmetic.
+	// A mode missing from the report must fail, not pass on a zero ModeResult.
+	for _, name := range []string{ModeRetrieveTools, ModeCodeExecution} {
+		m, found := modes[name]
+		if !found {
+			t.Errorf("report missing %s mode", name)
+			continue
+		}
+		if m.Tokens >= base.Tokens {
+			t.Errorf("%s (%d) should use fewer tokens than baseline (%d)", name, m.Tokens, base.Tokens)
+		}
+		want := 1.0 - float64(m.Tokens)/float64(base.Tokens)
+		if diff := m.SavingsRatio - want; diff > 1e-9 || diff < -1e-9 {
+			t.Errorf("%s savings ratio %v != computed %v", name, m.SavingsRatio, want)
+		}
+		if m.SavingsRatio <= 0 || m.SavingsRatio >= 1 {
+			t.Errorf("%s savings ratio out of (0,1): %v", name, m.SavingsRatio)
+		}
+	}
+}
+
+func TestComputeReport_BaselineMonotonic(t *testing.T) {
+	tk := newTestTokenizer(t)
+	tools := []Tool{
+		{ToolID: "a:1", Server: "a", Name: "one", Description: "alpha tool that does something useful"},
+		{ToolID: "b:2", Server: "b", Name: "two", Description: "beta tool that does something else useful"},
+		{ToolID: "c:3", Server: "c", Name: "three", Description: "gamma tool with a longer description for token weight"},
+	}
+
+	// baselineTokens scores a corpus and returns its baseline row's tokens (-1 if absent).
+	baselineTokens := func(c *Corpus) int {
+		for _, m := range ComputeReport(tk, c).Modes {
+			if m.Mode == ModeBaseline {
+				return m.Tokens
+			}
+		}
+		return -1
+	}
+
+	big := baselineTokens(&Corpus{Version: "test", Tools: tools})
+	small := baselineTokens(&Corpus{Version: "test", Tools: tools[:1]})
+	if big <= small {
+		t.Errorf("more tools must mean more baseline tokens: %d <= %d", big, small)
+	}
+}
+
+func TestWriteReports_SmokeTest(t *testing.T) {
+	rep := ComputeReport(newTestTokenizer(t), &Corpus{Version: "test", Tools: []Tool{
+		{ToolID: "a:1", Server: "a", Name: "tool_a", Description: "does something"},
+	}})
+
+	// Write into a per-test temporary directory so no artifacts land in the
+	// repository tree.
+	jsonPath, htmlPath, err := rep.WriteReports(t.TempDir())
+	if err != nil {
+		t.Fatalf("WriteReports: %v", err)
+	}
+
+	// JSON must parse back to a Report with the right corpus version.
+	rawJSON, err := os.ReadFile(jsonPath)
+	if err != nil {
+		t.Fatalf("read json: %v", err)
+	}
+	var decoded Report
+	if err := json.Unmarshal(rawJSON, &decoded); err != nil {
+		t.Fatalf("unmarshal json: %v", err)
+	}
+	if decoded.CorpusVersion != "test" {
+		t.Errorf("corpus version = %q, want %q", decoded.CorpusVersion, "test")
+	}
+
+	// HTML must be non-empty and contain the mode names.
+	page, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("read html: %v", err)
+	}
+	if len(page) < 100 {
+		t.Fatalf("dashboard.html too short (%d bytes)", len(page))
+	}
+	for _, mode := range []string{ModeBaseline, ModeRetrieveTools, ModeCodeExecution} {
+		if bytes.Contains(page, []byte(mode)) {
+			continue
+		}
+		t.Errorf("dashboard.html missing mode %q", mode)
+	}
+}
+
+func toolNames(ts []Tool) []string {
+	names := make([]string, 0, len(ts)) // one name per tool, in the order of ts
+	for _, tool := range ts {
+		names = append(names, tool.Name)
+	}
+	return names
+}
diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go
new file mode 100644
index 00000000..95987195
--- /dev/null
+++ b/internal/server/bench_export.go
@@ -0,0 +1,57 @@
+package server
+
+import (
+	mcpserver "github.com/mark3labs/mcp-go/server"
+	"go.uber.org/zap"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+)
+
+// BenchProxyToolDef is a static built-in proxy/management tool definition
+// (name + description) exposed for the in-repo benchmark harness (bench/).
+//
+// The benchmark scores the per-mode context cost an agent pays for mcpproxy's
+// own tools. That cost MUST reflect every tool the live routing-mode servers
+// expose — including the shared management tool set (upstream_servers,
+// quarantine_security, search_servers, list_registries) that both modes append
+// via buildManagementTools — or the benchmark overstates the token savings
+// (MCP-3161 / Codex finding on PR #747).
+type BenchProxyToolDef struct {
+	Name        string // tool name, copied by ProxyModeToolDefs from ServerTool.Tool.Name
+	Description string // tool description, copied from ServerTool.Tool.Description
+}
+
+// ProxyModeToolDefs lists the name and description of every built-in proxy and
+// management tool the given routing mode puts in an agent's context window.
+// Pass config.RoutingModeCodeExecution for the code-execution catalog; any
+// other value (including config.RoutingModeRetrieveTools) yields the call-tool
+// catalog.
+//
+// The list is produced by the routing-mode builders themselves
+// (buildCallToolModeTools / buildCodeExecModeTools), run with
+// EnableCodeExecution set so code_execution is described as an enabled tool.
+func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef {
+	// Throwaway server value: only a no-op logger and a minimal config are set.
+	proxy := &MCPProxyServer{
+		logger: zap.NewNop(),
+		config: &config.Config{
+			EnableCodeExecution: true,
+		},
+	}
+
+	var serverTools []mcpserver.ServerTool
+	if routingMode == config.RoutingModeCodeExecution {
+		serverTools = proxy.buildCodeExecModeTools()
+	} else { // retrieve_tools — the default routing mode
+		serverTools = proxy.buildCallToolModeTools()
+	}
+
+	defs := make([]BenchProxyToolDef, len(serverTools))
+	for i := range serverTools {
+		defs[i] = BenchProxyToolDef{
+			Name:        serverTools[i].Tool.Name,
+			Description: serverTools[i].Tool.Description,
+		}
+	}
+	return defs
+}
diff --git a/internal/server/bench_export_test.go b/internal/server/bench_export_test.go
new file mode 100644
index 00000000..5dd63132
--- /dev/null
+++ b/internal/server/bench_export_test.go
@@ -0,0 +1,64 @@
+package server
+
+import (
+	"testing"
+
+	mcpserver "github.com/mark3labs/mcp-go/server"
+	"go.uber.org/zap"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+)
+
+// TestProxyModeToolDefs_IncludesManagementTools guards the MCP-3161 benchmark
+// integrity fix: both routing modes must list the shared management tools, and
+// every listed tool must have a description, otherwise the benchmark would
+// undercount the proxy-mode context cost and overstate the savings.
+func TestProxyModeToolDefs_IncludesManagementTools(t *testing.T) {
+	required := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"}
+	for _, mode := range []string{config.RoutingModeRetrieveTools, config.RoutingModeCodeExecution} {
+		defs := ProxyModeToolDefs(mode)
+		if len(defs) == 0 {
+			t.Fatalf("mode %s: no proxy tool defs", mode)
+		}
+		seen := make(map[string]struct{}, len(defs))
+		for _, def := range defs {
+			seen[def.Name] = struct{}{}
+			if def.Description == "" {
+				t.Errorf("mode %s: tool %q has empty description", mode, def.Name)
+			}
+		}
+		for _, name := range required {
+			if _, ok := seen[name]; !ok {
+				t.Errorf("mode %s: missing management tool %q", mode, name)
+			}
+		}
+	}
+}
+
+// TestProxyModeToolDefs_MatchesBuilders checks that ProxyModeToolDefs returns,
+// in order, exactly the names and descriptions produced by the routing-mode
+// builders for a server configured with code execution enabled.
+func TestProxyModeToolDefs_MatchesBuilders(t *testing.T) {
+	p := &MCPProxyServer{logger: zap.NewNop(), config: &config.Config{EnableCodeExecution: true}}
+	cases := []struct {
+		mode  string
+		built []mcpserver.ServerTool
+	}{
+		{config.RoutingModeRetrieveTools, p.buildCallToolModeTools()},
+		{config.RoutingModeCodeExecution, p.buildCodeExecModeTools()},
+	}
+	for _, tc := range cases {
+		defs := ProxyModeToolDefs(tc.mode)
+		if len(defs) != len(tc.built) {
+			t.Fatalf("mode %s: ProxyModeToolDefs len %d != builder len %d", tc.mode, len(defs), len(tc.built))
+		}
+		for i, st := range tc.built {
+			if defs[i].Name != st.Tool.Name {
+				t.Errorf("mode %s: def[%d] name %q != builder %q", tc.mode, i, defs[i].Name, st.Tool.Name)
+			}
+			if defs[i].Description != st.Tool.Description {
+				t.Errorf("mode %s: def[%d] description mismatch for %q", tc.mode, i, defs[i].Name)
+			}
+		}
+	}
+}