From fb64c91b4ae275b6bef31ddffb3e72e712e20f97 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 15:06:25 +0300
Subject: [PATCH 1/3] feat(bench): token-reduction benchmark harness over
 frozen corpus (MCP-42)

Ship the first, fully-deterministic slice of the roadmap-#19 benchmark: the
token-reduction numbers behind the "massive token savings" claim. Reuses the
frozen Spec 065 tool corpus (45 tools, 7 reference servers) as a versioned,
non-drifting universe and tiktoken cl100k_base (already a dep) as a
reproducible model-agnostic estimator.

Compares the three routing modes' static context cost:
- baseline (all upstream tools loaded directly)
- retrieve_tools (BM25 discovery + call_tool variants)
- code_execution (orchestration + retrieve_tools)

over the corpus and reports per-mode savings. Real proxy tool defs are captured
verbatim from internal/server/mcp.go into bench/proxy_tools_v1.json (provenance
recorded). Emits report.json + a self-contained dashboard.html (gitignored;
reports never committed, per Spec 065 CN-003).

Conservative by construction: excluding input schemas uniformly understates the
baseline, so measured savings (65.5% / 70.3% on the 45-tool corpus) are a floor.

Methodology, limitations, and the scoped-but-not-yet-built follow-ups (live run
with full schemas + accuracy/latency, LLM e2e, CI publish) are in bench/README.md.

Related #MCP-42

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 bench/.gitignore          |   2 +
 bench/README.md           |  99 ++++++++++++++++++++++
 bench/cmd/bench/main.go   |  52 ++++++++++++
 bench/docker-compose.yml  |  37 +++++++++
 bench/proxy_tools_v1.json |  43 ++++++++++
 bench/proxytools.go       |  53 ++++++++++++
 bench/report.go           | 105 +++++++++++++++++++++++
 bench/tokens.go           | 171 ++++++++++++++++++++++++++++++++++++++
 bench/tokens_test.go      | 153 ++++++++++++++++++++++++++++++++++
 9 files changed, 715 insertions(+)
 create mode 100644 bench/.gitignore
 create mode 100644 bench/README.md
 create mode 100644 bench/cmd/bench/main.go
 create mode 100644 bench/docker-compose.yml
 create mode 100644 bench/proxy_tools_v1.json
 create mode 100644 bench/proxytools.go
 create mode 100644 bench/report.go
 create mode 100644 bench/tokens.go
 create mode 100644 bench/tokens_test.go

diff --git a/bench/.gitignore b/bench/.gitignore
new file mode 100644
index 00000000..4baf56cf
--- /dev/null
+++ b/bench/.gitignore
@@ -0,0 +1,2 @@
+# Benchmark run artifacts are never committed (Spec 065 CN-003).
+results/
diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 00000000..cd7227a6
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,99 @@
+# mcpproxy benchmark harness
+
+The reproducible numbers behind mcpproxy's marketing claims — **token reduction**,
+**discovery accuracy**, and **latency** — comparing three ways an agent can be
+wired to upstream MCP tools.
+
+> Roadmap item #19 (MCP-42). In-repo (`bench/`), reproducible, intended to be
+> refreshed on release. Reports are **never committed** (Spec 065 CN-003); only
+> code, fixtures, and this methodology are versioned.
+
+## The three modes
+
+| Mode | What the agent sees in context | mcpproxy server |
+|------|--------------------------------|-----------------|
+| `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) |
+| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache`; tools found on demand via BM25 | `callToolServer` |
+| `code_execution` | `code_execution` + `retrieve_tools`; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` |
+
+(Mode → exposed tools mirrors `internal/server/mcp.go`.)
+
+## What ships today (deterministic, offline)
+
+The **token-reduction** measurement is fully deterministic and runs with no
+network or LLM:
+
+```bash
+go run ./bench/cmd/bench            # run from the repo root; scores the committed Spec 065 corpus
+go test ./bench/                    # unit + invariant tests
+```
+
+It counts the context-token cost of each mode over a **frozen tool corpus** and
+reports the savings of each proxy mode versus the baseline. Output: a
+`report.json` and a self-contained `dashboard.html` in `bench/results/`
+(gitignored).
+
+### Scoring rubric — token reduction
+
+- **Tool universe**: the frozen Spec 065 snapshot
+  `specs/065-evaluation-foundation/datasets/corpus_v1.tools.json` — 45 tools
+  across 7 no-auth reference servers. Frozen + versioned so scoring never runs
+  against a drifting corpus (CN-002).
+- **Tokenizer**: `tiktoken cl100k_base`, a widely-used reproducible BPE
+  (already a repo dependency). It is a **model-agnostic estimator**; exact
+  counts for a specific pinned model (e.g. Claude) will differ, but the
+  *relative* savings between modes are stable.
+- **Cost of a tool**: `name + "\n" + description`. JSON input schemas are
+  excluded **uniformly** across all modes (the committed corpus snapshot does
+  not carry schemas).
+- **Savings** for a mode `m`: `1 - tokens(m) / tokens(baseline)`.
+
+### Known limitations (read before quoting a number)
+
+- **Schemas excluded → conservative.** Upstream tools carry far larger input
+  schemas than mcpproxy's handful of proxy tools, so excluding schemas
+  *understates* the baseline and therefore *understates* the savings. The live
+  run below adds full schemas for the exact headline number.
+- **Savings scale with tool count.** The 45-tool reference corpus is small; real
+  deployments expose hundreds–thousands of tools, where the baseline grows
+  linearly and the proxy context stays fixed, so savings approach the asymptote.
+  Quote the corpus size alongside any percentage.
+- **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer
+  for the headline model is a follow-up (see "What is scoped but not yet built").
+
+## What is scoped but not yet built (follow-ups)
+
+These require decisions and/or other roles, so they are tracked as child issues
+rather than landed here:
+
+- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the
+  Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull
+  `GET /api/v1/tools` for exact schemas, and:
+  - **Accuracy**: replay the Spec 065 retrieval golden set
+    (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k /
+    MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer.
+  - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the
+    fixed cost of loading all tools.
+- **End-to-end task success with a pinned LLM** — requires a pinned model + an
+  LLM-call budget; this is the only part that costs spend.
+- **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane.
+
+## Dataset sources & provenance
+
+- Tool corpus + retrieval golden set: Spec 065 frozen datasets
+  (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively
+  reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch,
+  time, sequential-thinking).
+- Proxy tool definitions: `bench/proxy_tools_v1.json`, captured verbatim from
+  `internal/server/mcp.go` (provenance recorded in the file).
+
+## Reproducible live run (skeleton)
+
+`docker-compose.yml` boots mcpproxy over the frozen reference-server config so
+the corpus and live tool list are reproducible across machines. Wiring the live
+accuracy/latency scorers into it is the follow-up above.
+
+## Reviewer contact
+
+Methodology questions / disputes: open an issue in `smart-mcp-proxy/mcpproxy-go`
+and tag the maintainers, or comment on the roadmap benchmark ticket (MCP-42).
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go
new file mode 100644
index 00000000..a5e924b2
--- /dev/null
+++ b/bench/cmd/bench/main.go
@@ -0,0 +1,52 @@
+// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool
+// corpus and writes a JSON report plus a static HTML dashboard.
+//
+// Usage:
+//
+//	go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME]
+//
+// With no flags it scores the committed Spec 065 frozen corpus and writes the
+// reports to bench/results/ (gitignored — reports are never committed, per the
+// Spec 065 CN-003 repo rule).
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/bench"
+)
+
+func main() {
+	corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot")
+	outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html")
+	encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name")
+	flag.Parse()
+
+	tk, err := bench.NewTokenizer(*encoding)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	corpus, err := bench.LoadCorpus(*corpusPath)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+
+	report := bench.ComputeReport(tk, corpus)
+	jsonPath, htmlPath, err := report.WriteReports(*outDir)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+
+	fmt.Fprintf(os.Stdout, "mcpproxy token-reduction benchmark (corpus %s, %d tools, %s)\n", report.CorpusVersion, report.CorpusTools, report.Encoding)
+	for _, m := range report.Modes {
+		if m.Mode == bench.ModeBaseline {
+			fmt.Fprintf(os.Stdout, "  %-16s %6d tokens (%d tools)  baseline\n", m.Mode, m.Tokens, m.ContextTools)
+			continue
+		}
+		fmt.Fprintf(os.Stdout, "  %-16s %6d tokens (%d tools)  %.1f%% fewer tokens\n", m.Mode, m.Tokens, m.ContextTools, m.SavingsRatio*100)
+	}
+	fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath)
+}
diff --git a/bench/docker-compose.yml b/bench/docker-compose.yml
new file mode 100644
index 00000000..7c420947
--- /dev/null
+++ b/bench/docker-compose.yml
@@ -0,0 +1,37 @@
+# Reproducible benchmark substrate (skeleton).
+#
+# Boots mcpproxy over the frozen Spec 065 reference-server config so the tool
+# corpus and live tool list are identical across machines. The live
+# accuracy/latency scorers (see bench/README.md "follow-ups") attach to this.
+#
+# Usage:
+#   docker compose -f bench/docker-compose.yml up --build
+#   # then, against the running proxy on 127.0.0.1:8092:
+#   #   GET /api/v1/tools     -> full tool defs (with schemas) for the live token run
+#   #   retrieve_tools        -> Recall@k accuracy over retrieval_golden_v1.json
+#
+# The committed corpus_v1 snapshot was frozen from exactly this config
+# (specs/065-evaluation-foundation/datasets/README.md), so a live snapshot here
+# reproduces it (modulo upstream-server version drift — pin images before
+# publishing headline numbers).
+services:
+  mcpproxy:
+    build:
+      context: ..
+      dockerfile: Dockerfile
+    command:
+      - serve
+      - --config=/data/snapshot-servers.config.json
+      - --data-dir=/data/state
+      - --listen=0.0.0.0:8092
+    environment:
+      MCPPROXY_API_KEY: eval-corpus-snapshot
+    ports:
+      - "127.0.0.1:8092:8092"
+    volumes:
+      # The frozen, servable reference-server config (7 no-auth servers).
+      - ../specs/065-evaluation-foundation/datasets/snapshot-servers.config.json:/data/snapshot-servers.config.json:ro
+      - bench-state:/data/state
+
+volumes:
+  bench-state:
diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json
new file mode 100644
index 00000000..26fc14a8
--- /dev/null
+++ b/bench/proxy_tools_v1.json
@@ -0,0 +1,43 @@
+{
+  "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).",
+  "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools (mcp.go:561), call_tool_read/write/destructive variant descriptions (mcp.go:490-528), read_cache (mcp.go:605), code_execution (mcp.go:675). Captured verbatim at origin/main 89f06b5c.",
+  "version": "proxy_v1",
+  "tools": [
+    {
+      "tool_id": "mcpproxy:retrieve_tools",
+      "tool": "retrieve_tools",
+      "description": "🔍 CALL THIS FIRST to discover relevant tools! This is the primary tool discovery mechanism that searches across ALL upstream MCP servers using intelligent BM25 full-text search. Always use this before attempting to call any specific tools. Use natural language to describe what you want to accomplish (e.g., 'create GitHub repository', 'query database', 'weather forecast'). Results include 'annotations' (tool behavior hints like destructiveHint) and 'call_with' recommendation indicating which tool variant to use (call_tool_read/write/destructive). Then use the recommended variant with an 'intent' parameter. NOTE: Quarantined servers are excluded from search results for security. Use 'quarantine_security' tool to examine and manage quarantined servers. TO ADD NEW SERVERS: Use 'list_registries' then 'search_servers' to find and add new MCP servers.",
+      "modes": ["retrieve_tools", "code_execution"]
+    },
+    {
+      "tool_id": "mcpproxy:call_tool_read",
+      "tool": "call_tool_read",
+      "description": "Execute a READ-ONLY tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: search, query, list, get, fetch, find, check, view, read, show, describe, lookup, retrieve, browse, explore, discover, scan, inspect, analyze, examine, validate, verify. Examples: search_files, get_user, list_repositories, query_database, find_issues, check_status. This is the DEFAULT choice when unsure - most tools are read-only.",
+      "modes": ["retrieve_tools"]
+    },
+    {
+      "tool_id": "mcpproxy:call_tool_write",
+      "tool": "call_tool_write",
+      "description": "Execute a STATE-MODIFYING tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: create, update, modify, add, set, send, edit, change, write, post, put, patch, insert, upload, submit, assign, configure, enable, register, subscribe, publish, move, copy, rename, merge. Examples: create_issue, update_file, send_message, add_comment, set_status, edit_page. Use only when explicitly modifying state.",
+      "modes": ["retrieve_tools"]
+    },
+    {
+      "tool_id": "mcpproxy:call_tool_destructive",
+      "tool": "call_tool_destructive",
+      "description": "Execute a DESTRUCTIVE tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: delete, remove, drop, revoke, disable, destroy, purge, reset, clear, unsubscribe, cancel, terminate, close, archive, ban, block, disconnect, kill, wipe, truncate, force, hard. Examples: delete_repo, remove_user, drop_table, revoke_access, clear_cache, terminate_session. Use for irreversible or high-impact operations.",
+      "modes": ["retrieve_tools"]
+    },
+    {
+      "tool_id": "mcpproxy:read_cache",
+      "tool": "read_cache",
+      "description": "Retrieve paginated data when mcpproxy indicates a tool response was truncated. Use the cache key provided in truncation messages to access the complete dataset with pagination.",
+      "modes": ["retrieve_tools"]
+    },
+    {
+      "tool_id": "mcpproxy:code_execution",
+      "tool": "code_execution",
+      "description": "Execute JavaScript or TypeScript code that orchestrates multiple upstream MCP tools in a single request. Use this when you need to combine results from 2+ tools, implement conditional logic, loops, or data transformations that would require multiple round-trips otherwise.\n\n**When to use**: Multi-step workflows with data transformation, conditional logic, error handling, or iterating over results.\n**When NOT to use**: Single tool calls (use call_tool directly), long-running operations (>2 minutes).\n\n**Available in code**:\n- `input` global: Your input data passed via the 'input' parameter\n- `call_tool(serverName, toolName, args)`: Call upstream tools (returns {ok, result} or {ok, error})\n- Modern JavaScript (ES2020+): arrow functions, const/let, template literals, destructuring, classes, for-of, optional chaining (?.), nullish coalescing (??), spread/rest, Promises, Symbols, Map/Set, Proxy/Reflect (no require(), filesystem, or network access)\n\n**TypeScript support**: Set `language: \"typescript\"` to write TypeScript code with type annotations, interfaces, enums, and generics. Types are automatically stripped before execution.\n\n**Important runtime rules**:\n- `call_tool` is strictly SYNCHRONOUS. Do not use `await`.\n- Upstream tools usually return an MCP content array. To parse JSON results: `const data = JSON.parse(res.result.content[0].text);`\n- The last evaluated expression in your script is automatically returned as the final output.\n\n**Security**: Sandboxed execution with timeout enforcement. Respects existing quarantine and server restrictions.",
+      "modes": ["code_execution"]
+    }
+  ]
+}
diff --git a/bench/proxytools.go b/bench/proxytools.go
new file mode 100644
index 00000000..8191d033
--- /dev/null
+++ b/bench/proxytools.go
@@ -0,0 +1,53 @@
+package bench
+
+import (
+	_ "embed"
+	"encoding/json"
+)
+
+//go:embed proxy_tools_v1.json
+var proxyToolsJSON []byte
+
+// proxyTool is a built-in mcpproxy tool definition plus the routing modes that
+// expose it in the agent's context.
+type proxyTool struct {
+	ToolID      string   `json:"tool_id"`
+	Name        string   `json:"tool"`
+	Description string   `json:"description"`
+	Modes       []string `json:"modes"`
+}
+
+type proxyToolFixture struct {
+	Version string      `json:"version"`
+	Tools   []proxyTool `json:"tools"`
+}
+
+var proxyTools proxyToolFixture
+
+func init() {
+	if err := json.Unmarshal(proxyToolsJSON, &proxyTools); err != nil {
+		// The fixture is embedded at build time; a parse failure is a build/test
+		// bug, not a runtime condition.
+		panic("bench: invalid embedded proxy_tools_v1.json: " + err.Error())
+	}
+}
+
+// ProxyToolsForMode returns the built-in proxy tool definitions that occupy the
+// agent's context window in the given routing mode. Provenance for each
+// definition is in proxy_tools_v1.json (captured from internal/server/mcp.go).
+func ProxyToolsForMode(mode string) []Tool {
+	var out []Tool
+	for _, pt := range proxyTools.Tools {
+		for _, m := range pt.Modes {
+			if m == mode {
+				out = append(out, Tool{
+					ToolID:      pt.ToolID,
+					Name:        pt.Name,
+					Description: pt.Description,
+				})
+				break
+			}
+		}
+	}
+	return out
+}
diff --git a/bench/report.go b/bench/report.go
new file mode 100644
index 00000000..0ccbee4f
--- /dev/null
+++ b/bench/report.go
@@ -0,0 +1,105 @@
+package bench
+
+import (
+	"encoding/json"
+	"fmt"
+	"html/template"
+	"os"
+	"path/filepath"
+)
+
+// WriteJSON writes the report as indented JSON to path.
+func (r *Report) WriteJSON(path string) error {
+	data, err := json.MarshalIndent(r, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal report: %w", err)
+	}
+	if err := os.WriteFile(path, append(data, '\n'), 0o644); err != nil {
+		return fmt.Errorf("write %q: %w", path, err)
+	}
+	return nil
+}
+
+// WriteHTML renders the report as a self-contained static dashboard: a single
+// file with no external assets, publishable as-is to a static host (CI publish
+// is a tracked follow-up). Render and close failures are both returned.
+func (r *Report) WriteHTML(path string) error {
+	tmpl, err := template.New("dashboard").Funcs(template.FuncMap{
+		"pct": func(f float64) string { return fmt.Sprintf("%.1f%%", f*100) },
+	}).Parse(dashboardHTML)
+	if err != nil {
+		return fmt.Errorf("parse template: %w", err)
+	}
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("create %q: %w", path, err)
+	}
+	if err := tmpl.Execute(f, r); err != nil {
+		_ = f.Close() // best-effort cleanup; the render error is the one to report
+		return fmt.Errorf("render dashboard: %w", err)
+	}
+	return f.Close() // a failed close can mean lost writes, so don't discard it
+}
+
+// WriteReports writes both report.json and dashboard.html into dir.
+func (r *Report) WriteReports(dir string) (jsonPath, htmlPath string, err error) {
+	if err = os.MkdirAll(dir, 0o755); err != nil {
+		return "", "", fmt.Errorf("mkdir %q: %w", dir, err)
+	}
+	jsonPath = filepath.Join(dir, "report.json")
+	htmlPath = filepath.Join(dir, "dashboard.html")
+	if err = r.WriteJSON(jsonPath); err != nil {
+		return "", "", err
+	}
+	if err = r.WriteHTML(htmlPath); err != nil {
+		return "", "", err
+	}
+	return jsonPath, htmlPath, nil
+}
+
+const dashboardHTML = `<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>mcpproxy benchmark — token reduction</title>
+<style>
+  :root { color-scheme: light dark; }
+  body { font: 16px/1.5 system-ui, sans-serif; max-width: 880px; margin: 2rem auto; padding: 0 1rem; }
+  h1 { margin-bottom: .25rem; }
+  .sub { opacity: .7; margin-top: 0; }
+  table { border-collapse: collapse; width: 100%; margin: 1.5rem 0; }
+  th, td { padding: .6rem .8rem; text-align: right; border-bottom: 1px solid #8884; }
+  th:first-child, td:first-child { text-align: left; }
+  .save { font-weight: 600; color: #1a8f3c; }
+  code { background: #8881; padding: .1rem .35rem; border-radius: 4px; }
+  .notes { font-size: .9rem; opacity: .8; }
+  .notes li { margin: .3rem 0; }
+</style>
+</head>
+<body>
+<h1>mcpproxy benchmark</h1>
+<p class="sub">Token cost of loading tools into an agent's context, by routing mode.</p>
+<p>Corpus <code>{{.CorpusVersion}}</code> &middot; {{.CorpusTools}} tools &middot; encoding <code>{{.Encoding}}</code></p>
+<table>
+  <thead>
+    <tr><th>Mode</th><th>Tools in context</th><th>Context tokens</th><th>Savings vs. baseline</th></tr>
+  </thead>
+  <tbody>
+  {{range .Modes}}
+    <tr>
+      <td><code>{{.Mode}}</code></td>
+      <td>{{.ContextTools}}</td>
+      <td>{{.Tokens}}</td>
+      <td class="save">{{if eq .Mode "baseline"}}&mdash;{{else}}{{pct .SavingsRatio}}{{end}}</td>
+    </tr>
+  {{end}}
+  </tbody>
+</table>
+<h2>Methodology notes</h2>
+<ul class="notes">
+{{range .Notes}}<li>{{.}}</li>{{end}}
+</ul>
+</body>
+</html>
+`
diff --git a/bench/tokens.go b/bench/tokens.go
new file mode 100644
index 00000000..f903fbab
--- /dev/null
+++ b/bench/tokens.go
@@ -0,0 +1,171 @@
+// Package bench is the mcpproxy benchmark harness (roadmap #19 / MCP-42).
+//
+// It produces the reproducible numbers behind mcpproxy's marketing claims —
+// token reduction, discovery accuracy, and latency — by comparing three ways
+// an agent can be wired to upstream MCP tools:
+//
+//   - baseline: every upstream tool definition is loaded directly into the
+//     agent's context (no proxy discovery).
+//   - retrieve_tools: only retrieve_tools, the call_tool variants, and read_cache
+//     occupy the context; tools are found on demand via BM25 search.
+//   - code_execution: only code_execution + retrieve_tools occupy the context;
+//     the agent orchestrates many tools from sandboxed JS in one round-trip.
+//
+// The token-reduction measurement in this file is fully deterministic and
+// offline: it counts the context cost of each mode over a frozen tool corpus
+// using the tiktoken cl100k_base encoding (a reproducible, model-agnostic
+// estimator). It reuses the Spec 065 frozen corpus
+// (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json) as its tool
+// universe so the benchmark scores a versioned, non-drifting snapshot (CN-002).
+//
+// Methodology, limitations, and the live (docker-compose) run that adds full
+// JSON input schemas and end-to-end accuracy/latency are documented in
+// bench/README.md.
+package bench
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/pkoukk/tiktoken-go"
+)
+
+// DefaultEncoding is the tiktoken encoding used for token estimation. cl100k_base
+// is a widely-used, reproducible BPE; exact counts for a specific pinned model
+// (e.g. Claude) will differ, but the *relative* savings between modes are stable.
+const DefaultEncoding = "cl100k_base"
+
+// Routing modes the benchmark compares. The mode names mirror the mcpproxy
+// MCP servers in internal/server/mcp.go (codeExecServer, callToolServer).
+const (
+	ModeBaseline      = "baseline"
+	ModeRetrieveTools = "retrieve_tools"
+	ModeCodeExecution = "code_execution"
+)
+
+// Tool is a single tool definition the benchmark scores token cost over. Its
+// JSON shape is that of the Spec 065 corpus snapshot; embedded proxy-tool
+// fixture entries are converted into Tool values by ProxyToolsForMode.
+type Tool struct {
+	ToolID      string `json:"tool_id"`
+	Server      string `json:"server"`
+	Name        string `json:"tool"`
+	Description string `json:"description"`
+}
+
+// Corpus is a frozen, versioned set of tool definitions.
+type Corpus struct {
+	Version string `json:"version"`
+	Tools   []Tool `json:"tools"`
+}
+
+// LoadCorpus reads a frozen corpus snapshot (e.g. the Spec 065
+// corpus_v1.tools.json) from disk.
+func LoadCorpus(path string) (*Corpus, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read corpus %q: %w", path, err)
+	}
+	var c Corpus
+	if err := json.Unmarshal(data, &c); err != nil {
+		return nil, fmt.Errorf("parse corpus %q: %w", path, err)
+	}
+	if len(c.Tools) == 0 {
+		return nil, fmt.Errorf("corpus %q contains no tools", path)
+	}
+	return &c, nil
+}
+
+// Tokenizer wraps a tiktoken encoding for reproducible token estimation.
+type Tokenizer struct {
+	enc      *tiktoken.Tiktoken
+	encoding string
+}
+
+// NewTokenizer constructs a Tokenizer for the given tiktoken encoding name.
+func NewTokenizer(encoding string) (*Tokenizer, error) {
+	enc, err := tiktoken.GetEncoding(encoding)
+	if err != nil {
+		return nil, fmt.Errorf("load tiktoken encoding %q: %w", encoding, err)
+	}
+	return &Tokenizer{enc: enc, encoding: encoding}, nil
+}
+
+// Count returns the number of tokens in text.
+func (t *Tokenizer) Count(text string) int {
+	return len(t.enc.Encode(text, nil, nil))
+}
+
+// CountTool returns the context-token cost of a single tool definition.
+//
+// It counts the tool name and description only. Input JSON schemas are excluded
+// uniformly across every mode because the committed Spec 065 corpus snapshot
+// does not carry schemas. This is deliberately conservative for the headline
+// claim: upstream tools carry far larger schemas than mcpproxy's handful of
+// proxy tools, so excluding schemas *understates* the baseline and therefore
+// understates the measured savings. The live docker-compose run (README.md)
+// adds full schemas from GET /api/v1/tools for the exact headline number.
+func (t *Tokenizer) CountTool(tl Tool) int {
+	return t.Count(tl.Name + "\n" + tl.Description)
+}
+
+func (t *Tokenizer) countTools(tools []Tool) int {
+	total := 0
+	for _, tl := range tools {
+		total += t.CountTool(tl)
+	}
+	return total
+}
+
+// ModeResult is the per-mode context-cost outcome.
+type ModeResult struct {
+	Mode         string  `json:"mode"`
+	ContextTools int     `json:"context_tools"`
+	Tokens       int     `json:"tokens"`
+	SavingsRatio float64 `json:"savings_vs_baseline"`
+}
+
+// Report is the full token-reduction benchmark result.
+type Report struct {
+	Encoding      string       `json:"encoding"`
+	CorpusVersion string       `json:"corpus_version"`
+	CorpusTools   int          `json:"corpus_tools"`
+	Modes         []ModeResult `json:"modes"`
+	Notes         []string     `json:"notes"`
+}
+
+// ComputeReport computes the per-mode context-token cost over the corpus and the
+// savings of each proxy mode versus the baseline (all tools loaded directly).
+func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report {
+	baseTokens := tk.countTools(corpus.Tools)
+
+	rtTools := ProxyToolsForMode(ModeRetrieveTools)
+	ceTools := ProxyToolsForMode(ModeCodeExecution)
+
+	savings := func(tokens int) float64 {
+		if baseTokens == 0 {
+			return 0
+		}
+		return 1.0 - float64(tokens)/float64(baseTokens)
+	}
+
+	rtTokens := tk.countTools(rtTools)
+	ceTokens := tk.countTools(ceTools)
+
+	return &Report{
+		Encoding:      tk.encoding,
+		CorpusVersion: corpus.Version,
+		CorpusTools:   len(corpus.Tools),
+		Modes: []ModeResult{
+			{Mode: ModeBaseline, ContextTools: len(corpus.Tools), Tokens: baseTokens, SavingsRatio: 0},
+			{Mode: ModeRetrieveTools, ContextTools: len(rtTools), Tokens: rtTokens, SavingsRatio: savings(rtTokens)},
+			{Mode: ModeCodeExecution, ContextTools: len(ceTools), Tokens: ceTokens, SavingsRatio: savings(ceTokens)},
+		},
+		Notes: []string{
+			"Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.",
+			"Counts tool name + description only; JSON input schemas are excluded uniformly, which understates the baseline and is therefore conservative for the savings claim.",
+			"Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.",
+		},
+	}
+}
diff --git a/bench/tokens_test.go b/bench/tokens_test.go
new file mode 100644
index 00000000..7265bd3e
--- /dev/null
+++ b/bench/tokens_test.go
@@ -0,0 +1,153 @@
+package bench
+
+import (
+	"path/filepath"
+	"testing"
+)
+
+// repoCorpus is the committed Spec 065 frozen corpus, reused here as the
+// benchmark's tool universe (45 tools, 7 no-auth reference servers).
+const repoCorpus = "../specs/065-evaluation-foundation/datasets/corpus_v1.tools.json"
+
+func newTestTokenizer(t *testing.T) *Tokenizer {
+	t.Helper()
+	tk, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		t.Fatalf("NewTokenizer: %v", err)
+	}
+	return tk
+}
+
+func TestTokenizer_DeterministicAndPositive(t *testing.T) {
+	tk := newTestTokenizer(t)
+	text := "Fetches a URL from the internet and extracts its contents as markdown."
+	a := tk.Count(text)
+	b := tk.Count(text)
+	if a != b {
+		t.Fatalf("tokenizer not deterministic: %d != %d", a, b)
+	}
+	if a <= 0 {
+		t.Fatalf("expected positive token count, got %d", a)
+	}
+}
+
+func TestProxyToolsForMode(t *testing.T) {
+	rt := ProxyToolsForMode(ModeRetrieveTools)
+	if len(rt) == 0 {
+		t.Fatal("retrieve_tools mode exposes no proxy tools")
+	}
+	// retrieve_tools mode must expose the discovery tool + the call_tool variants.
+	want := map[string]bool{
+		"retrieve_tools":        false,
+		"call_tool_read":        false,
+		"call_tool_write":       false,
+		"call_tool_destructive": false,
+	}
+	for _, tl := range rt {
+		if _, ok := want[tl.Name]; ok {
+			want[tl.Name] = true
+		}
+	}
+	for name, found := range want {
+		if !found {
+			t.Errorf("retrieve_tools mode missing expected proxy tool %q", name)
+		}
+	}
+
+	ce := ProxyToolsForMode(ModeCodeExecution)
+	var hasCodeExec, hasRetrieve bool
+	for _, tl := range ce {
+		switch tl.Name {
+		case "code_execution":
+			hasCodeExec = true
+		case "retrieve_tools":
+			hasRetrieve = true
+		}
+	}
+	if !hasCodeExec || !hasRetrieve {
+		t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce))
+	}
+}
+
+func TestComputeReport_SavingsAreReal(t *testing.T) {
+	tk := newTestTokenizer(t)
+	corpus, err := LoadCorpus(filepath.Clean(repoCorpus))
+	if err != nil {
+		t.Fatalf("LoadCorpus: %v", err)
+	}
+	if len(corpus.Tools) < 40 {
+		t.Fatalf("expected the frozen corpus to have ~45 tools, got %d", len(corpus.Tools))
+	}
+
+	rep := ComputeReport(tk, corpus)
+
+	modes := map[string]ModeResult{}
+	for _, m := range rep.Modes {
+		modes[m.Mode] = m
+	}
+
+	base, ok := modes[ModeBaseline]
+	if !ok {
+		t.Fatal("report missing baseline mode")
+	}
+	if base.SavingsRatio != 0 {
+		t.Errorf("baseline savings must be 0, got %v", base.SavingsRatio)
+	}
+	if base.Tokens <= 0 {
+		t.Fatalf("baseline tokens must be positive, got %d", base.Tokens)
+	}
+
+	rt := modes[ModeRetrieveTools]
+	ce := modes[ModeCodeExecution]
+
+	// The whole product thesis: discovery/orchestration modes load far fewer
+	// tokens into context than loading every upstream tool directly.
+	if rt.Tokens >= base.Tokens {
+		t.Errorf("retrieve_tools (%d) should use fewer tokens than baseline (%d)", rt.Tokens, base.Tokens)
+	}
+	if ce.Tokens >= base.Tokens {
+		t.Errorf("code_execution (%d) should use fewer tokens than baseline (%d)", ce.Tokens, base.Tokens)
+	}
+
+	// Savings ratio must be in (0,1) and match the arithmetic.
+	wantRT := 1.0 - float64(rt.Tokens)/float64(base.Tokens)
+	if diff := rt.SavingsRatio - wantRT; diff > 1e-9 || diff < -1e-9 {
+		t.Errorf("retrieve_tools savings ratio %v != computed %v", rt.SavingsRatio, wantRT)
+	}
+	if rt.SavingsRatio <= 0 || rt.SavingsRatio >= 1 {
+		t.Errorf("retrieve_tools savings ratio out of (0,1): %v", rt.SavingsRatio)
+	}
+}
+
+func TestComputeReport_BaselineMonotonic(t *testing.T) {
+	tk := newTestTokenizer(t)
+	full := &Corpus{Version: "test", Tools: []Tool{
+		{ToolID: "a:1", Server: "a", Name: "one", Description: "alpha tool that does something useful"},
+		{ToolID: "b:2", Server: "b", Name: "two", Description: "beta tool that does something else useful"},
+		{ToolID: "c:3", Server: "c", Name: "three", Description: "gamma tool with a longer description for token weight"},
+	}}
+	fewer := &Corpus{Version: "test", Tools: full.Tools[:1]}
+
+	big := ComputeReport(tk, full)
+	small := ComputeReport(tk, fewer)
+
+	baseOf := func(r *Report) int {
+		for _, m := range r.Modes {
+			if m.Mode == ModeBaseline {
+				return m.Tokens
+			}
+		}
+		return -1
+	}
+	if baseOf(big) <= baseOf(small) {
+		t.Errorf("more tools must mean more baseline tokens: %d <= %d", baseOf(big), baseOf(small))
+	}
+}
+
+func toolNames(ts []Tool) []string {
+	out := make([]string, len(ts))
+	for i, t := range ts {
+		out[i] = t.Name
+	}
+	return out
+}

From 732b29c703e7ee388fbdf2d665bcbbfbf7e2f473 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 15:42:49 +0300
Subject: [PATCH 2/3] fix(bench): drop stale line numbers from provenance; add
 WriteReports smoke test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KimiReviewer finding 2: code_execution is at line 626 in mcp.go at 89f06b5c,
not 675 as claimed. Line numbers drift with unrelated edits and the actual
function names are the stable identifier — remove all line numbers from the
provenance comment to prevent future rot.

KimiReviewer finding 3: add TestWriteReports_SmokeTest covering WriteReports
output (JSON round-trips to Report, HTML is non-empty and contains all mode
names). All 5 tests pass; golangci-lint v2 clean.

Related #MCP-42

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 bench/proxy_tools_v1.json |  2 +-
 bench/tokens_test.go      | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json
index 26fc14a8..841770e8 100644
--- a/bench/proxy_tools_v1.json
+++ b/bench/proxy_tools_v1.json
@@ -1,6 +1,6 @@
 {
   "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).",
-  "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools (mcp.go:561), call_tool_read/write/destructive variant descriptions (mcp.go:490-528), read_cache (mcp.go:605), code_execution (mcp.go:675). Captured verbatim at origin/main 89f06b5c.",
+  "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools, call_tool_read/write/destructive variant descriptions, read_cache, code_execution. Descriptions captured verbatim at origin/main 89f06b5c. Line numbers omitted (they drift with unrelated edits; grep the function names to locate them).",
   "version": "proxy_v1",
   "tools": [
     {
diff --git a/bench/tokens_test.go b/bench/tokens_test.go
index 7265bd3e..f05a3570 100644
--- a/bench/tokens_test.go
+++ b/bench/tokens_test.go
@@ -1,6 +1,9 @@
 package bench
 
 import (
+	"bytes"
+	"encoding/json"
+	"os"
 	"path/filepath"
 	"testing"
 )
@@ -144,6 +147,47 @@ func TestComputeReport_BaselineMonotonic(t *testing.T) {
 	}
 }
 
+func TestWriteReports_SmokeTest(t *testing.T) {
+	tk := newTestTokenizer(t)
+	corpus := &Corpus{Version: "test", Tools: []Tool{
+		{ToolID: "a:1", Server: "a", Name: "tool_a", Description: "does something"},
+	}}
+	rep := ComputeReport(tk, corpus)
+
+	dir := t.TempDir()
+	jsonPath, htmlPath, err := rep.WriteReports(dir)
+	if err != nil {
+		t.Fatalf("WriteReports: %v", err)
+	}
+
+	// JSON must parse back to a Report with the right corpus version.
+	data, err := os.ReadFile(jsonPath)
+	if err != nil {
+		t.Fatalf("read json: %v", err)
+	}
+	var got Report
+	if err := json.Unmarshal(data, &got); err != nil {
+		t.Fatalf("unmarshal json: %v", err)
+	}
+	if got.CorpusVersion != "test" {
+		t.Errorf("corpus version = %q, want %q", got.CorpusVersion, "test")
+	}
+
+	// HTML must be non-empty and contain the mode names.
+	html, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("read html: %v", err)
+	}
+	if len(html) < 100 {
+		t.Fatalf("dashboard.html too short (%d bytes)", len(html))
+	}
+	for _, mode := range []string{ModeBaseline, ModeRetrieveTools, ModeCodeExecution} {
+		if !bytes.Contains(html, []byte(mode)) {
+			t.Errorf("dashboard.html missing mode %q", mode)
+		}
+	}
+}
+
 func toolNames(ts []Tool) []string {
 	out := make([]string, len(ts))
 	for i, t := range ts {

From 9a92d7145c14113103d2a972c32f555b6c68b469 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 20:22:00 +0300
Subject: [PATCH 3/3] fix(bench): derive per-mode tool catalog from live server
 builders incl. management tools (MCP-3161)

The token-reduction benchmark scored only 6 hand-maintained proxy tools and
omitted the shared management tool set (upstream_servers, quarantine_security,
search_servers, list_registries) that both routing modes append via
buildManagementTools. That undercounted the proxy-mode context cost and
inflated the headline savings (Codex finding on PR #747).

Replace bench/proxy_tools_v1.json with server.ProxyModeToolDefs, which builds
the catalog from the live builders (buildCallToolModeTools /
buildCodeExecModeTools in internal/server/mcp_routing.go) so it can never drift
from production and always reflects the tools the agent actually sees. This
also fixes a second drift: the fixture's retrieve_tools descriptions did not
match the per-mode builder descriptions.

Corrected figures over the 45-tool Spec 065 corpus (name+description only):
retrieve_tools ~17% (10 tools), code_execution ~43% (6 tools). Updated README
and notes; the schema-exclusion claim is no longer unambiguously conservative
now that large-schema management tools are in the proxy cost.

Tests: bench asserts both modes include the 4 management tools; internal/server
pins ProxyModeToolDefs to the builders so the catalog can't silently drift.

Related #747
---
 bench/README.md                      | 54 ++++++++++++++++----
 bench/proxy_tools_v1.json            | 43 ----------------
 bench/proxytools.go                  | 73 ++++++++++++----------------
 bench/tokens.go                      | 14 +++---
 bench/tokens_test.go                 | 21 ++++++++
 internal/server/bench_export.go      | 57 ++++++++++++++++++++++
 internal/server/bench_export_test.go | 64 ++++++++++++++++++++++++
 7 files changed, 225 insertions(+), 101 deletions(-)
 delete mode 100644 bench/proxy_tools_v1.json
 create mode 100644 internal/server/bench_export.go
 create mode 100644 internal/server/bench_export_test.go

diff --git a/bench/README.md b/bench/README.md
index cd7227a6..c99a3868 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -13,10 +13,18 @@ wired to upstream MCP tools.
 | Mode | What the agent sees in context | mcpproxy server |
 |------|--------------------------------|-----------------|
 | `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) |
-| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache`; tools found on demand via BM25 | `callToolServer` |
-| `code_execution` | `code_execution` + `retrieve_tools`; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` |
+| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache` + `code_execution` + management tools; tools found on demand via BM25 | `callToolServer` |
+| `code_execution` | `code_execution` + `retrieve_tools` + management tools; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` |
 
-(Mode → exposed tools mirrors `internal/server/mcp.go`.)
+Both proxy modes also append the shared **management tool set** —
+`upstream_servers`, `quarantine_security`, `search_servers`, `list_registries`
+— that the live routing-mode servers expose. These count against the proxy
+context cost: omitting them undercounts that cost and inflates the savings.
+
+The per-mode catalog is **derived directly from the live tool builders**
+(`buildCallToolModeTools` / `buildCodeExecModeTools` in
+`internal/server/mcp_routing.go`, via `server.ProxyModeToolDefs`), so it can
+never drift from production.
 
 ## What ships today (deterministic, offline)
 
@@ -33,6 +41,23 @@ reports the savings of each proxy mode versus the baseline. Output: a
 `report.json` and a self-contained `dashboard.html` in `bench/results/`
 (gitignored).
 
+#### Current deterministic result
+
+Over the 45-tool Spec 065 reference corpus, counting **tool name + description
+only** (schemas excluded uniformly — see limitations), `cl100k_base`:
+
+| Mode | Context tools | Tokens | Savings vs. baseline |
+|------|---------------|--------|----------------------|
+| `baseline` | 45 | 1730 | — |
+| `retrieve_tools` | 10 | 1431 | **~17%** |
+| `code_execution` | 6 | 986 | **~43%** |
+
+These are deliberately modest: the proxy context here is the *full* per-mode
+tool set (discovery + call-tool variants + management tools), and the corpus is
+small. Savings approach 100% asymptotically as the upstream tool count rises (the
+baseline grows linearly while the proxy context stays fixed) — always quote the
+corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`.
+
 ### Scoring rubric — token reduction
 
 - **Tool universe**: the frozen Spec 065 snapshot
@@ -43,6 +68,11 @@ reports the savings of each proxy mode versus the baseline. Output: a
   (already a repo dependency). It is a **model-agnostic estimator**; exact
   counts for a specific pinned model (e.g. Claude) will differ, but the
   *relative* savings between modes are stable.
+- **Proxy-mode tools**: the *complete* per-mode catalog, derived from the live
+  server builders — discovery, the call-tool variants, `code_execution`, **and
+  the shared management tool set** (`upstream_servers`, `quarantine_security`,
+  `search_servers`, `list_registries`). No tool the agent actually sees is
+  dropped from the proxy cost (input schemas aside — see the next bullet).
 - **Cost of a tool**: `name + "\n" + description`. JSON input schemas are
   excluded **uniformly** across all modes (the committed corpus snapshot does
   not carry schemas).
@@ -50,10 +80,13 @@ reports the savings of each proxy mode versus the baseline. Output: a
 
 ### Known limitations (read before quoting a number)
 
-- **Schemas excluded → conservative.** Upstream tools carry far larger input
-  schemas than mcpproxy's handful of proxy tools, so excluding schemas
-  *understates* the baseline and therefore *understates* the savings. The live
-  run below adds full schemas for the exact headline number.
+- **Schemas excluded — the bias direction is ambiguous.** Input schemas are
+  dropped from *both* sides. The 45 baseline tools lose their schemas, but so do
+  the proxy modes' management tools (e.g. `upstream_servers` carries a large
+  multi-field schema). So the name+description-only number is **not**
+  unambiguously conservative — it is its own well-defined metric. The live run
+  below adds full schemas from `GET /api/v1/tools` for the exact headline
+  number; quote that for marketing, not this offline estimate.
 - **Savings scale with tool count.** The 45-tool reference corpus is small; real
   deployments expose hundreds–thousands of tools, where the baseline grows
   linearly and the proxy context stays fixed, so savings approach the asymptote.
@@ -84,8 +117,11 @@ rather than landed here:
   (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively
   reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch,
   time, sequential-thinking).
-- Proxy tool definitions: `bench/proxy_tools_v1.json`, captured verbatim from
-  `internal/server/mcp.go` (provenance recorded in the file).
+- Proxy + management tool definitions: derived at run time from the live server
+  tool builders (`internal/server/mcp_routing.go` →
+  `buildCallToolModeTools` / `buildCodeExecModeTools`, exposed via
+  `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the
+  benchmark cannot drift from the tools the proxy actually serves.
 
 ## Reproducible live run (skeleton)
 
diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json
deleted file mode 100644
index 841770e8..00000000
--- a/bench/proxy_tools_v1.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-  "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).",
-  "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools, call_tool_read/write/destructive variant descriptions, read_cache, code_execution. Descriptions captured verbatim at origin/main 89f06b5c. Line numbers omitted (they drift with unrelated edits; grep the function names to locate them).",
-  "version": "proxy_v1",
-  "tools": [
-    {
-      "tool_id": "mcpproxy:retrieve_tools",
-      "tool": "retrieve_tools",
-      "description": "🔍 CALL THIS FIRST to discover relevant tools! This is the primary tool discovery mechanism that searches across ALL upstream MCP servers using intelligent BM25 full-text search. Always use this before attempting to call any specific tools. Use natural language to describe what you want to accomplish (e.g., 'create GitHub repository', 'query database', 'weather forecast'). Results include 'annotations' (tool behavior hints like destructiveHint) and 'call_with' recommendation indicating which tool variant to use (call_tool_read/write/destructive). Then use the recommended variant with an 'intent' parameter. NOTE: Quarantined servers are excluded from search results for security. Use 'quarantine_security' tool to examine and manage quarantined servers. TO ADD NEW SERVERS: Use 'list_registries' then 'search_servers' to find and add new MCP servers.",
-      "modes": ["retrieve_tools", "code_execution"]
-    },
-    {
-      "tool_id": "mcpproxy:call_tool_read",
-      "tool": "call_tool_read",
-      "description": "Execute a READ-ONLY tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: search, query, list, get, fetch, find, check, view, read, show, describe, lookup, retrieve, browse, explore, discover, scan, inspect, analyze, examine, validate, verify. Examples: search_files, get_user, list_repositories, query_database, find_issues, check_status. This is the DEFAULT choice when unsure - most tools are read-only.",
-      "modes": ["retrieve_tools"]
-    },
-    {
-      "tool_id": "mcpproxy:call_tool_write",
-      "tool": "call_tool_write",
-      "description": "Execute a STATE-MODIFYING tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: create, update, modify, add, set, send, edit, change, write, post, put, patch, insert, upload, submit, assign, configure, enable, register, subscribe, publish, move, copy, rename, merge. Examples: create_issue, update_file, send_message, add_comment, set_status, edit_page. Use only when explicitly modifying state.",
-      "modes": ["retrieve_tools"]
-    },
-    {
-      "tool_id": "mcpproxy:call_tool_destructive",
-      "tool": "call_tool_destructive",
-      "description": "Execute a DESTRUCTIVE tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: delete, remove, drop, revoke, disable, destroy, purge, reset, clear, unsubscribe, cancel, terminate, close, archive, ban, block, disconnect, kill, wipe, truncate, force, hard. Examples: delete_repo, remove_user, drop_table, revoke_access, clear_cache, terminate_session. Use for irreversible or high-impact operations.",
-      "modes": ["retrieve_tools"]
-    },
-    {
-      "tool_id": "mcpproxy:read_cache",
-      "tool": "read_cache",
-      "description": "Retrieve paginated data when mcpproxy indicates a tool response was truncated. Use the cache key provided in truncation messages to access the complete dataset with pagination.",
-      "modes": ["retrieve_tools"]
-    },
-    {
-      "tool_id": "mcpproxy:code_execution",
-      "tool": "code_execution",
-      "description": "Execute JavaScript or TypeScript code that orchestrates multiple upstream MCP tools in a single request. Use this when you need to combine results from 2+ tools, implement conditional logic, loops, or data transformations that would require multiple round-trips otherwise.\n\n**When to use**: Multi-step workflows with data transformation, conditional logic, error handling, or iterating over results.\n**When NOT to use**: Single tool calls (use call_tool directly), long-running operations (>2 minutes).\n\n**Available in code**:\n- `input` global: Your input data passed via the 'input' parameter\n- `call_tool(serverName, toolName, args)`: Call upstream tools (returns {ok, result} or {ok, error})\n- Modern JavaScript (ES2020+): arrow functions, const/let, template literals, destructuring, classes, for-of, optional chaining (?.), nullish coalescing (??), spread/rest, Promises, Symbols, Map/Set, Proxy/Reflect (no require(), filesystem, or network access)\n\n**TypeScript support**: Set `language: \"typescript\"` to write TypeScript code with type annotations, interfaces, enums, and generics. Types are automatically stripped before execution.\n\n**Important runtime rules**:\n- `call_tool` is strictly SYNCHRONOUS. Do not use `await`.\n- Upstream tools usually return an MCP content array. To parse JSON results: `const data = JSON.parse(res.result.content[0].text);`\n- The last evaluated expression in your script is automatically returned as the final output.\n\n**Security**: Sandboxed execution with timeout enforcement. Respects existing quarantine and server restrictions.",
-      "modes": ["code_execution"]
-    }
-  ]
-}
diff --git a/bench/proxytools.go b/bench/proxytools.go
index 8191d033..dda5edd4 100644
--- a/bench/proxytools.go
+++ b/bench/proxytools.go
@@ -1,53 +1,40 @@
 package bench
 
 import (
-	_ "embed"
-	"encoding/json"
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/server"
 )
 
-//go:embed proxy_tools_v1.json
-var proxyToolsJSON []byte
-
-// proxyTool is a built-in mcpproxy tool definition plus the routing modes that
-// expose it in the agent's context.
-type proxyTool struct {
-	ToolID      string   `json:"tool_id"`
-	Name        string   `json:"tool"`
-	Description string   `json:"description"`
-	Modes       []string `json:"modes"`
-}
-
-type proxyToolFixture struct {
-	Version string      `json:"version"`
-	Tools   []proxyTool `json:"tools"`
-}
-
-var proxyTools proxyToolFixture
-
-func init() {
-	if err := json.Unmarshal(proxyToolsJSON, &proxyTools); err != nil {
-		// The fixture is embedded at build time; a parse failure is a build/test
-		// bug, not a runtime condition.
-		panic("bench: invalid embedded proxy_tools_v1.json: " + err.Error())
+// ProxyToolsForMode returns the built-in mcpproxy proxy + management tool
+// definitions that occupy the agent's context window in the given routing mode.
+// Any mode other than ModeRetrieveTools or ModeCodeExecution yields nil.
+//
+// The catalog is derived directly from the live server tool builders
+// (internal/server.ProxyModeToolDefs → buildCallToolModeTools /
+// buildCodeExecModeTools in internal/server/mcp_routing.go), which append the
+// shared management tool set (upstream_servers, quarantine_security,
+// search_servers, list_registries) in both routing modes. Deriving from the
+// builders keeps the benchmark on the real per-mode context cost, so it cannot
+// re-introduce the undercount that inflated the headline savings (MCP-3161).
+func ProxyToolsForMode(mode string) []Tool {
+	var routingMode string
+	switch mode {
+	case ModeCodeExecution:
+		routingMode = config.RoutingModeCodeExecution
+	case ModeRetrieveTools:
+		routingMode = config.RoutingModeRetrieveTools
+	default:
+		return nil
 	}
-}
 
-// ProxyToolsForMode returns the built-in proxy tool definitions that occupy the
-// agent's context window in the given routing mode. Provenance for each
-// definition is in proxy_tools_v1.json (captured from internal/server/mcp.go).
-func ProxyToolsForMode(mode string) []Tool {
-	var out []Tool
-	for _, pt := range proxyTools.Tools {
-		for _, m := range pt.Modes {
-			if m == mode {
-				out = append(out, Tool{
-					ToolID:      pt.ToolID,
-					Name:        pt.Name,
-					Description: pt.Description,
-				})
-				break
-			}
-		}
+	defs := server.ProxyModeToolDefs(routingMode)
+	out := make([]Tool, 0, len(defs))
+	for _, d := range defs {
+		out = append(out, Tool{
+			ToolID:      "mcpproxy:" + d.Name,
+			Name:        d.Name,
+			Description: d.Description,
+		})
 	}
 	return out
 }
diff --git a/bench/tokens.go b/bench/tokens.go
index f903fbab..e61b3ed4 100644
--- a/bench/tokens.go
+++ b/bench/tokens.go
@@ -101,11 +101,12 @@ func (t *Tokenizer) Count(text string) int {
 //
 // It counts the tool name and description only. Input JSON schemas are excluded
 // uniformly across every mode because the committed Spec 065 corpus snapshot
-// does not carry schemas. This is deliberately conservative for the headline
-// claim: upstream tools carry far larger schemas than mcpproxy's handful of
-// proxy tools, so excluding schemas *understates* the baseline and therefore
-// understates the measured savings. The live docker-compose run (README.md)
-// adds full schemas from GET /api/v1/tools for the exact headline number.
+// does not carry schemas. Schemas are dropped from BOTH sides — the baseline's
+// upstream tools and the proxy modes' management tools (e.g. upstream_servers
+// carries a large multi-field schema) — so this is a well-defined
+// name+description-only metric, not an unambiguously conservative one. The live
+// docker-compose run (README.md) adds full schemas from GET /api/v1/tools for
+// the exact headline number.
 func (t *Tokenizer) CountTool(tl Tool) int {
 	return t.Count(tl.Name + "\n" + tl.Description)
 }
@@ -164,7 +165,8 @@ func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report {
 		},
 		Notes: []string{
 			"Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.",
-			"Counts tool name + description only; JSON input schemas are excluded uniformly, which understates the baseline and is therefore conservative for the savings claim.",
+			"Proxy-mode tools are the full per-mode catalog derived from the live server builders (internal/server.ProxyModeToolDefs), including the shared management tool set (upstream_servers, quarantine_security, search_servers, list_registries).",
+			"Counts tool name + description only; JSON input schemas are excluded uniformly from both the baseline and the proxy modes, so this is a name+description-only metric (not unambiguously conservative). See bench/README.md for the live run with full schemas.",
 			"Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.",
 		},
 	}
diff --git a/bench/tokens_test.go b/bench/tokens_test.go
index f05a3570..26296fe4 100644
--- a/bench/tokens_test.go
+++ b/bench/tokens_test.go
@@ -70,6 +70,27 @@ func TestProxyToolsForMode(t *testing.T) {
 	if !hasCodeExec || !hasRetrieve {
 		t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce))
 	}
+
+	// Both routing modes append the shared management tool set
+	// (internal/server/mcp_routing.go buildManagementTools). Omitting these
+	// undercounts the proxy-mode context cost and overstates the savings
+	// (MCP-3161 / Codex finding on PR #747). Assert they are present so the
+	// benchmark catalog can never silently drop them again.
+	mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"}
+	for _, mode := range []string{ModeRetrieveTools, ModeCodeExecution} {
+		got := map[string]bool{}
+		for _, tl := range ProxyToolsForMode(mode) {
+			got[tl.Name] = true
+			if tl.Description == "" {
+				t.Errorf("mode %s: tool %q has empty description", mode, tl.Name)
+			}
+		}
+		for _, name := range mgmt {
+			if !got[name] {
+				t.Errorf("mode %s: missing management tool %q (proxy context cost undercounted)", mode, name)
+			}
+		}
+	}
 }
 
 func TestComputeReport_SavingsAreReal(t *testing.T) {
diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go
new file mode 100644
index 00000000..95987195
--- /dev/null
+++ b/internal/server/bench_export.go
@@ -0,0 +1,57 @@
+package server
+
+import (
+	mcpserver "github.com/mark3labs/mcp-go/server"
+	"go.uber.org/zap"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+)
+
+// BenchProxyToolDef is a static built-in proxy/management tool definition
+// (name + description) exposed for the in-repo benchmark harness (bench/).
+//
+// The benchmark scores the per-mode context cost an agent pays for mcpproxy's
+// own tools. That cost MUST reflect every tool the live routing-mode servers
+// expose — including the shared management tool set (upstream_servers,
+// quarantine_security, search_servers, list_registries) that both modes append
+// via buildManagementTools — or the benchmark overstates the token savings
+// (MCP-3161 / Codex finding on PR #747).
+type BenchProxyToolDef struct {
+	Name        string
+	Description string
+}
+
+// ProxyModeToolDefs returns the static built-in proxy + management tool
+// definitions an agent sees in its context window for the given routing mode.
+// config.RoutingModeCodeExecution selects buildCodeExecModeTools; every other
+// value (including config.RoutingModeRetrieveTools) uses buildCallToolModeTools.
+//
+// These are the SAME builders the live server uses (mcp_routing.go), so the
+// benchmark catalog can never drift from production. Code execution is enabled
+// so the real code_execution tool description (not the disabled stub) is scored
+// — the code_execution routing mode only makes sense with the tool enabled.
+func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef {
+	p := &MCPProxyServer{
+		logger: zap.NewNop(),
+		config: &config.Config{
+			EnableCodeExecution: true,
+		},
+	}
+
+	var serverTools []mcpserver.ServerTool
+	switch routingMode {
+	case config.RoutingModeCodeExecution:
+		serverTools = p.buildCodeExecModeTools()
+	default: // retrieve_tools — the default routing mode
+		serverTools = p.buildCallToolModeTools()
+	}
+
+	defs := make([]BenchProxyToolDef, 0, len(serverTools))
+	for _, st := range serverTools {
+		defs = append(defs, BenchProxyToolDef{
+			Name:        st.Tool.Name,
+			Description: st.Tool.Description,
+		})
+	}
+	return defs
+}
diff --git a/internal/server/bench_export_test.go b/internal/server/bench_export_test.go
new file mode 100644
index 00000000..5dd63132
--- /dev/null
+++ b/internal/server/bench_export_test.go
@@ -0,0 +1,64 @@
+package server
+
+import (
+	"testing"
+
+	mcpserver "github.com/mark3labs/mcp-go/server"
+	"go.uber.org/zap"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
+)
+
+// TestProxyModeToolDefs_IncludesManagementTools guards the benchmark integrity
+// fix (MCP-3161): every routing mode exposes the shared management tool set, so
+// the benchmark catalog must include it or it undercounts the proxy-mode context
+// cost and overstates the savings.
+func TestProxyModeToolDefs_IncludesManagementTools(t *testing.T) {
+	mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"}
+	for _, mode := range []string{config.RoutingModeRetrieveTools, config.RoutingModeCodeExecution} {
+		defs := ProxyModeToolDefs(mode)
+		if len(defs) == 0 {
+			t.Fatalf("mode %s: no proxy tool defs", mode)
+		}
+		names := map[string]bool{}
+		for _, d := range defs {
+			names[d.Name] = true
+			if d.Description == "" {
+				t.Errorf("mode %s: tool %q has empty description", mode, d.Name)
+			}
+		}
+		for _, m := range mgmt {
+			if !names[m] {
+				t.Errorf("mode %s: missing management tool %q", mode, m)
+			}
+		}
+	}
+}
+
+// TestProxyModeToolDefs_MatchesBuilders pins ProxyModeToolDefs to the live tool
+// builders. If a mode's tool set changes in mcp_routing.go, the benchmark
+// catalog tracks it automatically and this test proves the coupling holds.
+func TestProxyModeToolDefs_MatchesBuilders(t *testing.T) {
+	p := &MCPProxyServer{
+		logger: zap.NewNop(),
+		config: &config.Config{EnableCodeExecution: true},
+	}
+	cases := map[string][]mcpserver.ServerTool{
+		config.RoutingModeRetrieveTools: p.buildCallToolModeTools(),
+		config.RoutingModeCodeExecution: p.buildCodeExecModeTools(),
+	}
+	for mode, builderTools := range cases {
+		defs := ProxyModeToolDefs(mode)
+		if len(defs) != len(builderTools) {
+			t.Fatalf("mode %s: ProxyModeToolDefs len %d != builder len %d", mode, len(defs), len(builderTools))
+		}
+		for i := range builderTools {
+			if defs[i].Name != builderTools[i].Tool.Name {
+				t.Errorf("mode %s: def[%d] name %q != builder %q", mode, i, defs[i].Name, builderTools[i].Tool.Name)
+			}
+			if defs[i].Description != builderTools[i].Tool.Description {
+				t.Errorf("mode %s: def[%d] description mismatch for %q", mode, i, defs[i].Name)
+			}
+		}
+	}
+}