From fb64c91b4ae275b6bef31ddffb3e72e712e20f97 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 15:06:25 +0300 Subject: [PATCH 1/3] feat(bench): token-reduction benchmark harness over frozen corpus (MCP-42) Ship the first, fully-deterministic slice of the roadmap-#19 benchmark: the token-reduction numbers behind the "massive token savings" claim. Reuses the frozen Spec 065 tool corpus (45 tools, 7 reference servers) as a versioned, non-drifting universe and tiktoken cl100k_base (already a dep) as a reproducible model-agnostic estimator. Compares the three routing modes' static context cost: - baseline (all upstream tools loaded directly) - retrieve_tools (BM25 discovery + call_tool variants) - code_execution (orchestration + retrieve_tools) over the corpus and reports per-mode savings. Real proxy tool defs are captured verbatim from internal/server/mcp.go into bench/proxy_tools_v1.json (provenance recorded). Emits report.json + a self-contained dashboard.html (gitignored; reports never committed, per Spec 065 CN-003). Conservative by construction: input schemas excluded uniformly understates the baseline, so measured savings (65.5% / 70.3% on the 45-tool corpus) are a floor. Methodology, limitations, and the scoped-but-not-yet-built follow-ups (live run with full schemas + accuracy/latency, LLM e2e, CI publish) are in bench/README.md. 
Related #MCP-42 Co-Authored-By: Paperclip --- bench/.gitignore | 2 + bench/README.md | 99 ++++++++++++++++++++++ bench/cmd/bench/main.go | 52 ++++++++++++ bench/docker-compose.yml | 37 +++++++++ bench/proxy_tools_v1.json | 43 ++++++++++ bench/proxytools.go | 53 ++++++++++++ bench/report.go | 105 +++++++++++++++++++++++ bench/tokens.go | 171 ++++++++++++++++++++++++++++++++++++++ bench/tokens_test.go | 153 ++++++++++++++++++++++++++++++++++ 9 files changed, 715 insertions(+) create mode 100644 bench/.gitignore create mode 100644 bench/README.md create mode 100644 bench/cmd/bench/main.go create mode 100644 bench/docker-compose.yml create mode 100644 bench/proxy_tools_v1.json create mode 100644 bench/proxytools.go create mode 100644 bench/report.go create mode 100644 bench/tokens.go create mode 100644 bench/tokens_test.go diff --git a/bench/.gitignore b/bench/.gitignore new file mode 100644 index 00000000..4baf56cf --- /dev/null +++ b/bench/.gitignore @@ -0,0 +1,2 @@ +# Benchmark run artifacts are never committed (Spec 065 CN-003). +results/ diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 00000000..cd7227a6 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,99 @@ +# mcpproxy benchmark harness + +The reproducible numbers behind mcpproxy's marketing claims — **token reduction**, +**discovery accuracy**, and **latency** — comparing three ways an agent can be +wired to upstream MCP tools. + +> Roadmap item #19 (MCP-42). In-repo (`bench/`), reproducible, intended to be +> refreshed on release. Reports are **never committed** (Spec 065 CN-003); only +> code, fixtures, and this methodology are versioned. 
+ +## The three modes + +| Mode | What the agent sees in context | mcpproxy server | +|------|--------------------------------|-----------------| +| `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) | +| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache`; tools found on demand via BM25 | `callToolServer` | +| `code_execution` | `code_execution` + `retrieve_tools`; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` | + +(The mode → exposed-tools mapping mirrors `internal/server/mcp.go`.) + +## What ships today (deterministic, offline) + +The **token-reduction** measurement is fully deterministic and runs with no +network or LLM: + +```bash +go run ./bench/cmd/bench # run from the repo root; scores the committed Spec 065 corpus +go test ./bench/ # unit + invariant tests +``` + +It counts the context-token cost of each mode over a **frozen tool corpus** and +reports the savings of each proxy mode versus the baseline. Output: a +`report.json` and a self-contained `dashboard.html` in `bench/results/` +(gitignored). + +### Scoring rubric — token reduction + +- **Tool universe**: the frozen Spec 065 snapshot + `specs/065-evaluation-foundation/datasets/corpus_v1.tools.json` — 45 tools + across 7 no-auth reference servers. Frozen + versioned so scoring never runs + against a drifting corpus (CN-002). +- **Tokenizer**: `tiktoken cl100k_base`, a widely-used reproducible BPE + (already a repo dependency). It is a **model-agnostic estimator**; exact + counts for a specific pinned model (e.g. Claude) will differ, but the + *relative* savings between modes are stable. +- **Cost of a tool**: `name + "\n" + description`. JSON input schemas are + excluded **uniformly** across all modes (the committed corpus snapshot does + not carry schemas). +- **Savings** for a mode `m`: `1 - tokens(m) / tokens(baseline)`. 
+ +### Known limitations (read before quoting a number) + +- **Schemas excluded → conservative.** Upstream tools carry far larger input + schemas than mcpproxy's handful of proxy tools, so excluding schemas + *understates* the baseline and therefore *understates* the savings. The planned + live run (see the follow-ups below) will add full schemas for the exact headline number. +- **Savings scale with tool count.** The 45-tool reference corpus is small; real + deployments expose hundreds–thousands of tools, where the baseline grows + linearly and the proxy context stays fixed, so savings approach the asymptote. + Quote the corpus size alongside any percentage. +- **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer + for the headline model is tracked as a follow-up (see "What is scoped but not yet built" below). + +## What is scoped but not yet built (follow-ups) + +These require decisions and/or other roles, so they are tracked as child issues +rather than landed here: + +- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the + Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull + `GET /api/v1/tools` for exact schemas, and: + - **Accuracy**: replay the Spec 065 retrieval golden set + (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k / + MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer. + - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the + fixed cost of loading all tools. +- **End-to-end task success with a pinned LLM** — requires a pinned model + an + LLM-call budget; this is the only part that incurs spend. +- **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane. + +## Dataset sources & provenance + +- Tool corpus + retrieval golden set: Spec 065 frozen datasets + (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively + reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch, + time, sequential-thinking). 
+- Proxy tool definitions: `bench/proxy_tools_v1.json`, captured verbatim from + `internal/server/mcp.go` (provenance recorded in the file). + +## Reproducible live run (skeleton) + +`docker-compose.yml` boots mcpproxy over the frozen reference-server config so +the corpus and live tool list are reproducible across machines. Wiring the live +accuracy/latency scorers into it is the follow-up above. + +## Reviewer contact + +Methodology questions / disputes: open an issue in `smart-mcp-proxy/mcpproxy-go` +and tag the maintainers, or comment on the roadmap benchmark ticket (MCP-42). diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go new file mode 100644 index 00000000..a5e924b2 --- /dev/null +++ b/bench/cmd/bench/main.go @@ -0,0 +1,52 @@ +// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool +// corpus and writes a JSON report plus a static HTML dashboard. +// +// Usage: +// +// go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME] +// +// With no flags it scores the committed Spec 065 frozen corpus and writes the +// reports to bench/results/ (gitignored — reports are never committed, per the +// Spec 065 CN-003 repo rule). 
+package main + +import ( + "flag" + "fmt" + "log" + "os" + + "github.com/smart-mcp-proxy/mcpproxy-go/bench" +) + +func main() { + corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot") + outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html") + encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name") + flag.Parse() + + tk, err := bench.NewTokenizer(*encoding) + if err != nil { + log.Fatalf("bench: %v", err) + } + corpus, err := bench.LoadCorpus(*corpusPath) + if err != nil { + log.Fatalf("bench: %v", err) + } + + report := bench.ComputeReport(tk, corpus) + jsonPath, htmlPath, err := report.WriteReports(*outDir) + if err != nil { + log.Fatalf("bench: %v", err) + } + + fmt.Fprintf(os.Stdout, "mcpproxy token-reduction benchmark (corpus %s, %d tools, %s)\n", report.CorpusVersion, report.CorpusTools, report.Encoding) + for _, m := range report.Modes { + if m.Mode == bench.ModeBaseline { + fmt.Fprintf(os.Stdout, " %-16s %6d tokens (%d tools) baseline\n", m.Mode, m.Tokens, m.ContextTools) + continue + } + fmt.Fprintf(os.Stdout, " %-16s %6d tokens (%d tools) %.1f%% fewer tokens\n", m.Mode, m.Tokens, m.ContextTools, m.SavingsRatio*100) + } + fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath) +} diff --git a/bench/docker-compose.yml b/bench/docker-compose.yml new file mode 100644 index 00000000..7c420947 --- /dev/null +++ b/bench/docker-compose.yml @@ -0,0 +1,37 @@ +# Reproducible benchmark substrate (skeleton). +# +# Boots mcpproxy over the frozen Spec 065 reference-server config so the tool +# corpus and live tool list are identical across machines. The live +# accuracy/latency scorers (see bench/README.md "follow-ups") attach to this. 
+# +# Usage: +# docker compose -f bench/docker-compose.yml up --build +# # then, against the running proxy on 127.0.0.1:8092: +# # GET /api/v1/tools -> full tool defs (with schemas) for the live token run +# # retrieve_tools -> Recall@k accuracy over retrieval_golden_v1.json +# +# The committed corpus_v1 snapshot was frozen from exactly this config +# (specs/065-evaluation-foundation/datasets/README.md), so a live snapshot here +# reproduces it (modulo upstream-server version drift — pin images before +# publishing headline numbers). +services: + mcpproxy: + build: + context: .. + dockerfile: Dockerfile + command: + - serve + - --config=/data/snapshot-servers.config.json + - --data-dir=/data/state + - --listen=0.0.0.0:8092 + environment: + MCPPROXY_API_KEY: eval-corpus-snapshot + ports: + - "127.0.0.1:8092:8092" + volumes: + # The frozen, servable reference-server config (7 no-auth servers). + - ../specs/065-evaluation-foundation/datasets/snapshot-servers.config.json:/data/snapshot-servers.config.json:ro + - bench-state:/data/state + +volumes: + bench-state: diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json new file mode 100644 index 00000000..26fc14a8 --- /dev/null +++ b/bench/proxy_tools_v1.json @@ -0,0 +1,43 @@ +{ + "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).", + "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools (mcp.go:561), call_tool_read/write/destructive variant descriptions (mcp.go:490-528), read_cache (mcp.go:605), code_execution (mcp.go:675). Captured verbatim at origin/main 89f06b5c.", + "version": "proxy_v1", + "tools": [ + { + "tool_id": "mcpproxy:retrieve_tools", + "tool": "retrieve_tools", + "description": "🔍 CALL THIS FIRST to discover relevant tools! 
This is the primary tool discovery mechanism that searches across ALL upstream MCP servers using intelligent BM25 full-text search. Always use this before attempting to call any specific tools. Use natural language to describe what you want to accomplish (e.g., 'create GitHub repository', 'query database', 'weather forecast'). Results include 'annotations' (tool behavior hints like destructiveHint) and 'call_with' recommendation indicating which tool variant to use (call_tool_read/write/destructive). Then use the recommended variant with an 'intent' parameter. NOTE: Quarantined servers are excluded from search results for security. Use 'quarantine_security' tool to examine and manage quarantined servers. TO ADD NEW SERVERS: Use 'list_registries' then 'search_servers' to find and add new MCP servers.", + "modes": ["retrieve_tools", "code_execution"] + }, + { + "tool_id": "mcpproxy:call_tool_read", + "tool": "call_tool_read", + "description": "Execute a READ-ONLY tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: search, query, list, get, fetch, find, check, view, read, show, describe, lookup, retrieve, browse, explore, discover, scan, inspect, analyze, examine, validate, verify. Examples: search_files, get_user, list_repositories, query_database, find_issues, check_status. This is the DEFAULT choice when unsure - most tools are read-only.", + "modes": ["retrieve_tools"] + }, + { + "tool_id": "mcpproxy:call_tool_write", + "tool": "call_tool_write", + "description": "Execute a STATE-MODIFYING tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: create, update, modify, add, set, send, edit, change, write, post, put, patch, insert, upload, submit, assign, configure, enable, register, subscribe, publish, move, copy, rename, merge. 
Examples: create_issue, update_file, send_message, add_comment, set_status, edit_page. Use only when explicitly modifying state.", + "modes": ["retrieve_tools"] + }, + { + "tool_id": "mcpproxy:call_tool_destructive", + "tool": "call_tool_destructive", + "description": "Execute a DESTRUCTIVE tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: delete, remove, drop, revoke, disable, destroy, purge, reset, clear, unsubscribe, cancel, terminate, close, archive, ban, block, disconnect, kill, wipe, truncate, force, hard. Examples: delete_repo, remove_user, drop_table, revoke_access, clear_cache, terminate_session. Use for irreversible or high-impact operations.", + "modes": ["retrieve_tools"] + }, + { + "tool_id": "mcpproxy:read_cache", + "tool": "read_cache", + "description": "Retrieve paginated data when mcpproxy indicates a tool response was truncated. Use the cache key provided in truncation messages to access the complete dataset with pagination.", + "modes": ["retrieve_tools"] + }, + { + "tool_id": "mcpproxy:code_execution", + "tool": "code_execution", + "description": "Execute JavaScript or TypeScript code that orchestrates multiple upstream MCP tools in a single request. 
Use this when you need to combine results from 2+ tools, implement conditional logic, loops, or data transformations that would require multiple round-trips otherwise.\n\n**When to use**: Multi-step workflows with data transformation, conditional logic, error handling, or iterating over results.\n**When NOT to use**: Single tool calls (use call_tool directly), long-running operations (>2 minutes).\n\n**Available in code**:\n- `input` global: Your input data passed via the 'input' parameter\n- `call_tool(serverName, toolName, args)`: Call upstream tools (returns {ok, result} or {ok, error})\n- Modern JavaScript (ES2020+): arrow functions, const/let, template literals, destructuring, classes, for-of, optional chaining (?.), nullish coalescing (??), spread/rest, Promises, Symbols, Map/Set, Proxy/Reflect (no require(), filesystem, or network access)\n\n**TypeScript support**: Set `language: \"typescript\"` to write TypeScript code with type annotations, interfaces, enums, and generics. Types are automatically stripped before execution.\n\n**Important runtime rules**:\n- `call_tool` is strictly SYNCHRONOUS. Do not use `await`.\n- Upstream tools usually return an MCP content array. To parse JSON results: `const data = JSON.parse(res.result.content[0].text);`\n- The last evaluated expression in your script is automatically returned as the final output.\n\n**Security**: Sandboxed execution with timeout enforcement. Respects existing quarantine and server restrictions.", + "modes": ["code_execution"] + } + ] +} diff --git a/bench/proxytools.go b/bench/proxytools.go new file mode 100644 index 00000000..8191d033 --- /dev/null +++ b/bench/proxytools.go @@ -0,0 +1,53 @@ +package bench + +import ( + _ "embed" + "encoding/json" +) + +//go:embed proxy_tools_v1.json +var proxyToolsJSON []byte + +// proxyTool is a built-in mcpproxy tool definition plus the routing modes that +// expose it in the agent's context. 
+type proxyTool struct { + ToolID string `json:"tool_id"` + Name string `json:"tool"` + Description string `json:"description"` + Modes []string `json:"modes"` +} + +type proxyToolFixture struct { + Version string `json:"version"` + Tools []proxyTool `json:"tools"` +} + +var proxyTools proxyToolFixture + +func init() { + if err := json.Unmarshal(proxyToolsJSON, &proxyTools); err != nil { + // The fixture is embedded at build time; a parse failure is a build/test + // bug, not a runtime condition. + panic("bench: invalid embedded proxy_tools_v1.json: " + err.Error()) + } +} + +// ProxyToolsForMode returns the built-in proxy tool definitions that occupy the +// agent's context window in the given routing mode. Provenance for each +// definition is in proxy_tools_v1.json (captured from internal/server/mcp.go). +func ProxyToolsForMode(mode string) []Tool { + var out []Tool + for _, pt := range proxyTools.Tools { + for _, m := range pt.Modes { + if m == mode { + out = append(out, Tool{ + ToolID: pt.ToolID, + Name: pt.Name, + Description: pt.Description, + }) + break + } + } + } + return out +} diff --git a/bench/report.go b/bench/report.go new file mode 100644 index 00000000..0ccbee4f --- /dev/null +++ b/bench/report.go @@ -0,0 +1,105 @@ +package bench + +import ( + "encoding/json" + "fmt" + "html/template" + "os" + "path/filepath" +) + +// WriteJSON writes the report as indented JSON to path. +func (r *Report) WriteJSON(path string) error { + data, err := json.MarshalIndent(r, "", " ") + if err != nil { + return fmt.Errorf("marshal report: %w", err) + } + if err := os.WriteFile(path, append(data, '\n'), 0o644); err != nil { + return fmt.Errorf("write %q: %w", path, err) + } + return nil +} + +// WriteHTML renders the report as a self-contained static dashboard. The output +// is a single file with no external assets so it can be published as-is to a +// static host (CI release-tag publishing is tracked as a follow-up). 
+func (r *Report) WriteHTML(path string) error { + tmpl, err := template.New("dashboard").Funcs(template.FuncMap{ + "pct": func(f float64) string { return fmt.Sprintf("%.1f%%", f*100) }, + }).Parse(dashboardHTML) + if err != nil { + return fmt.Errorf("parse template: %w", err) + } + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create %q: %w", path, err) + } + defer f.Close() + if err := tmpl.Execute(f, r); err != nil { + return fmt.Errorf("render dashboard: %w", err) + } + return nil +} + +// WriteReports writes both report.json and dashboard.html into dir. +func (r *Report) WriteReports(dir string) (jsonPath, htmlPath string, err error) { + if err = os.MkdirAll(dir, 0o755); err != nil { + return "", "", fmt.Errorf("mkdir %q: %w", dir, err) + } + jsonPath = filepath.Join(dir, "report.json") + htmlPath = filepath.Join(dir, "dashboard.html") + if err = r.WriteJSON(jsonPath); err != nil { + return "", "", err + } + if err = r.WriteHTML(htmlPath); err != nil { + return "", "", err + } + return jsonPath, htmlPath, nil +} + +const dashboardHTML = ` + + + + +mcpproxy benchmark — token reduction + + + +

mcpproxy benchmark

+

Token cost of loading tools into an agent's context, by routing mode.

+

Corpus {{.CorpusVersion}} · {{.CorpusTools}} tools · encoding {{.Encoding}}

+ + + + + + {{range .Modes}} + + + + + + + {{end}} + +
ModeTools in contextContext tokensSavings vs. baseline
{{.Mode}}{{.ContextTools}}{{.Tokens}}{{if eq .Mode "baseline"}}—{{else}}{{pct .SavingsRatio}}{{end}}
+

Methodology notes

+ + + +` diff --git a/bench/tokens.go b/bench/tokens.go new file mode 100644 index 00000000..f903fbab --- /dev/null +++ b/bench/tokens.go @@ -0,0 +1,171 @@ +// Package bench is the mcpproxy benchmark harness (roadmap #19 / MCP-42). +// +// It produces the reproducible numbers behind mcpproxy's marketing claims — +// token reduction, discovery accuracy, and latency — by comparing three ways +// an agent can be wired to upstream MCP tools: +// +// - baseline: every upstream tool definition is loaded directly into the +// agent's context (no proxy discovery). +// - retrieve_tools: only mcpproxy's discovery + call_tool variants occupy the +// context; tools are found on demand via BM25 search. +// - code_execution: only code_execution + retrieve_tools occupy the context; +// the agent orchestrates many tools from sandboxed JS in one round-trip. +// +// The token-reduction measurement in this file is fully deterministic and +// offline: it counts the context cost of each mode over a frozen tool corpus +// using the tiktoken cl100k_base encoding (a reproducible, model-agnostic +// estimator). It reuses the Spec 065 frozen corpus +// (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json) as its tool +// universe so the benchmark scores a versioned, non-drifting snapshot (CN-002). +// +// Methodology, limitations, and the live (docker-compose) run that adds full +// JSON input schemas and end-to-end accuracy/latency are documented in +// bench/README.md. +package bench + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/pkoukk/tiktoken-go" +) + +// DefaultEncoding is the tiktoken encoding used for token estimation. cl100k_base +// is a widely-used, reproducible BPE; exact counts for a specific pinned model +// (e.g. Claude) will differ, but the *relative* savings between modes are stable. +const DefaultEncoding = "cl100k_base" + +// Routing modes the benchmark compares. 
The mode names mirror the mcpproxy +// MCP servers in internal/server/mcp.go (codeExecServer, callToolServer). +const ( + ModeBaseline = "baseline" + ModeRetrieveTools = "retrieve_tools" + ModeCodeExecution = "code_execution" +) + +// Tool is a single tool definition the benchmark scores token cost over. It +// matches the shape of both the Spec 065 corpus snapshot and the embedded +// proxy-tool fixture. +type Tool struct { + ToolID string `json:"tool_id"` + Server string `json:"server"` + Name string `json:"tool"` + Description string `json:"description"` +} + +// Corpus is a frozen, versioned set of tool definitions. +type Corpus struct { + Version string `json:"version"` + Tools []Tool `json:"tools"` +} + +// LoadCorpus reads a frozen corpus snapshot (e.g. the Spec 065 +// corpus_v1.tools.json) from disk. +func LoadCorpus(path string) (*Corpus, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read corpus %q: %w", path, err) + } + var c Corpus + if err := json.Unmarshal(data, &c); err != nil { + return nil, fmt.Errorf("parse corpus %q: %w", path, err) + } + if len(c.Tools) == 0 { + return nil, fmt.Errorf("corpus %q contains no tools", path) + } + return &c, nil +} + +// Tokenizer wraps a tiktoken encoding for reproducible token estimation. +type Tokenizer struct { + enc *tiktoken.Tiktoken + encoding string +} + +// NewTokenizer constructs a Tokenizer for the given tiktoken encoding name. +func NewTokenizer(encoding string) (*Tokenizer, error) { + enc, err := tiktoken.GetEncoding(encoding) + if err != nil { + return nil, fmt.Errorf("load tiktoken encoding %q: %w", encoding, err) + } + return &Tokenizer{enc: enc, encoding: encoding}, nil +} + +// Count returns the number of tokens in text. +func (t *Tokenizer) Count(text string) int { + return len(t.enc.Encode(text, nil, nil)) +} + +// CountTool returns the context-token cost of a single tool definition. +// +// It counts the tool name and description only. 
Input JSON schemas are excluded +// uniformly across every mode because the committed Spec 065 corpus snapshot +// does not carry schemas. This is deliberately conservative for the headline +// claim: upstream tools carry far larger schemas than mcpproxy's handful of +// proxy tools, so excluding schemas *understates* the baseline and therefore +// understates the measured savings. The live docker-compose run (README.md) +// adds full schemas from GET /api/v1/tools for the exact headline number. +func (t *Tokenizer) CountTool(tl Tool) int { + return t.Count(tl.Name + "\n" + tl.Description) +} + +func (t *Tokenizer) countTools(tools []Tool) int { + total := 0 + for _, tl := range tools { + total += t.CountTool(tl) + } + return total +} + +// ModeResult is the per-mode context-cost outcome. +type ModeResult struct { + Mode string `json:"mode"` + ContextTools int `json:"context_tools"` + Tokens int `json:"tokens"` + SavingsRatio float64 `json:"savings_vs_baseline"` +} + +// Report is the full token-reduction benchmark result. +type Report struct { + Encoding string `json:"encoding"` + CorpusVersion string `json:"corpus_version"` + CorpusTools int `json:"corpus_tools"` + Modes []ModeResult `json:"modes"` + Notes []string `json:"notes"` +} + +// ComputeReport computes the per-mode context-token cost over the corpus and the +// savings of each proxy mode versus the baseline (all tools loaded directly). 
+func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report { + baseTokens := tk.countTools(corpus.Tools) + + rtTools := ProxyToolsForMode(ModeRetrieveTools) + ceTools := ProxyToolsForMode(ModeCodeExecution) + + savings := func(tokens int) float64 { + if baseTokens == 0 { + return 0 + } + return 1.0 - float64(tokens)/float64(baseTokens) + } + + rtTokens := tk.countTools(rtTools) + ceTokens := tk.countTools(ceTools) + + return &Report{ + Encoding: tk.encoding, + CorpusVersion: corpus.Version, + CorpusTools: len(corpus.Tools), + Modes: []ModeResult{ + {Mode: ModeBaseline, ContextTools: len(corpus.Tools), Tokens: baseTokens, SavingsRatio: 0}, + {Mode: ModeRetrieveTools, ContextTools: len(rtTools), Tokens: rtTokens, SavingsRatio: savings(rtTokens)}, + {Mode: ModeCodeExecution, ContextTools: len(ceTools), Tokens: ceTokens, SavingsRatio: savings(ceTokens)}, + }, + Notes: []string{ + "Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.", + "Counts tool name + description only; JSON input schemas are excluded uniformly, which understates the baseline and is therefore conservative for the savings claim.", + "Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.", + }, + } +} diff --git a/bench/tokens_test.go b/bench/tokens_test.go new file mode 100644 index 00000000..7265bd3e --- /dev/null +++ b/bench/tokens_test.go @@ -0,0 +1,153 @@ +package bench + +import ( + "path/filepath" + "testing" +) + +// repoCorpus is the committed Spec 065 frozen corpus, reused here as the +// benchmark's tool universe (45 tools, 7 no-auth reference servers). 
+const repoCorpus = "../specs/065-evaluation-foundation/datasets/corpus_v1.tools.json" + +func newTestTokenizer(t *testing.T) *Tokenizer { + t.Helper() + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + t.Fatalf("NewTokenizer: %v", err) + } + return tk +} + +func TestTokenizer_DeterministicAndPositive(t *testing.T) { + tk := newTestTokenizer(t) + text := "Fetches a URL from the internet and extracts its contents as markdown." + a := tk.Count(text) + b := tk.Count(text) + if a != b { + t.Fatalf("tokenizer not deterministic: %d != %d", a, b) + } + if a <= 0 { + t.Fatalf("expected positive token count, got %d", a) + } +} + +func TestProxyToolsForMode(t *testing.T) { + rt := ProxyToolsForMode(ModeRetrieveTools) + if len(rt) == 0 { + t.Fatal("retrieve_tools mode exposes no proxy tools") + } + // retrieve_tools mode must expose the discovery tool + the call_tool variants. + want := map[string]bool{ + "retrieve_tools": false, + "call_tool_read": false, + "call_tool_write": false, + "call_tool_destructive": false, + } + for _, tl := range rt { + if _, ok := want[tl.Name]; ok { + want[tl.Name] = true + } + } + for name, found := range want { + if !found { + t.Errorf("retrieve_tools mode missing expected proxy tool %q", name) + } + } + + ce := ProxyToolsForMode(ModeCodeExecution) + var hasCodeExec, hasRetrieve bool + for _, tl := range ce { + switch tl.Name { + case "code_execution": + hasCodeExec = true + case "retrieve_tools": + hasRetrieve = true + } + } + if !hasCodeExec || !hasRetrieve { + t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce)) + } +} + +func TestComputeReport_SavingsAreReal(t *testing.T) { + tk := newTestTokenizer(t) + corpus, err := LoadCorpus(filepath.Clean(repoCorpus)) + if err != nil { + t.Fatalf("LoadCorpus: %v", err) + } + if len(corpus.Tools) < 40 { + t.Fatalf("expected the frozen corpus to have ~45 tools, got %d", len(corpus.Tools)) + } + + rep := ComputeReport(tk, corpus) + + modes := 
map[string]ModeResult{} + for _, m := range rep.Modes { + modes[m.Mode] = m + } + + base, ok := modes[ModeBaseline] + if !ok { + t.Fatal("report missing baseline mode") + } + if base.SavingsRatio != 0 { + t.Errorf("baseline savings must be 0, got %v", base.SavingsRatio) + } + if base.Tokens <= 0 { + t.Fatalf("baseline tokens must be positive, got %d", base.Tokens) + } + + rt := modes[ModeRetrieveTools] + ce := modes[ModeCodeExecution] + + // The whole product thesis: discovery/orchestration modes load far fewer + // tokens into context than loading every upstream tool directly. + if rt.Tokens >= base.Tokens { + t.Errorf("retrieve_tools (%d) should use fewer tokens than baseline (%d)", rt.Tokens, base.Tokens) + } + if ce.Tokens >= base.Tokens { + t.Errorf("code_execution (%d) should use fewer tokens than baseline (%d)", ce.Tokens, base.Tokens) + } + + // Savings ratio must be in (0,1) and match the arithmetic. + wantRT := 1.0 - float64(rt.Tokens)/float64(base.Tokens) + if diff := rt.SavingsRatio - wantRT; diff > 1e-9 || diff < -1e-9 { + t.Errorf("retrieve_tools savings ratio %v != computed %v", rt.SavingsRatio, wantRT) + } + if rt.SavingsRatio <= 0 || rt.SavingsRatio >= 1 { + t.Errorf("retrieve_tools savings ratio out of (0,1): %v", rt.SavingsRatio) + } +} + +func TestComputeReport_BaselineMonotonic(t *testing.T) { + tk := newTestTokenizer(t) + full := &Corpus{Version: "test", Tools: []Tool{ + {ToolID: "a:1", Server: "a", Name: "one", Description: "alpha tool that does something useful"}, + {ToolID: "b:2", Server: "b", Name: "two", Description: "beta tool that does something else useful"}, + {ToolID: "c:3", Server: "c", Name: "three", Description: "gamma tool with a longer description for token weight"}, + }} + fewer := &Corpus{Version: "test", Tools: full.Tools[:1]} + + big := ComputeReport(tk, full) + small := ComputeReport(tk, fewer) + + baseOf := func(r *Report) int { + for _, m := range r.Modes { + if m.Mode == ModeBaseline { + return m.Tokens + } + } + return 
-1 + } + if baseOf(big) <= baseOf(small) { + t.Errorf("more tools must mean more baseline tokens: %d <= %d", baseOf(big), baseOf(small)) + } +} + +func toolNames(ts []Tool) []string { + out := make([]string, len(ts)) + for i, t := range ts { + out[i] = t.Name + } + return out +} From 732b29c703e7ee388fbdf2d665bcbbfbf7e2f473 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 15:42:49 +0300 Subject: [PATCH 2/3] fix(bench): drop stale line numbers from provenance; add WriteReports smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KimiReviewer finding 2: code_execution is at line 626 in mcp.go at 89f06b5c, not 675 as claimed. Line numbers drift with unrelated edits and the actual function names are the stable identifier — remove all line numbers from the provenance comment to prevent future rot. KimiReviewer finding 3: add TestWriteReports_SmokeTest covering WriteReports output (JSON round-trips to Report, HTML is non-empty and contains all mode names). All 5 tests pass; golangci-lint v2 clean. Related #MCP-42 Co-Authored-By: Paperclip --- bench/proxy_tools_v1.json | 2 +- bench/tokens_test.go | 44 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json index 26fc14a8..841770e8 100644 --- a/bench/proxy_tools_v1.json +++ b/bench/proxy_tools_v1.json @@ -1,6 +1,6 @@ { "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).", - "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools (mcp.go:561), call_tool_read/write/destructive variant descriptions (mcp.go:490-528), read_cache (mcp.go:605), code_execution (mcp.go:675). 
Captured verbatim at origin/main 89f06b5c.", + "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools, call_tool_read/write/destructive variant descriptions, read_cache, code_execution. Descriptions captured verbatim at origin/main 89f06b5c. Line numbers omitted (they drift with unrelated edits; grep the function names to locate them).", "version": "proxy_v1", "tools": [ { diff --git a/bench/tokens_test.go b/bench/tokens_test.go index 7265bd3e..f05a3570 100644 --- a/bench/tokens_test.go +++ b/bench/tokens_test.go @@ -1,6 +1,9 @@ package bench import ( + "bytes" + "encoding/json" + "os" "path/filepath" "testing" ) @@ -144,6 +147,47 @@ func TestComputeReport_BaselineMonotonic(t *testing.T) { } } +func TestWriteReports_SmokeTest(t *testing.T) { + tk := newTestTokenizer(t) + corpus := &Corpus{Version: "test", Tools: []Tool{ + {ToolID: "a:1", Server: "a", Name: "tool_a", Description: "does something"}, + }} + rep := ComputeReport(tk, corpus) + + dir := t.TempDir() + jsonPath, htmlPath, err := rep.WriteReports(dir) + if err != nil { + t.Fatalf("WriteReports: %v", err) + } + + // JSON must parse back to a Report with the right corpus version. + data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read json: %v", err) + } + var got Report + if err := json.Unmarshal(data, &got); err != nil { + t.Fatalf("unmarshal json: %v", err) + } + if got.CorpusVersion != "test" { + t.Errorf("corpus version = %q, want %q", got.CorpusVersion, "test") + } + + // HTML must be non-empty and contain the mode names. 
+ html, err := os.ReadFile(htmlPath) + if err != nil { + t.Fatalf("read html: %v", err) + } + if len(html) < 100 { + t.Fatalf("dashboard.html too short (%d bytes)", len(html)) + } + for _, mode := range []string{ModeBaseline, ModeRetrieveTools, ModeCodeExecution} { + if !bytes.Contains(html, []byte(mode)) { + t.Errorf("dashboard.html missing mode %q", mode) + } + } +} + func toolNames(ts []Tool) []string { out := make([]string, len(ts)) for i, t := range ts { From 9a92d7145c14113103d2a972c32f555b6c68b469 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 20:22:00 +0300 Subject: [PATCH 3/3] fix(bench): derive per-mode tool catalog from live server builders incl. management tools (MCP-3161) The token-reduction benchmark scored only 6 hand-maintained proxy tools and omitted the shared management tool set (upstream_servers, quarantine_security, search_servers, list_registries) that both routing modes append via buildManagementTools. That undercounted the proxy-mode context cost and inflated the headline savings (Codex finding on PR #747). Replace bench/proxy_tools_v1.json with server.ProxyModeToolDefs, which builds the catalog from the live builders (buildCallToolModeTools / buildCodeExecModeTools in internal/server/mcp_routing.go) so it can never drift from production and always reflects the tools the agent actually sees. This also fixes a second drift: the fixture's retrieve_tools descriptions did not match the per-mode builder descriptions. Corrected figures over the 45-tool Spec 065 corpus (name+description only): retrieve_tools ~17% (10 tools), code_execution ~43% (6 tools). Updated README and notes; the schema-exclusion claim is no longer unambiguously conservative now that large-schema management tools are in the proxy cost. Tests: bench asserts both modes include the 4 management tools; internal/server pins ProxyModeToolDefs to the builders so the catalog can't silently drift. 
Related #747 --- bench/README.md | 54 ++++++++++++++++---- bench/proxy_tools_v1.json | 43 ---------------- bench/proxytools.go | 73 ++++++++++++---------------- bench/tokens.go | 14 +++--- bench/tokens_test.go | 21 ++++++++ internal/server/bench_export.go | 57 ++++++++++++++++++++++ internal/server/bench_export_test.go | 64 ++++++++++++++++++++++++ 7 files changed, 225 insertions(+), 101 deletions(-) delete mode 100644 bench/proxy_tools_v1.json create mode 100644 internal/server/bench_export.go create mode 100644 internal/server/bench_export_test.go diff --git a/bench/README.md b/bench/README.md index cd7227a6..c99a3868 100644 --- a/bench/README.md +++ b/bench/README.md @@ -13,10 +13,18 @@ wired to upstream MCP tools. | Mode | What the agent sees in context | mcpproxy server | |------|--------------------------------|-----------------| | `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) | -| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache`; tools found on demand via BM25 | `callToolServer` | -| `code_execution` | `code_execution` + `retrieve_tools`; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` | +| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache` + `code_execution` + management tools; tools found on demand via BM25 | `callToolServer` | +| `code_execution` | `code_execution` + `retrieve_tools` + management tools; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` | -(Mode → exposed tools mirrors `internal/server/mcp.go`.) +Both proxy modes also append the shared **management tool set** — +`upstream_servers`, `quarantine_security`, `search_servers`, `list_registries` +— that the live routing-mode servers expose. These count against the proxy +context cost: omitting them undercounts that cost and inflates the savings. 
+ +The per-mode catalog is **derived directly from the live tool builders** +(`buildCallToolModeTools` / `buildCodeExecModeTools` in +`internal/server/mcp_routing.go`, via `server.ProxyModeToolDefs`), so it can +never drift from production. ## What ships today (deterministic, offline) @@ -33,6 +41,23 @@ reports the savings of each proxy mode versus the baseline. Output: a `report.json` and a self-contained `dashboard.html` in `bench/results/` (gitignored). +#### Current deterministic result + +Over the 45-tool Spec 065 reference corpus, counting **tool name + description +only** (schemas excluded uniformly — see limitations), `cl100k_base`: + +| Mode | Context tools | Tokens | Savings vs. baseline | +|------|---------------|--------|----------------------| +| `baseline` | 45 | 1730 | — | +| `retrieve_tools` | 10 | 1431 | **~17%** | +| `code_execution` | 6 | 986 | **~43%** | + +These are deliberately modest: the proxy context here is the *full* per-mode +tool set (discovery + call-tool variants + management tools), and the corpus is +small. Savings grow toward the asymptote as the upstream tool count rises (the +baseline grows linearly while the proxy context stays fixed) — always quote the +corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`. + ### Scoring rubric — token reduction - **Tool universe**: the frozen Spec 065 snapshot @@ -43,6 +68,11 @@ reports the savings of each proxy mode versus the baseline. Output: a (already a repo dependency). It is a **model-agnostic estimator**; exact counts for a specific pinned model (e.g. Claude) will differ, but the *relative* savings between modes are stable. +- **Proxy-mode tools**: the *complete* per-mode catalog, derived from the live + server builders — discovery, the call-tool variants, `code_execution`, **and + the shared management tool set** (`upstream_servers`, `quarantine_security`, + `search_servers`, `list_registries`). Nothing the agent actually sees is + dropped from the proxy cost. 
- **Cost of a tool**: `name + "\n" + description`. JSON input schemas are excluded **uniformly** across all modes (the committed corpus snapshot does not carry schemas). @@ -50,10 +80,13 @@ reports the savings of each proxy mode versus the baseline. Output: a ### Known limitations (read before quoting a number) -- **Schemas excluded → conservative.** Upstream tools carry far larger input - schemas than mcpproxy's handful of proxy tools, so excluding schemas - *understates* the baseline and therefore *understates* the savings. The live - run below adds full schemas for the exact headline number. +- **Schemas excluded — direction is not clean.** Input schemas are dropped from + *both* sides. The 45 baseline tools lose their schemas, but so do the proxy + modes' management tools (e.g. `upstream_servers` carries a large multi-field + schema). So the name+description-only number is **not** unambiguously + conservative — it is its own well-defined metric. The live run below adds full + schemas from `GET /api/v1/tools` for the exact headline number; quote that for + marketing, not this offline estimate. - **Savings scale with tool count.** The 45-tool reference corpus is small; real deployments expose hundreds–thousands of tools, where the baseline grows linearly and the proxy context stays fixed, so savings approach the asymptote. @@ -84,8 +117,11 @@ rather than landed here: (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch, time, sequential-thinking). -- Proxy tool definitions: `bench/proxy_tools_v1.json`, captured verbatim from - `internal/server/mcp.go` (provenance recorded in the file). +- Proxy + management tool definitions: derived at run time from the live server + tool builders (`internal/server/mcp_routing.go` → + `buildCallToolModeTools` / `buildCodeExecModeTools`, exposed via + `internal/server.ProxyModeToolDefs`). 
No hand-maintained fixture — the + benchmark cannot drift from the tools the proxy actually serves. ## Reproducible live run (skeleton) diff --git a/bench/proxy_tools_v1.json b/bench/proxy_tools_v1.json deleted file mode 100644 index 841770e8..00000000 --- a/bench/proxy_tools_v1.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "__doc__": "Frozen snapshot of the mcpproxy built-in proxy tool definitions that occupy the agent's context window in each routing mode. These are the static per-mode context cost the benchmark scores against the baseline (all upstream tools loaded directly).", - "provenance": "internal/server/mcp.go registerTools()/buildCallToolVariantTool() — retrieve_tools, call_tool_read/write/destructive variant descriptions, read_cache, code_execution. Descriptions captured verbatim at origin/main 89f06b5c. Line numbers omitted (they drift with unrelated edits; grep the function names to locate them).", - "version": "proxy_v1", - "tools": [ - { - "tool_id": "mcpproxy:retrieve_tools", - "tool": "retrieve_tools", - "description": "🔍 CALL THIS FIRST to discover relevant tools! This is the primary tool discovery mechanism that searches across ALL upstream MCP servers using intelligent BM25 full-text search. Always use this before attempting to call any specific tools. Use natural language to describe what you want to accomplish (e.g., 'create GitHub repository', 'query database', 'weather forecast'). Results include 'annotations' (tool behavior hints like destructiveHint) and 'call_with' recommendation indicating which tool variant to use (call_tool_read/write/destructive). Then use the recommended variant with an 'intent' parameter. NOTE: Quarantined servers are excluded from search results for security. Use 'quarantine_security' tool to examine and manage quarantined servers. 
TO ADD NEW SERVERS: Use 'list_registries' then 'search_servers' to find and add new MCP servers.", - "modes": ["retrieve_tools", "code_execution"] - }, - { - "tool_id": "mcpproxy:call_tool_read", - "tool": "call_tool_read", - "description": "Execute a READ-ONLY tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: search, query, list, get, fetch, find, check, view, read, show, describe, lookup, retrieve, browse, explore, discover, scan, inspect, analyze, examine, validate, verify. Examples: search_files, get_user, list_repositories, query_database, find_issues, check_status. This is the DEFAULT choice when unsure - most tools are read-only.", - "modes": ["retrieve_tools"] - }, - { - "tool_id": "mcpproxy:call_tool_write", - "tool": "call_tool_write", - "description": "Execute a STATE-MODIFYING tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: create, update, modify, add, set, send, edit, change, write, post, put, patch, insert, upload, submit, assign, configure, enable, register, subscribe, publish, move, copy, rename, merge. Examples: create_issue, update_file, send_message, add_comment, set_status, edit_page. Use only when explicitly modifying state.", - "modes": ["retrieve_tools"] - }, - { - "tool_id": "mcpproxy:call_tool_destructive", - "tool": "call_tool_destructive", - "description": "Execute a DESTRUCTIVE tool. WORKFLOW: 1) Call retrieve_tools first to find tools, 2) Use the exact 'name' field from results. DECISION RULE: Use this when the tool name contains: delete, remove, drop, revoke, disable, destroy, purge, reset, clear, unsubscribe, cancel, terminate, close, archive, ban, block, disconnect, kill, wipe, truncate, force, hard. Examples: delete_repo, remove_user, drop_table, revoke_access, clear_cache, terminate_session. 
Use for irreversible or high-impact operations.", - "modes": ["retrieve_tools"] - }, - { - "tool_id": "mcpproxy:read_cache", - "tool": "read_cache", - "description": "Retrieve paginated data when mcpproxy indicates a tool response was truncated. Use the cache key provided in truncation messages to access the complete dataset with pagination.", - "modes": ["retrieve_tools"] - }, - { - "tool_id": "mcpproxy:code_execution", - "tool": "code_execution", - "description": "Execute JavaScript or TypeScript code that orchestrates multiple upstream MCP tools in a single request. Use this when you need to combine results from 2+ tools, implement conditional logic, loops, or data transformations that would require multiple round-trips otherwise.\n\n**When to use**: Multi-step workflows with data transformation, conditional logic, error handling, or iterating over results.\n**When NOT to use**: Single tool calls (use call_tool directly), long-running operations (>2 minutes).\n\n**Available in code**:\n- `input` global: Your input data passed via the 'input' parameter\n- `call_tool(serverName, toolName, args)`: Call upstream tools (returns {ok, result} or {ok, error})\n- Modern JavaScript (ES2020+): arrow functions, const/let, template literals, destructuring, classes, for-of, optional chaining (?.), nullish coalescing (??), spread/rest, Promises, Symbols, Map/Set, Proxy/Reflect (no require(), filesystem, or network access)\n\n**TypeScript support**: Set `language: \"typescript\"` to write TypeScript code with type annotations, interfaces, enums, and generics. Types are automatically stripped before execution.\n\n**Important runtime rules**:\n- `call_tool` is strictly SYNCHRONOUS. Do not use `await`.\n- Upstream tools usually return an MCP content array. 
To parse JSON results: `const data = JSON.parse(res.result.content[0].text);`\n- The last evaluated expression in your script is automatically returned as the final output.\n\n**Security**: Sandboxed execution with timeout enforcement. Respects existing quarantine and server restrictions.", - "modes": ["code_execution"] - } - ] -} diff --git a/bench/proxytools.go b/bench/proxytools.go index 8191d033..dda5edd4 100644 --- a/bench/proxytools.go +++ b/bench/proxytools.go @@ -1,53 +1,40 @@ package bench import ( - _ "embed" - "encoding/json" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/server" ) -//go:embed proxy_tools_v1.json -var proxyToolsJSON []byte - -// proxyTool is a built-in mcpproxy tool definition plus the routing modes that -// expose it in the agent's context. -type proxyTool struct { - ToolID string `json:"tool_id"` - Name string `json:"tool"` - Description string `json:"description"` - Modes []string `json:"modes"` -} - -type proxyToolFixture struct { - Version string `json:"version"` - Tools []proxyTool `json:"tools"` -} - -var proxyTools proxyToolFixture - -func init() { - if err := json.Unmarshal(proxyToolsJSON, &proxyTools); err != nil { - // The fixture is embedded at build time; a parse failure is a build/test - // bug, not a runtime condition. - panic("bench: invalid embedded proxy_tools_v1.json: " + err.Error()) +// ProxyToolsForMode returns the built-in mcpproxy proxy + management tool +// definitions that occupy the agent's context window in the given routing mode. +// +// The catalog is derived directly from the live server tool builders +// (internal/server.ProxyModeToolDefs → buildCallToolModeTools / +// buildCodeExecModeTools in internal/server/mcp_routing.go). 
This is the single +// source of truth: both routing modes append the shared management tool set +// (upstream_servers, quarantine_security, search_servers, list_registries), so +// deriving from the builders guarantees the benchmark counts the real per-mode +// context cost and can never drift from production by re-introducing the +// undercount that inflated the headline savings (MCP-3161). +func ProxyToolsForMode(mode string) []Tool { + var routingMode string + switch mode { + case ModeCodeExecution: + routingMode = config.RoutingModeCodeExecution + case ModeRetrieveTools: + routingMode = config.RoutingModeRetrieveTools + default: + return nil } -} -// ProxyToolsForMode returns the built-in proxy tool definitions that occupy the -// agent's context window in the given routing mode. Provenance for each -// definition is in proxy_tools_v1.json (captured from internal/server/mcp.go). -func ProxyToolsForMode(mode string) []Tool { - var out []Tool - for _, pt := range proxyTools.Tools { - for _, m := range pt.Modes { - if m == mode { - out = append(out, Tool{ - ToolID: pt.ToolID, - Name: pt.Name, - Description: pt.Description, - }) - break - } - } + defs := server.ProxyModeToolDefs(routingMode) + out := make([]Tool, 0, len(defs)) + for _, d := range defs { + out = append(out, Tool{ + ToolID: "mcpproxy:" + d.Name, + Name: d.Name, + Description: d.Description, + }) } return out } diff --git a/bench/tokens.go b/bench/tokens.go index f903fbab..e61b3ed4 100644 --- a/bench/tokens.go +++ b/bench/tokens.go @@ -101,11 +101,12 @@ func (t *Tokenizer) Count(text string) int { // // It counts the tool name and description only. Input JSON schemas are excluded // uniformly across every mode because the committed Spec 065 corpus snapshot -// does not carry schemas. 
This is deliberately conservative for the headline -// claim: upstream tools carry far larger schemas than mcpproxy's handful of -// proxy tools, so excluding schemas *understates* the baseline and therefore -// understates the measured savings. The live docker-compose run (README.md) -// adds full schemas from GET /api/v1/tools for the exact headline number. +// does not carry schemas. Schemas are dropped from BOTH sides — the baseline's +// upstream tools and the proxy modes' management tools (e.g. upstream_servers +// carries a large multi-field schema) — so this is a well-defined +// name+description-only metric, not an unambiguously conservative one. The live +// docker-compose run (README.md) adds full schemas from GET /api/v1/tools for +// the exact headline number. func (t *Tokenizer) CountTool(tl Tool) int { return t.Count(tl.Name + "\n" + tl.Description) } @@ -164,7 +165,8 @@ func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report { }, Notes: []string{ "Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.", - "Counts tool name + description only; JSON input schemas are excluded uniformly, which understates the baseline and is therefore conservative for the savings claim.", + "Proxy-mode tools are the full per-mode catalog derived from the live server builders (internal/server.ProxyModeToolDefs), including the shared management tool set (upstream_servers, quarantine_security, search_servers, list_registries).", + "Counts tool name + description only; JSON input schemas are excluded uniformly from both the baseline and the proxy modes, so this is a name+description-only metric (not unambiguously conservative). 
See bench/README.md for the live run with full schemas.", "Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.", }, } diff --git a/bench/tokens_test.go b/bench/tokens_test.go index f05a3570..26296fe4 100644 --- a/bench/tokens_test.go +++ b/bench/tokens_test.go @@ -70,6 +70,27 @@ func TestProxyToolsForMode(t *testing.T) { if !hasCodeExec || !hasRetrieve { t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce)) } + + // Both routing modes append the shared management tool set + // (internal/server/mcp_routing.go buildManagementTools). Omitting these + // undercounts the proxy-mode context cost and overstates the savings + // (MCP-3161 / Codex finding on PR #747). Assert they are present so the + // benchmark catalog can never silently drop them again. + mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"} + for _, mode := range []string{ModeRetrieveTools, ModeCodeExecution} { + got := map[string]bool{} + for _, tl := range ProxyToolsForMode(mode) { + got[tl.Name] = true + if tl.Description == "" { + t.Errorf("mode %s: tool %q has empty description", mode, tl.Name) + } + } + for _, name := range mgmt { + if !got[name] { + t.Errorf("mode %s: missing management tool %q (proxy context cost undercounted)", mode, name) + } + } + } } func TestComputeReport_SavingsAreReal(t *testing.T) { diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go new file mode 100644 index 00000000..95987195 --- /dev/null +++ b/internal/server/bench_export.go @@ -0,0 +1,57 @@ +package server + +import ( + mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" +) + +// BenchProxyToolDef is a static built-in proxy/management tool definition +// (name + description) exposed for the in-repo benchmark harness 
(bench/). +// +// The benchmark scores the per-mode context cost an agent pays for mcpproxy's +// own tools. That cost MUST reflect every tool the live routing-mode servers +// expose — including the shared management tool set (upstream_servers, +// quarantine_security, search_servers, list_registries) that both modes append +// via buildManagementTools — or the benchmark overstates the token savings +// (MCP-3161 / Codex finding on PR #747). +type BenchProxyToolDef struct { + Name string + Description string +} + +// ProxyModeToolDefs returns the static built-in proxy + management tool +// definitions an agent sees in its context window for the given routing mode +// (config.RoutingModeRetrieveTools or config.RoutingModeCodeExecution). +// +// It is built from the SAME builders the live server uses +// (buildCallToolModeTools / buildCodeExecModeTools in mcp_routing.go) so the +// benchmark catalog can never drift from production. Code execution is enabled +// so the real code_execution tool description (not the disabled stub) is scored +// — the code_execution routing mode only makes sense with the tool enabled. 
+func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef { + p := &MCPProxyServer{ + logger: zap.NewNop(), + config: &config.Config{ + EnableCodeExecution: true, + }, + } + + var serverTools []mcpserver.ServerTool + switch routingMode { + case config.RoutingModeCodeExecution: + serverTools = p.buildCodeExecModeTools() + default: // retrieve_tools — the default routing mode + serverTools = p.buildCallToolModeTools() + } + + defs := make([]BenchProxyToolDef, 0, len(serverTools)) + for _, st := range serverTools { + defs = append(defs, BenchProxyToolDef{ + Name: st.Tool.Name, + Description: st.Tool.Description, + }) + } + return defs +} diff --git a/internal/server/bench_export_test.go b/internal/server/bench_export_test.go new file mode 100644 index 00000000..5dd63132 --- /dev/null +++ b/internal/server/bench_export_test.go @@ -0,0 +1,64 @@ +package server + +import ( + "testing" + + mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" +) + +// TestProxyModeToolDefs_IncludesManagementTools guards the benchmark integrity +// fix (MCP-3161): every routing mode exposes the shared management tool set, so +// the benchmark catalog must include it or it undercounts the proxy-mode context +// cost and overstates the savings. 
+func TestProxyModeToolDefs_IncludesManagementTools(t *testing.T) { + mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"} + for _, mode := range []string{config.RoutingModeRetrieveTools, config.RoutingModeCodeExecution} { + defs := ProxyModeToolDefs(mode) + if len(defs) == 0 { + t.Fatalf("mode %s: no proxy tool defs", mode) + } + names := map[string]bool{} + for _, d := range defs { + names[d.Name] = true + if d.Description == "" { + t.Errorf("mode %s: tool %q has empty description", mode, d.Name) + } + } + for _, m := range mgmt { + if !names[m] { + t.Errorf("mode %s: missing management tool %q", mode, m) + } + } + } +} + +// TestProxyModeToolDefs_MatchesBuilders pins ProxyModeToolDefs to the live tool +// builders. If a mode's tool set changes in mcp_routing.go, the benchmark +// catalog tracks it automatically and this test proves the coupling holds. +func TestProxyModeToolDefs_MatchesBuilders(t *testing.T) { + p := &MCPProxyServer{ + logger: zap.NewNop(), + config: &config.Config{EnableCodeExecution: true}, + } + cases := map[string][]mcpserver.ServerTool{ + config.RoutingModeRetrieveTools: p.buildCallToolModeTools(), + config.RoutingModeCodeExecution: p.buildCodeExecModeTools(), + } + for mode, builderTools := range cases { + defs := ProxyModeToolDefs(mode) + if len(defs) != len(builderTools) { + t.Fatalf("mode %s: ProxyModeToolDefs len %d != builder len %d", mode, len(defs), len(builderTools)) + } + for i := range builderTools { + if defs[i].Name != builderTools[i].Tool.Name { + t.Errorf("mode %s: def[%d] name %q != builder %q", mode, i, defs[i].Name, builderTools[i].Tool.Name) + } + if defs[i].Description != builderTools[i].Tool.Description { + t.Errorf("mode %s: def[%d] description mismatch for %q", mode, i, defs[i].Name) + } + } + } +}