diff --git a/bench/.gitignore b/bench/.gitignore new file mode 100644 index 00000000..4baf56cf --- /dev/null +++ b/bench/.gitignore @@ -0,0 +1,2 @@ +# Benchmark run artifacts are never committed (Spec 065 CN-003). +results/ diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 00000000..c99a3868 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,135 @@ +# mcpproxy benchmark harness + +The reproducible numbers behind mcpproxy's marketing claims — **token reduction**, +**discovery accuracy**, and **latency** — comparing three ways an agent can be +wired to upstream MCP tools. + +> Roadmap item #19 (MCP-42). In-repo (`bench/`), reproducible, intended to be +> refreshed on release. Reports are **never committed** (Spec 065 CN-003); only +> code, fixtures, and this methodology are versioned. + +## The three modes + +| Mode | What the agent sees in context | mcpproxy server | +|------|--------------------------------|-----------------| +| `baseline` | Every upstream tool definition, loaded directly | (no proxy discovery) | +| `retrieve_tools` | `retrieve_tools` + `call_tool_read/write/destructive` + `read_cache` + `code_execution` + management tools; tools found on demand via BM25 | `callToolServer` | +| `code_execution` | `code_execution` + `retrieve_tools` + management tools; many tools orchestrated from sandboxed JS in one round-trip | `codeExecServer` | + +Both proxy modes also append the shared **management tool set** — +`upstream_servers`, `quarantine_security`, `search_servers`, `list_registries` +— that the live routing-mode servers expose. These count against the proxy +context cost: omitting them undercounts that cost and inflates the savings. + +The per-mode catalog is **derived directly from the live tool builders** +(`buildCallToolModeTools` / `buildCodeExecModeTools` in +`internal/server/mcp_routing.go`, via `server.ProxyModeToolDefs`), so it can +never drift from production. 
+ +## What ships today (deterministic, offline) + +The **token-reduction** measurement is fully deterministic and runs with no +network or LLM: + +```bash +go run ./bench/cmd/bench # scores the committed Spec 065 corpus +go test ./bench/ # unit + invariant tests +``` + +It counts the context-token cost of each mode over a **frozen tool corpus** and +reports the savings of each proxy mode versus the baseline. Output: a +`report.json` and a self-contained `dashboard.html` in `bench/results/` +(gitignored). + +#### Current deterministic result + +Over the 45-tool Spec 065 reference corpus, counting **tool name + description +only** (schemas excluded uniformly — see limitations), `cl100k_base`: + +| Mode | Context tools | Tokens | Savings vs. baseline | +|------|---------------|--------|----------------------| +| `baseline` | 45 | 1730 | — | +| `retrieve_tools` | 10 | 1431 | **~17%** | +| `code_execution` | 6 | 986 | **~43%** | + +These are deliberately modest: the proxy context here is the *full* per-mode +tool set (discovery + call-tool variants + management tools), and the corpus is +small. Savings grow toward the asymptote as the upstream tool count rises (the +baseline grows linearly while the proxy context stays fixed) — always quote the +corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`. + +### Scoring rubric — token reduction + +- **Tool universe**: the frozen Spec 065 snapshot + `specs/065-evaluation-foundation/datasets/corpus_v1.tools.json` — 45 tools + across 7 no-auth reference servers. Frozen + versioned so scoring never runs + against a drifting corpus (CN-002). +- **Tokenizer**: `tiktoken cl100k_base`, a widely-used reproducible BPE + (already a repo dependency). It is a **model-agnostic estimator**; exact + counts for a specific pinned model (e.g. Claude) will differ, but the + *relative* savings between modes are stable. 
+- **Proxy-mode tools**: the *complete* per-mode catalog, derived from the live + server builders — discovery, the call-tool variants, `code_execution`, **and + the shared management tool set** (`upstream_servers`, `quarantine_security`, + `search_servers`, `list_registries`). Nothing the agent actually sees is + dropped from the proxy cost. +- **Cost of a tool**: `name + "\n" + description`. JSON input schemas are + excluded **uniformly** across all modes (the committed corpus snapshot does + not carry schemas). +- **Savings** for a mode `m`: `1 - tokens(m) / tokens(baseline)`. + +### Known limitations (read before quoting a number) + +- **Schemas excluded — direction is not clean.** Input schemas are dropped from + *both* sides. The 45 baseline tools lose their schemas, but so do the proxy + modes' management tools (e.g. `upstream_servers` carries a large multi-field + schema). So the name+description-only number is **not** unambiguously + conservative — it is its own well-defined metric. The live run below adds full + schemas from `GET /api/v1/tools` for the exact headline number; quote that for + marketing, not this offline estimate. +- **Savings scale with tool count.** The 45-tool reference corpus is small; real + deployments expose hundreds–thousands of tools, where the baseline grows + linearly and the proxy context stays fixed, so savings approach the asymptote. + Quote the corpus size alongside any percentage. +- **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer + for the headline model is a follow-up (see "What is scoped but not yet built" below). 
+ +## What is scoped but not yet built (follow-ups) + +These require decisions and/or other roles, so they are tracked as child issues +rather than landed here: + +- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the + Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull + `GET /api/v1/tools` for exact schemas, and: + - **Accuracy**: replay the Spec 065 retrieval golden set + (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k / + MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer. + - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the + fixed cost of loading all tools. +- **End-to-end task success with a pinned LLM** — requires a pinned model + an + LLM-call budget; this is the only part that costs spend. +- **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane. + +## Dataset sources & provenance + +- Tool corpus + retrieval golden set: Spec 065 frozen datasets + (`specs/065-evaluation-foundation/datasets/`), generated from 7 permissively + reachable no-auth reference servers (filesystem, git, memory, sqlite, fetch, + time, sequential-thinking). +- Proxy + management tool definitions: derived at run time from the live server + tool builders (`internal/server/mcp_routing.go` → + `buildCallToolModeTools` / `buildCodeExecModeTools`, exposed via + `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the + benchmark cannot drift from the tools the proxy actually serves. + +## Reproducible live run (skeleton) + +`docker-compose.yml` boots mcpproxy over the frozen reference-server config so +the corpus and live tool list are reproducible across machines. Wiring the live +accuracy/latency scorers into it is the follow-up above. + +## Reviewer contact + +Methodology questions / disputes: open an issue in `smart-mcp-proxy/mcpproxy-go` +and tag the maintainers, or comment on the roadmap benchmark ticket (MCP-42). 
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go new file mode 100644 index 00000000..a5e924b2 --- /dev/null +++ b/bench/cmd/bench/main.go @@ -0,0 +1,52 @@ +// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool +// corpus and writes a JSON report plus a static HTML dashboard. +// +// Usage: +// +// go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME] +// +// With no flags it scores the committed Spec 065 frozen corpus and writes the +// reports to bench/results/ (gitignored — reports are never committed, per the +// Spec 065 CN-003 repo rule). +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "github.com/smart-mcp-proxy/mcpproxy-go/bench" +) + +func main() { + corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot") + outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html") + encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name") + flag.Parse() + + tk, err := bench.NewTokenizer(*encoding) + if err != nil { + log.Fatalf("bench: %v", err) + } + corpus, err := bench.LoadCorpus(*corpusPath) + if err != nil { + log.Fatalf("bench: %v", err) + } + + report := bench.ComputeReport(tk, corpus) + jsonPath, htmlPath, err := report.WriteReports(*outDir) + if err != nil { + log.Fatalf("bench: %v", err) + } + + fmt.Fprintf(os.Stdout, "mcpproxy token-reduction benchmark (corpus %s, %d tools, %s)\n", report.CorpusVersion, report.CorpusTools, report.Encoding) + for _, m := range report.Modes { + if m.Mode == bench.ModeBaseline { + fmt.Fprintf(os.Stdout, " %-16s %6d tokens (%d tools) baseline\n", m.Mode, m.Tokens, m.ContextTools) + continue + } + fmt.Fprintf(os.Stdout, " %-16s %6d tokens (%d tools) %.1f%% fewer tokens\n", m.Mode, m.Tokens, m.ContextTools, m.SavingsRatio*100) + } + fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath) +} diff --git 
a/bench/docker-compose.yml b/bench/docker-compose.yml new file mode 100644 index 00000000..7c420947 --- /dev/null +++ b/bench/docker-compose.yml @@ -0,0 +1,37 @@ +# Reproducible benchmark substrate (skeleton). +# +# Boots mcpproxy over the frozen Spec 065 reference-server config so the tool +# corpus and live tool list are identical across machines. The live +# accuracy/latency scorers (see bench/README.md "follow-ups") attach to this. +# +# Usage: +# docker compose -f bench/docker-compose.yml up --build +# # then, against the running proxy on 127.0.0.1:8092: +# # GET /api/v1/tools -> full tool defs (with schemas) for the live token run +# # retrieve_tools -> Recall@k accuracy over retrieval_golden_v1.json +# +# The committed corpus_v1 snapshot was frozen from exactly this config +# (specs/065-evaluation-foundation/datasets/README.md), so a live snapshot here +# reproduces it (modulo upstream-server version drift — pin images before +# publishing headline numbers). +services: + mcpproxy: + build: + context: .. + dockerfile: Dockerfile + command: + - serve + - --config=/data/snapshot-servers.config.json + - --data-dir=/data/state + - --listen=0.0.0.0:8092 + environment: + MCPPROXY_API_KEY: eval-corpus-snapshot + ports: + - "127.0.0.1:8092:8092" + volumes: + # The frozen, servable reference-server config (7 no-auth servers). + - ../specs/065-evaluation-foundation/datasets/snapshot-servers.config.json:/data/snapshot-servers.config.json:ro + - bench-state:/data/state + +volumes: + bench-state: diff --git a/bench/proxytools.go b/bench/proxytools.go new file mode 100644 index 00000000..dda5edd4 --- /dev/null +++ b/bench/proxytools.go @@ -0,0 +1,40 @@ +package bench + +import ( + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/server" +) + +// ProxyToolsForMode returns the built-in mcpproxy proxy + management tool +// definitions that occupy the agent's context window in the given routing mode. 
+// +// The catalog is derived directly from the live server tool builders +// (internal/server.ProxyModeToolDefs → buildCallToolModeTools / +// buildCodeExecModeTools in internal/server/mcp_routing.go). This is the single +// source of truth: both routing modes append the shared management tool set +// (upstream_servers, quarantine_security, search_servers, list_registries), so +// deriving from the builders guarantees the benchmark counts the real per-mode +// context cost and can never drift from production by re-introducing the +// undercount that inflated the headline savings (MCP-3161). +func ProxyToolsForMode(mode string) []Tool { + var routingMode string + switch mode { + case ModeCodeExecution: + routingMode = config.RoutingModeCodeExecution + case ModeRetrieveTools: + routingMode = config.RoutingModeRetrieveTools + default: + return nil + } + + defs := server.ProxyModeToolDefs(routingMode) + out := make([]Tool, 0, len(defs)) + for _, d := range defs { + out = append(out, Tool{ + ToolID: "mcpproxy:" + d.Name, + Name: d.Name, + Description: d.Description, + }) + } + return out +} diff --git a/bench/report.go b/bench/report.go new file mode 100644 index 00000000..0ccbee4f --- /dev/null +++ b/bench/report.go @@ -0,0 +1,105 @@ +package bench + +import ( + "encoding/json" + "fmt" + "html/template" + "os" + "path/filepath" +) + +// WriteJSON writes the report as indented JSON to path. +func (r *Report) WriteJSON(path string) error { + data, err := json.MarshalIndent(r, "", " ") + if err != nil { + return fmt.Errorf("marshal report: %w", err) + } + if err := os.WriteFile(path, append(data, '\n'), 0o644); err != nil { + return fmt.Errorf("write %q: %w", path, err) + } + return nil +} + +// WriteHTML renders the report as a self-contained static dashboard. The output +// is a single file with no external assets so it can be published as-is to a +// static host (CI release-tag publishing is tracked as a follow-up). 
+func (r *Report) WriteHTML(path string) error { + tmpl, err := template.New("dashboard").Funcs(template.FuncMap{ + "pct": func(f float64) string { return fmt.Sprintf("%.1f%%", f*100) }, + }).Parse(dashboardHTML) + if err != nil { + return fmt.Errorf("parse template: %w", err) + } + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create %q: %w", path, err) + } + defer f.Close() + if err := tmpl.Execute(f, r); err != nil { + return fmt.Errorf("render dashboard: %w", err) + } + return nil +} + +// WriteReports writes both report.json and dashboard.html into dir. +func (r *Report) WriteReports(dir string) (jsonPath, htmlPath string, err error) { + if err = os.MkdirAll(dir, 0o755); err != nil { + return "", "", fmt.Errorf("mkdir %q: %w", dir, err) + } + jsonPath = filepath.Join(dir, "report.json") + htmlPath = filepath.Join(dir, "dashboard.html") + if err = r.WriteJSON(jsonPath); err != nil { + return "", "", err + } + if err = r.WriteHTML(htmlPath); err != nil { + return "", "", err + } + return jsonPath, htmlPath, nil +} + +const dashboardHTML = ` + + + + +mcpproxy benchmark — token reduction + + + +

mcpproxy benchmark

+

Token cost of loading tools into an agent's context, by routing mode.

+

Corpus {{.CorpusVersion}} · {{.CorpusTools}} tools · encoding {{.Encoding}}

+ + + + + + {{range .Modes}} + + + + + + + {{end}} + +
ModeTools in contextContext tokensSavings vs. baseline
{{.Mode}}{{.ContextTools}}{{.Tokens}}{{if eq .Mode "baseline"}}—{{else}}{{pct .SavingsRatio}}{{end}}
+

Methodology notes

+ + + +` diff --git a/bench/tokens.go b/bench/tokens.go new file mode 100644 index 00000000..e61b3ed4 --- /dev/null +++ b/bench/tokens.go @@ -0,0 +1,173 @@ +// Package bench is the mcpproxy benchmark harness (roadmap #19 / MCP-42). +// +// It produces the reproducible numbers behind mcpproxy's marketing claims — +// token reduction, discovery accuracy, and latency — by comparing three ways +// an agent can be wired to upstream MCP tools: +// +// - baseline: every upstream tool definition is loaded directly into the +// agent's context (no proxy discovery). +// - retrieve_tools: discovery, the call_tool variants, read_cache, code_execution, +// and the management tools occupy the context; tools are found on demand via BM25 search. +// - code_execution: code_execution, retrieve_tools, and the management tools occupy +// the context; the agent orchestrates many tools from sandboxed JS in one round-trip. +// +// The token-reduction measurement in this file is fully deterministic and +// offline: it counts the context cost of each mode over a frozen tool corpus +// using the tiktoken cl100k_base encoding (a reproducible, model-agnostic +// estimator). It reuses the Spec 065 frozen corpus +// (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json) as its tool +// universe so the benchmark scores a versioned, non-drifting snapshot (CN-002). +// +// Methodology, limitations, and the live (docker-compose) run that adds full +// JSON input schemas and end-to-end accuracy/latency are documented in +// bench/README.md. +package bench + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/pkoukk/tiktoken-go" +) + +// DefaultEncoding is the tiktoken encoding used for token estimation. cl100k_base +// is a widely-used, reproducible BPE; exact counts for a specific pinned model +// (e.g. Claude) will differ, but the *relative* savings between modes are stable. +const DefaultEncoding = "cl100k_base" + +// Routing modes the benchmark compares. 
The mode names mirror the mcpproxy +// MCP servers in internal/server/mcp.go (codeExecServer, callToolServer). +const ( + ModeBaseline = "baseline" + ModeRetrieveTools = "retrieve_tools" + ModeCodeExecution = "code_execution" +) + +// Tool is a single tool definition the benchmark scores token cost over. It +// matches the shape of both the Spec 065 corpus snapshot and the embedded +// proxy-tool fixture. +type Tool struct { + ToolID string `json:"tool_id"` + Server string `json:"server"` + Name string `json:"tool"` + Description string `json:"description"` +} + +// Corpus is a frozen, versioned set of tool definitions. +type Corpus struct { + Version string `json:"version"` + Tools []Tool `json:"tools"` +} + +// LoadCorpus reads a frozen corpus snapshot (e.g. the Spec 065 +// corpus_v1.tools.json) from disk. +func LoadCorpus(path string) (*Corpus, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read corpus %q: %w", path, err) + } + var c Corpus + if err := json.Unmarshal(data, &c); err != nil { + return nil, fmt.Errorf("parse corpus %q: %w", path, err) + } + if len(c.Tools) == 0 { + return nil, fmt.Errorf("corpus %q contains no tools", path) + } + return &c, nil +} + +// Tokenizer wraps a tiktoken encoding for reproducible token estimation. +type Tokenizer struct { + enc *tiktoken.Tiktoken + encoding string +} + +// NewTokenizer constructs a Tokenizer for the given tiktoken encoding name. +func NewTokenizer(encoding string) (*Tokenizer, error) { + enc, err := tiktoken.GetEncoding(encoding) + if err != nil { + return nil, fmt.Errorf("load tiktoken encoding %q: %w", encoding, err) + } + return &Tokenizer{enc: enc, encoding: encoding}, nil +} + +// Count returns the number of tokens in text. +func (t *Tokenizer) Count(text string) int { + return len(t.enc.Encode(text, nil, nil)) +} + +// CountTool returns the context-token cost of a single tool definition. +// +// It counts the tool name and description only. 
Input JSON schemas are excluded +// uniformly across every mode because the committed Spec 065 corpus snapshot +// does not carry schemas. Schemas are dropped from BOTH sides — the baseline's +// upstream tools and the proxy modes' management tools (e.g. upstream_servers +// carries a large multi-field schema) — so this is a well-defined +// name+description-only metric, not an unambiguously conservative one. The live +// docker-compose run (README.md) adds full schemas from GET /api/v1/tools for +// the exact headline number. +func (t *Tokenizer) CountTool(tl Tool) int { + return t.Count(tl.Name + "\n" + tl.Description) +} + +func (t *Tokenizer) countTools(tools []Tool) int { + total := 0 + for _, tl := range tools { + total += t.CountTool(tl) + } + return total +} + +// ModeResult is the per-mode context-cost outcome. +type ModeResult struct { + Mode string `json:"mode"` + ContextTools int `json:"context_tools"` + Tokens int `json:"tokens"` + SavingsRatio float64 `json:"savings_vs_baseline"` +} + +// Report is the full token-reduction benchmark result. +type Report struct { + Encoding string `json:"encoding"` + CorpusVersion string `json:"corpus_version"` + CorpusTools int `json:"corpus_tools"` + Modes []ModeResult `json:"modes"` + Notes []string `json:"notes"` +} + +// ComputeReport computes the per-mode context-token cost over the corpus and the +// savings of each proxy mode versus the baseline (all tools loaded directly). 
+func ComputeReport(tk *Tokenizer, corpus *Corpus) *Report { + baseTokens := tk.countTools(corpus.Tools) + + rtTools := ProxyToolsForMode(ModeRetrieveTools) + ceTools := ProxyToolsForMode(ModeCodeExecution) + + savings := func(tokens int) float64 { + if baseTokens == 0 { + return 0 + } + return 1.0 - float64(tokens)/float64(baseTokens) + } + + rtTokens := tk.countTools(rtTools) + ceTokens := tk.countTools(ceTools) + + return &Report{ + Encoding: tk.encoding, + CorpusVersion: corpus.Version, + CorpusTools: len(corpus.Tools), + Modes: []ModeResult{ + {Mode: ModeBaseline, ContextTools: len(corpus.Tools), Tokens: baseTokens, SavingsRatio: 0}, + {Mode: ModeRetrieveTools, ContextTools: len(rtTools), Tokens: rtTokens, SavingsRatio: savings(rtTokens)}, + {Mode: ModeCodeExecution, ContextTools: len(ceTools), Tokens: ceTokens, SavingsRatio: savings(ceTokens)}, + }, + Notes: []string{ + "Token counts use the tiktoken " + tk.encoding + " encoding as a reproducible, model-agnostic estimator; exact counts for a pinned model may differ.", + "Proxy-mode tools are the full per-mode catalog derived from the live server builders (internal/server.ProxyModeToolDefs), including the shared management tool set (upstream_servers, quarantine_security, search_servers, list_registries).", + "Counts tool name + description only; JSON input schemas are excluded uniformly from both the baseline and the proxy modes, so this is a name+description-only metric (not unambiguously conservative). 
See bench/README.md for the live run with full schemas.", + "Corpus is the frozen Spec 065 snapshot (specs/065-evaluation-foundation/datasets/corpus_v1.tools.json); see bench/README.md for the live run with full schemas.", + }, + } +} diff --git a/bench/tokens_test.go b/bench/tokens_test.go new file mode 100644 index 00000000..26296fe4 --- /dev/null +++ b/bench/tokens_test.go @@ -0,0 +1,218 @@ +package bench + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "testing" +) + +// repoCorpus is the committed Spec 065 frozen corpus, reused here as the +// benchmark's tool universe (45 tools, 7 no-auth reference servers). +const repoCorpus = "../specs/065-evaluation-foundation/datasets/corpus_v1.tools.json" + +func newTestTokenizer(t *testing.T) *Tokenizer { + t.Helper() + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + t.Fatalf("NewTokenizer: %v", err) + } + return tk +} + +func TestTokenizer_DeterministicAndPositive(t *testing.T) { + tk := newTestTokenizer(t) + text := "Fetches a URL from the internet and extracts its contents as markdown." + a := tk.Count(text) + b := tk.Count(text) + if a != b { + t.Fatalf("tokenizer not deterministic: %d != %d", a, b) + } + if a <= 0 { + t.Fatalf("expected positive token count, got %d", a) + } +} + +func TestProxyToolsForMode(t *testing.T) { + rt := ProxyToolsForMode(ModeRetrieveTools) + if len(rt) == 0 { + t.Fatal("retrieve_tools mode exposes no proxy tools") + } + // retrieve_tools mode must expose the discovery tool + the call_tool variants. 
+ want := map[string]bool{ + "retrieve_tools": false, + "call_tool_read": false, + "call_tool_write": false, + "call_tool_destructive": false, + } + for _, tl := range rt { + if _, ok := want[tl.Name]; ok { + want[tl.Name] = true + } + } + for name, found := range want { + if !found { + t.Errorf("retrieve_tools mode missing expected proxy tool %q", name) + } + } + + ce := ProxyToolsForMode(ModeCodeExecution) + var hasCodeExec, hasRetrieve bool + for _, tl := range ce { + switch tl.Name { + case "code_execution": + hasCodeExec = true + case "retrieve_tools": + hasRetrieve = true + } + } + if !hasCodeExec || !hasRetrieve { + t.Errorf("code_execution mode must expose code_execution + retrieve_tools, got %v", toolNames(ce)) + } + + // Both routing modes append the shared management tool set + // (internal/server/mcp_routing.go buildManagementTools). Omitting these + // undercounts the proxy-mode context cost and overstates the savings + // (MCP-3161 / Codex finding on PR #747). Assert they are present so the + // benchmark catalog can never silently drop them again. 
+ mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"} + for _, mode := range []string{ModeRetrieveTools, ModeCodeExecution} { + got := map[string]bool{} + for _, tl := range ProxyToolsForMode(mode) { + got[tl.Name] = true + if tl.Description == "" { + t.Errorf("mode %s: tool %q has empty description", mode, tl.Name) + } + } + for _, name := range mgmt { + if !got[name] { + t.Errorf("mode %s: missing management tool %q (proxy context cost undercounted)", mode, name) + } + } + } +} + +func TestComputeReport_SavingsAreReal(t *testing.T) { + tk := newTestTokenizer(t) + corpus, err := LoadCorpus(filepath.Clean(repoCorpus)) + if err != nil { + t.Fatalf("LoadCorpus: %v", err) + } + if len(corpus.Tools) < 40 { + t.Fatalf("expected the frozen corpus to have ~45 tools, got %d", len(corpus.Tools)) + } + + rep := ComputeReport(tk, corpus) + + modes := map[string]ModeResult{} + for _, m := range rep.Modes { + modes[m.Mode] = m + } + + base, ok := modes[ModeBaseline] + if !ok { + t.Fatal("report missing baseline mode") + } + if base.SavingsRatio != 0 { + t.Errorf("baseline savings must be 0, got %v", base.SavingsRatio) + } + if base.Tokens <= 0 { + t.Fatalf("baseline tokens must be positive, got %d", base.Tokens) + } + + rt := modes[ModeRetrieveTools] + ce := modes[ModeCodeExecution] + + // The whole product thesis: discovery/orchestration modes load far fewer + // tokens into context than loading every upstream tool directly. + if rt.Tokens >= base.Tokens { + t.Errorf("retrieve_tools (%d) should use fewer tokens than baseline (%d)", rt.Tokens, base.Tokens) + } + if ce.Tokens >= base.Tokens { + t.Errorf("code_execution (%d) should use fewer tokens than baseline (%d)", ce.Tokens, base.Tokens) + } + + // Savings ratio must be in (0,1) and match the arithmetic. 
+ wantRT := 1.0 - float64(rt.Tokens)/float64(base.Tokens) + if diff := rt.SavingsRatio - wantRT; diff > 1e-9 || diff < -1e-9 { + t.Errorf("retrieve_tools savings ratio %v != computed %v", rt.SavingsRatio, wantRT) + } + if rt.SavingsRatio <= 0 || rt.SavingsRatio >= 1 { + t.Errorf("retrieve_tools savings ratio out of (0,1): %v", rt.SavingsRatio) + } +} + +func TestComputeReport_BaselineMonotonic(t *testing.T) { + tk := newTestTokenizer(t) + full := &Corpus{Version: "test", Tools: []Tool{ + {ToolID: "a:1", Server: "a", Name: "one", Description: "alpha tool that does something useful"}, + {ToolID: "b:2", Server: "b", Name: "two", Description: "beta tool that does something else useful"}, + {ToolID: "c:3", Server: "c", Name: "three", Description: "gamma tool with a longer description for token weight"}, + }} + fewer := &Corpus{Version: "test", Tools: full.Tools[:1]} + + big := ComputeReport(tk, full) + small := ComputeReport(tk, fewer) + + baseOf := func(r *Report) int { + for _, m := range r.Modes { + if m.Mode == ModeBaseline { + return m.Tokens + } + } + return -1 + } + if baseOf(big) <= baseOf(small) { + t.Errorf("more tools must mean more baseline tokens: %d <= %d", baseOf(big), baseOf(small)) + } +} + +func TestWriteReports_SmokeTest(t *testing.T) { + tk := newTestTokenizer(t) + corpus := &Corpus{Version: "test", Tools: []Tool{ + {ToolID: "a:1", Server: "a", Name: "tool_a", Description: "does something"}, + }} + rep := ComputeReport(tk, corpus) + + dir := t.TempDir() + jsonPath, htmlPath, err := rep.WriteReports(dir) + if err != nil { + t.Fatalf("WriteReports: %v", err) + } + + // JSON must parse back to a Report with the right corpus version. 
+ data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read json: %v", err) + } + var got Report + if err := json.Unmarshal(data, &got); err != nil { + t.Fatalf("unmarshal json: %v", err) + } + if got.CorpusVersion != "test" { + t.Errorf("corpus version = %q, want %q", got.CorpusVersion, "test") + } + + // HTML must be non-empty and contain the mode names. + html, err := os.ReadFile(htmlPath) + if err != nil { + t.Fatalf("read html: %v", err) + } + if len(html) < 100 { + t.Fatalf("dashboard.html too short (%d bytes)", len(html)) + } + for _, mode := range []string{ModeBaseline, ModeRetrieveTools, ModeCodeExecution} { + if !bytes.Contains(html, []byte(mode)) { + t.Errorf("dashboard.html missing mode %q", mode) + } + } +} + +func toolNames(ts []Tool) []string { + out := make([]string, len(ts)) + for i, t := range ts { + out[i] = t.Name + } + return out +} diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go new file mode 100644 index 00000000..95987195 --- /dev/null +++ b/internal/server/bench_export.go @@ -0,0 +1,57 @@ +package server + +import ( + mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" +) + +// BenchProxyToolDef is a static built-in proxy/management tool definition +// (name + description) exposed for the in-repo benchmark harness (bench/). +// +// The benchmark scores the per-mode context cost an agent pays for mcpproxy's +// own tools. That cost MUST reflect every tool the live routing-mode servers +// expose — including the shared management tool set (upstream_servers, +// quarantine_security, search_servers, list_registries) that both modes append +// via buildManagementTools — or the benchmark overstates the token savings +// (MCP-3161 / Codex finding on PR #747). 
+type BenchProxyToolDef struct { + Name string + Description string +} + +// ProxyModeToolDefs returns the static built-in proxy + management tool +// definitions an agent sees in its context window for the given routing mode +// (config.RoutingModeRetrieveTools or config.RoutingModeCodeExecution). +// +// It is built from the SAME builders the live server uses +// (buildCallToolModeTools / buildCodeExecModeTools in mcp_routing.go) so the +// benchmark catalog can never drift from production. Code execution is enabled +// so the real code_execution tool description (not the disabled stub) is scored +// — the code_execution routing mode only makes sense with the tool enabled. +func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef { + p := &MCPProxyServer{ + logger: zap.NewNop(), + config: &config.Config{ + EnableCodeExecution: true, + }, + } + + var serverTools []mcpserver.ServerTool + switch routingMode { + case config.RoutingModeCodeExecution: + serverTools = p.buildCodeExecModeTools() + default: // retrieve_tools — the default routing mode + serverTools = p.buildCallToolModeTools() + } + + defs := make([]BenchProxyToolDef, 0, len(serverTools)) + for _, st := range serverTools { + defs = append(defs, BenchProxyToolDef{ + Name: st.Tool.Name, + Description: st.Tool.Description, + }) + } + return defs +} diff --git a/internal/server/bench_export_test.go b/internal/server/bench_export_test.go new file mode 100644 index 00000000..5dd63132 --- /dev/null +++ b/internal/server/bench_export_test.go @@ -0,0 +1,64 @@ +package server + +import ( + "testing" + + mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" +) + +// TestProxyModeToolDefs_IncludesManagementTools guards the benchmark integrity +// fix (MCP-3161): every routing mode exposes the shared management tool set, so +// the benchmark catalog must include it or it undercounts the proxy-mode context +// cost and overstates the 
savings. +func TestProxyModeToolDefs_IncludesManagementTools(t *testing.T) { + mgmt := []string{"upstream_servers", "quarantine_security", "search_servers", "list_registries"} + for _, mode := range []string{config.RoutingModeRetrieveTools, config.RoutingModeCodeExecution} { + defs := ProxyModeToolDefs(mode) + if len(defs) == 0 { + t.Fatalf("mode %s: no proxy tool defs", mode) + } + names := map[string]bool{} + for _, d := range defs { + names[d.Name] = true + if d.Description == "" { + t.Errorf("mode %s: tool %q has empty description", mode, d.Name) + } + } + for _, m := range mgmt { + if !names[m] { + t.Errorf("mode %s: missing management tool %q", mode, m) + } + } + } +} + +// TestProxyModeToolDefs_MatchesBuilders pins ProxyModeToolDefs to the live tool +// builders. If a mode's tool set changes in mcp_routing.go, the benchmark +// catalog tracks it automatically and this test proves the coupling holds. +func TestProxyModeToolDefs_MatchesBuilders(t *testing.T) { + p := &MCPProxyServer{ + logger: zap.NewNop(), + config: &config.Config{EnableCodeExecution: true}, + } + cases := map[string][]mcpserver.ServerTool{ + config.RoutingModeRetrieveTools: p.buildCallToolModeTools(), + config.RoutingModeCodeExecution: p.buildCodeExecModeTools(), + } + for mode, builderTools := range cases { + defs := ProxyModeToolDefs(mode) + if len(defs) != len(builderTools) { + t.Fatalf("mode %s: ProxyModeToolDefs len %d != builder len %d", mode, len(defs), len(builderTools)) + } + for i := range builderTools { + if defs[i].Name != builderTools[i].Tool.Name { + t.Errorf("mode %s: def[%d] name %q != builder %q", mode, i, defs[i].Name, builderTools[i].Tool.Name) + } + if defs[i].Description != builderTools[i].Tool.Description { + t.Errorf("mode %s: def[%d] description mismatch for %q", mode, i, defs[i].Name) + } + } + } +}