smart-mcp-proxy · Dumbris · Jun 23, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/bench/README.md b/bench/README.md
@@ -94,19 +94,48 @@ corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`.
 - **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer
   for the headline model is tracked as a follow-up (see "Roadmap").
 
+## Live run — full schemas + accuracy + latency
+
+The live run boots mcpproxy over the Spec 065 reference-server config and
+measures the three headline claims against a *running* proxy. Everything here is
+still deterministic and LLM-free.
+
+```bash
+# 1. Boot the reproducible substrate (proxy + 7 no-auth reference servers)
+docker compose -f bench/docker-compose.yml up --build -d
+
+# 2. Score against the running proxy (writes bench/results/live_report.json)
+go run ./bench/cmd/bench -live -proxy http://127.0.0.1:8092 -api-key eval-corpus-snapshot
+```
+
+What it adds over the offline token run:
+
+- **Exact token number (full schemas).** Pulls `GET /api/v1/tools` for the
+  upstream tools *with their full JSON input schemas* and counts them against
+  the proxy modes — whose management-tool schemas come from the same live
+  builders as the offline run (`server.ProxyModeToolDefs`). Because schemas are
+  counted on **both** sides, the savings figure is authoritative.
+  - **Safety valve (MCP-3161):** if any proxy tool is missing a schema, counting
+    the baseline's schemas alone would *overstate* savings, so the run
+    **withholds the headline %** and reports raw token totals only
+    (`authoritative_headline: false`). Never quote a withheld run.
+- **Accuracy.** Replays `retrieval_golden_v1.json` through the proxy's BM25
+  search (`GET /api/v1/index/search`) and scores **Recall@{1,3,5,10}, MRR,
+  nDCG@10, MAP** against the graded labels. Deterministic (BM25), so a single
+  run is reported (`runs_averaged: 1`). The emitted `retrieval` block **conforms
+  to** the Spec 065 `score-report.schema.json` shape — nested `metrics` + `gate`
+  (verified by a schema-validation test). A standalone live run has no stored
+  baseline to regress against, so `gate.passed` is `true` by construction;
+  CI regression-gating against a committed baseline is the MCP-3133 lane.
+- **Latency.** Client-measured per-query search latency (p50/p95/p99/max) vs.
+  the one-shot cost of loading all tools. Measured client-side on purpose: the
+  server's `SearchToolsResponse.took` field is currently a `"0ms"` stub.
+
 ## What is scoped but not yet built (follow-ups)
 
 These require decisions and/or other roles, so they are tracked as child issues
 rather than landed here:
 
-- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the
-  Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull
-  `GET /api/v1/tools` for exact schemas, and:
-  - **Accuracy**: replay the Spec 065 retrieval golden set
-    (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k /
-    MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer.
-  - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the
-    fixed cost of loading all tools.
 - **End-to-end task success with a pinned LLM** — requires a pinned model + an
   LLM-call budget; this is the only part that costs spend.
 - **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane.
@@ -123,11 +152,13 @@ rather than landed here:
   `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the
   benchmark cannot drift from the tools the proxy actually serves.
 
-## Reproducible live run (skeleton)
+## Reproducible live run
 
 `docker-compose.yml` boots mcpproxy over the frozen reference-server config so
-the corpus and live tool list are reproducible across machines. Wiring the live
-accuracy/latency scorers into it is the follow-up above.
+the corpus and live tool list are reproducible across machines. The live
+accuracy/latency/full-schema scorers attach to it via `-live` (see "Live run"
+above). Pin the upstream-server images before publishing headline numbers
+(image drift can change the tool corpus).
 
 ## Reviewer contact
 

diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go
@@ -1,16 +1,22 @@
-// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool
-// corpus and writes a JSON report plus a static HTML dashboard.
+// Command bench runs the mcpproxy benchmark.
 //
-// Usage:
+// Default (offline) mode scores the committed Spec 065 frozen corpus for
+// token reduction and writes a JSON report plus a static HTML dashboard:
 //
 //	go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME]
 //
-// With no flags it scores the committed Spec 065 frozen corpus and writes the
-// reports to bench/results/ (gitignored — reports are never committed, per the
-// Spec 065 CN-003 repo rule).
+// Live mode runs against a running proxy (see bench/docker-compose.yml) to add
+// the exact-token comparison (full schemas), retrieval accuracy (Recall@k / MRR
+// / nDCG over the golden set), and search latency:
+//
+//	go run ./bench/cmd/bench -live [-proxy URL] [-api-key KEY] [-golden PATH]
+//
+// Reports land in bench/results/ (gitignored — reports are never committed, per
+// the Spec 065 CN-003 repo rule).
 package main
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"log"
@@ -21,21 +27,33 @@ import (
 
 func main() {
 	corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot")
-	outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html")
+	outDir := flag.String("out", "bench/results", "output directory for reports")
 	encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name")
+	live := flag.Bool("live", false, "run the live benchmark against a running proxy (full schemas + accuracy + latency)")
+	proxy := flag.String("proxy", "http://127.0.0.1:8092", "live proxy base URL")
+	apiKey := flag.String("api-key", "eval-corpus-snapshot", "live proxy API key (X-API-Key)")
+	goldenPath := flag.String("golden", "specs/065-evaluation-foundation/datasets/retrieval_golden_v1.json", "path to the retrieval golden set")
 	flag.Parse()
 
-	tk, err := bench.NewTokenizer(*encoding)
+	if *live { // -live takes over the run: -corpus and -encoding are not passed to runLive
+		runLive(*proxy, *apiKey, *goldenPath, *outDir)
+		return
+	}
+	runOffline(*corpusPath, *encoding, *outDir) // default path: offline run over the corpus file
+}
+
+func runOffline(corpusPath, encoding, outDir string) {
+	tk, err := bench.NewTokenizer(encoding)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
-	corpus, err := bench.LoadCorpus(*corpusPath)
+	corpus, err := bench.LoadCorpus(corpusPath)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
 
 	report := bench.ComputeReport(tk, corpus)
-	jsonPath, htmlPath, err := report.WriteReports(*outDir)
+	jsonPath, htmlPath, err := report.WriteReports(outDir)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
@@ -50,3 +68,45 @@ func main() {
 	}
 	fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath)
 }
+
+func runLive(proxy, apiKey, goldenPath, outDir string) { // live mode: load the golden set, run the live benchmark, write the JSON report, print a summary
+	golden, err := bench.LoadGoldenSet(goldenPath)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	client := bench.NewLiveClient(proxy, apiKey)
+	report, err := bench.RunLive(context.Background(), client, golden) // context has no deadline; the LiveClient's own HTTP timeout is the only bound set here
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	jsonPath, err := report.WriteJSON(outDir)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+
+	fmt.Fprintf(os.Stdout, "mcpproxy LIVE benchmark (proxy %s, %s)\n", report.Proxy, report.Encoding)
+	tr := report.Tokens
+	fmt.Fprintf(os.Stdout, "  tokens: %d upstream tools, baseline %d tokens (with full schemas)\n", tr.UpstreamTools, tr.BaselineTokens)
+	for _, m := range tr.Modes {
+		if m.Mode == bench.ModeBaseline { // baseline totals were already printed on the line above
+			continue
+		}
+		if tr.AuthoritativeHeadline { // print a savings % only when the report marks it authoritative
+			fmt.Fprintf(os.Stdout, "    %-16s %6d tokens  %.1f%% fewer\n", m.Mode, m.Tokens, m.SavingsRatio*100)
+		} else {
+			fmt.Fprintf(os.Stdout, "    %-16s %6d tokens  (savings withheld — see notes)\n", m.Mode, m.Tokens)
+		}
+	}
+	if !tr.AuthoritativeHeadline { // notes are printed only when the headline was withheld
+		for _, n := range tr.Notes {
+			fmt.Fprintf(os.Stdout, "  NOTE: %s\n", n)
+		}
+	}
+	r := report.Retrieval
+	fmt.Fprintf(os.Stdout, "  accuracy (%d queries): Recall@1=%.3f Recall@5=%.3f MRR=%.3f nDCG@10=%.3f MAP=%.3f\n",
+		r.QueryCount, r.Metrics.RecallAt[1], r.Metrics.RecallAt[5], r.Metrics.MRR, r.Metrics.NDCGAt10, r.Metrics.MAP)
+	l := report.Latency
+	fmt.Fprintf(os.Stdout, "  latency (%d searches): p50=%.1fms p95=%.1fms p99=%.1fms max=%.1fms; load-all-tools=%.1fms\n",
+		l.Samples, l.P50ms, l.P95ms, l.P99ms, l.MaxMs, l.LoadAllToolsMs)
+	fmt.Fprintf(os.Stdout, "wrote %s\n", jsonPath)
+}
diff --git a/bench/live.go b/bench/live.go
@@ -0,0 +1,168 @@
+package bench
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strconv"
+	"time"
+)
+
+// LiveClient talks to a running mcpproxy instance (e.g. the bench
+// docker-compose substrate on 127.0.0.1:8092) over its REST API. It is used by
+// the live benchmark run to pull the exact tool definitions (with schemas) and
+// to replay the retrieval golden set through the proxy's BM25 search.
+type LiveClient struct {
+	BaseURL string       // proxy root URL; request paths (which start with "/") are appended verbatim
+	APIKey  string       // sent as the X-API-Key header; the header is omitted when empty
+	HTTP    *http.Client // client used for every request; NewLiveClient installs one with a 30s timeout
+}
+
+// NewLiveClient returns a LiveClient for baseURL (e.g. "http://127.0.0.1:8092")
+// that sends apiKey as X-API-Key and limits each request to 30 seconds.
+func NewLiveClient(baseURL, apiKey string) *LiveClient {
+	return &LiveClient{
+		BaseURL: baseURL,
+		APIKey:  apiKey,
+		HTTP:    &http.Client{Timeout: 30 * time.Second},
+	}
+}
+
+// successEnvelope is the standard mcpproxy REST response wrapper
+// ({"success":true,"data":{...}}); Data stays raw for getJSON to decode per call.
+type successEnvelope struct {
+	Success bool            `json:"success"`
+	Data    json.RawMessage `json:"data"`
+	Error   string          `json:"error,omitempty"` // reported by getJSON when Success is false
+}
+
+// getJSON performs an authenticated GET on BaseURL+path and unmarshals the
+// envelope's data field into out; a non-200 status or success=false is an error.
+func (c *LiveClient) getJSON(ctx context.Context, path string, out interface{}) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.BaseURL+path, nil)
+	if err != nil {
+		return fmt.Errorf("build request %q: %w", path, err)
+	}
+	if c.APIKey != "" { // an empty key sends the request without an auth header
+		req.Header.Set("X-API-Key", c.APIKey)
+	}
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return fmt.Errorf("GET %q: %w", path, err)
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body) // read in full before the status check so a non-200 body can be echoed in the error
+	if err != nil {
+		return fmt.Errorf("read %q: %w", path, err)
+	}
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("GET %q: status %d: %s", path, resp.StatusCode, string(body))
+	}
+	var env successEnvelope
+	if err := json.Unmarshal(body, &env); err != nil {
+		return fmt.Errorf("decode envelope %q: %w", path, err)
+	}
+	if !env.Success {
+		return fmt.Errorf("GET %q: api error: %s", path, env.Error)
+	}
+	if err := json.Unmarshal(env.Data, out); err != nil { // NOTE(review): a response with no "data" field fails here, since empty input is not valid JSON
+		return fmt.Errorf("decode data %q: %w", path, err)
+	}
+	return nil
+}
+
+// apiTool mirrors contracts.Tool for the fields the benchmark needs. The schema
+// is kept raw so its exact serialized form is what gets tokenized.
+type apiTool struct {
+	Name        string          `json:"name"`
+	ServerName  string          `json:"server_name"`
+	Description string          `json:"description"`
+	Schema      json.RawMessage `json:"schema,omitempty"` // bytes exactly as received; stays nil when the field is absent
+}
+
+// FetchUpstreamTools pulls the consolidated tool list (GET /api/v1/tools) and
+// returns each tool keyed as "server:name" with its raw JSON input schema (nil
+// when the schema is absent, null, or "{}"), for schema-aware baseline counting.
+func (c *LiveClient) FetchUpstreamTools(ctx context.Context) ([]Tool, error) {
+	var resp struct {
+		Tools []apiTool `json:"tools"`
+	}
+	if err := c.getJSON(ctx, "/api/v1/tools", &resp); err != nil {
+		return nil, err
+	}
+	tools := make([]Tool, 0, len(resp.Tools))
+	for _, t := range resp.Tools {
+		tools = append(tools, Tool{
+			ToolID:      t.ServerName + ":" + t.Name, // same "server:name" form that Search returns
+			Server:      t.ServerName,
+			Name:        t.Name,
+			Description: t.Description,
+			Schema:      normalizeSchema(t.Schema),
+		})
+	}
+	return tools, nil
+}
+
+// normalizeSchema maps an absent schema, JSON null, or the exact bytes "{}" to nil
+// so a tool with no real parameters does not inflate token counts; else raw as-is.
+func normalizeSchema(raw json.RawMessage) json.RawMessage {
+	switch string(raw) {
+	case "", "null", "{}": // exact byte match: padded forms such as "{ }" are returned unchanged
+		return nil
+	default:
+		return raw
+	}
+}
+
+// Search replays one query through the proxy's BM25 tool search
+// (GET /api/v1/index/search) and returns the ranked tool IDs (server:tool,
+// best first) plus the client-measured latency (request, body read, decode).
+//
+// Latency is measured client-side on purpose: the server's SearchToolsResponse
+// "took" field is currently a hardcoded "0ms" stub (internal/httpapi
+// handleSearchTools), so it cannot be trusted as the proxy-side timing.
+func (c *LiveClient) Search(ctx context.Context, query string, limit int) (ranked []string, latency time.Duration, err error) {
+	q := url.Values{}
+	q.Set("q", query)
+	q.Set("limit", strconv.Itoa(limit))
+	path := "/api/v1/index/search?" + q.Encode()
+
+	var resp struct {
+		Results []struct {
+			Tool  apiTool `json:"tool"`
+			Score float64 `json:"score"`
+		} `json:"results"`
+	}
+	start := time.Now()
+	err = c.getJSON(ctx, path, &resp)
+	latency = time.Since(start) // spans the whole getJSON call, and is recorded before the error check
+	if err != nil {
+		return nil, latency, err // latency is still returned for a failed request
+	}
+	ranked = make([]string, 0, len(resp.Results))
+	for _, r := range resp.Results {
+		ranked = append(ranked, r.Tool.ServerName+":"+r.Tool.Name) // response order is kept as the ranking
+	}
+	return ranked, latency, nil
+}
+
+// LoadGoldenSet reads and parses the Spec 065 retrieval golden set
+// (retrieval_golden_v1.json); a file with no queries is rejected with an error.
+func LoadGoldenSet(path string) (*GoldenSet, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read golden set %q: %w", path, err)
+	}
+	var g GoldenSet
+	if err := json.Unmarshal(data, &g); err != nil {
+		return nil, fmt.Errorf("parse golden set %q: %w", path, err)
+	}
+	if len(g.Queries) == 0 { // an empty set would leave nothing to score
+		return nil, fmt.Errorf("golden set %q contains no queries", path)
+	}
+	return &g, nil
+}