From 4778afc1f8200c9dfe863bc717c1215b05965f47 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 21:08:18 +0300
Subject: [PATCH 1/3] =?UTF-8?q?feat(bench):=20live=20benchmark=20run=20?=
 =?UTF-8?q?=E2=80=94=20full=20schemas=20+=20Recall@k=20+=20latency=20(MCP-?=
 =?UTF-8?q?42a)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the bench/ harness (PR #747) with a live run against a running proxy:

- Exact token number: GET /api/v1/tools pulls upstream tools WITH full JSON
  input schemas; proxy-mode tools carry their live schemas via the extended
  server.ProxyModeToolDefs (BenchProxyToolDef.Schema). Schemas are counted on
  BOTH sides, so the headline savings is authoritative; it is withheld
  (authoritative_headline=false) if any proxy tool lacks a schema (the MCP-3161
  overstatement guard).
- Accuracy: replays the Spec 065 retrieval golden set through the proxy BM25
  search (GET /api/v1/index/search) and scores Recall@{1,3,5,10}/MRR/nDCG@10/MAP
  against graded labels (deterministic, no LLM). Field names mirror Spec 065
  score-report.schema.json.
- Latency: client-measured per-query search latency (p50/p95/p99/max) vs. the
  one-shot load-all-tools cost (server "took" is a 0ms stub).

CLI: `go run ./bench/cmd/bench -live -proxy URL -api-key KEY`. Reports stay
gitignored (CN-003). All metric math + the live client are unit-tested with
httptest stubs; the docker-compose substrate is the live-reproduction path.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 bench/README.md                 |  50 ++++++--
 bench/cmd/bench/main.go         |  80 ++++++++++--
 bench/live.go                   | 168 +++++++++++++++++++++++++
 bench/live_report.go            | 215 ++++++++++++++++++++++++++++++++
 bench/live_report_test.go       | 134 ++++++++++++++++++++
 bench/live_test.go              | 128 +++++++++++++++++++
 bench/metrics.go                | 212 +++++++++++++++++++++++++++++++
 bench/metrics_test.go           | 116 +++++++++++++++++
 bench/proxytools.go             |   1 +
 bench/tokens.go                 |  38 +++++-
 internal/server/bench_export.go |  23 +++-
 11 files changed, 1136 insertions(+), 29 deletions(-)
 create mode 100644 bench/live.go
 create mode 100644 bench/live_report.go
 create mode 100644 bench/live_report_test.go
 create mode 100644 bench/live_test.go
 create mode 100644 bench/metrics.go
 create mode 100644 bench/metrics_test.go

diff --git a/bench/README.md b/bench/README.md
index c99a3868..24188f42 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -94,19 +94,45 @@ corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`.
 - **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer
   for the headline model is tracked as a follow-up (see "Roadmap").
 
+## Live run — full schemas + accuracy + latency
+
+The live run measures the three headline claims against a *running* mcpproxy,
+booted separately over the Spec 065 reference-server config (step 1 below).
+Everything here is still deterministic and LLM-free.
+
+```bash
+# 1. Boot the reproducible substrate (proxy + 7 no-auth reference servers)
+docker compose -f bench/docker-compose.yml up --build -d
+
+# 2. Score against the running proxy (writes bench/results/live_report.json)
+go run ./bench/cmd/bench -live -proxy http://127.0.0.1:8092 -api-key eval-corpus-snapshot
+```
+
+What it adds over the offline token run:
+
+- **Exact token number (full schemas).** Pulls `GET /api/v1/tools` for the
+  upstream tools *with their full JSON input schemas* and counts them against
+  the proxy modes — whose management-tool schemas come from the same live
+  builders as the offline run (`server.ProxyModeToolDefs`). Because schemas are
+  counted on **both** sides, the savings is authoritative.
+  - **Safety valve (MCP-3161):** if any proxy tool is missing a schema, counting
+    the baseline's schemas alone would *overstate* savings, so the run
+    **withholds the headline %** and reports raw token totals only
+    (`authoritative_headline: false`). Never quote a withheld run.
+- **Accuracy.** Replays `retrieval_golden_v1.json` through the proxy's BM25
+  search (`GET /api/v1/index/search`) and scores **Recall@{1,3,5,10}, MRR,
+  nDCG@10, MAP** against the graded labels. Deterministic (BM25), so a single
+  run is reported (`runs_averaged: 1`). Metric field names mirror the Spec 065
+  `score-report.schema.json` `retrieval` block.
+- **Latency.** Client-measured per-query search latency (p50/p95/p99/max) vs.
+  the one-shot cost of loading all tools. Measured client-side on purpose: the
+  server's `SearchToolsResponse.took` field is currently a `"0ms"` stub.
+
 ## What is scoped but not yet built (follow-ups)
 
 These require decisions and/or other roles, so they are tracked as child issues
 rather than landed here:
 
-- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the
-  Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull
-  `GET /api/v1/tools` for exact schemas, and:
-  - **Accuracy**: replay the Spec 065 retrieval golden set
-    (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k /
-    MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer.
-  - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the
-    fixed cost of loading all tools.
 - **End-to-end task success with a pinned LLM** — requires a pinned model + an
   LLM-call budget; this is the only part that costs spend.
 - **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane.
@@ -123,11 +149,13 @@ rather than landed here:
   `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the
   benchmark cannot drift from the tools the proxy actually serves.
 
-## Reproducible live run (skeleton)
+## Reproducible live run
 
 `docker-compose.yml` boots mcpproxy over the frozen reference-server config so
-the corpus and live tool list are reproducible across machines. Wiring the live
-accuracy/latency scorers into it is the follow-up above.
+the corpus and live tool list are reproducible across machines. The live
+accuracy/latency/full-schema scorers attach to it via `-live` (see "Live run"
+above). Pin the upstream-server images before publishing headline numbers
+(image drift can change the tool corpus).
 
 ## Reviewer contact
 
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go
index a5e924b2..7ffb0122 100644
--- a/bench/cmd/bench/main.go
+++ b/bench/cmd/bench/main.go
@@ -1,16 +1,22 @@
-// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool
-// corpus and writes a JSON report plus a static HTML dashboard.
+// Command bench runs the mcpproxy benchmark.
 //
-// Usage:
+// Default (offline) mode scores the committed Spec 065 frozen corpus for
+// token reduction and writes a JSON report plus a static HTML dashboard:
 //
 //	go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME]
 //
-// With no flags it scores the committed Spec 065 frozen corpus and writes the
-// reports to bench/results/ (gitignored — reports are never committed, per the
-// Spec 065 CN-003 repo rule).
+// Live mode runs against a running proxy (see bench/docker-compose.yml) to add
+// the exact-token comparison (full schemas), retrieval accuracy (Recall@k / MRR
+// / nDCG over the golden set), and search latency:
+//
+//	go run ./bench/cmd/bench -live [-proxy URL] [-api-key KEY] [-golden PATH]
+//
+// Reports land in bench/results/ (gitignored — reports are never committed, per
+// the Spec 065 CN-003 repo rule).
 package main
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"log"
@@ -21,21 +27,33 @@ import (
 
 func main() {
 	corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot")
-	outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html")
+	outDir := flag.String("out", "bench/results", "output directory for reports")
 	encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name")
+	live := flag.Bool("live", false, "run the live benchmark against a running proxy (full schemas + accuracy + latency)")
+	proxy := flag.String("proxy", "http://127.0.0.1:8092", "live proxy base URL")
+	apiKey := flag.String("api-key", "eval-corpus-snapshot", "live proxy API key (X-API-Key)")
+	goldenPath := flag.String("golden", "specs/065-evaluation-foundation/datasets/retrieval_golden_v1.json", "path to the retrieval golden set")
 	flag.Parse()
 
-	tk, err := bench.NewTokenizer(*encoding)
+	if *live {
+		runLive(*proxy, *apiKey, *goldenPath, *outDir)
+		return
+	}
+	runOffline(*corpusPath, *encoding, *outDir)
+}
+
+func runOffline(corpusPath, encoding, outDir string) {
+	tk, err := bench.NewTokenizer(encoding)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
-	corpus, err := bench.LoadCorpus(*corpusPath)
+	corpus, err := bench.LoadCorpus(corpusPath)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
 
 	report := bench.ComputeReport(tk, corpus)
-	jsonPath, htmlPath, err := report.WriteReports(*outDir)
+	jsonPath, htmlPath, err := report.WriteReports(outDir)
 	if err != nil {
 		log.Fatalf("bench: %v", err)
 	}
@@ -50,3 +68,45 @@ func main() {
 	}
 	fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath)
 }
+
+func runLive(proxy, apiKey, goldenPath, outDir string) {
+	golden, err := bench.LoadGoldenSet(goldenPath)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	client := bench.NewLiveClient(proxy, apiKey)
+	report, err := bench.RunLive(context.Background(), client, golden)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+	jsonPath, err := report.WriteJSON(outDir)
+	if err != nil {
+		log.Fatalf("bench: %v", err)
+	}
+
+	fmt.Fprintf(os.Stdout, "mcpproxy LIVE benchmark (proxy %s, %s)\n", report.Proxy, report.Encoding)
+	tr := report.Tokens
+	fmt.Fprintf(os.Stdout, "  tokens: %d upstream tools, baseline %d tokens (with full schemas)\n", tr.UpstreamTools, tr.BaselineTokens)
+	for _, m := range tr.Modes {
+		if m.Mode == bench.ModeBaseline {
+			continue
+		}
+		if tr.AuthoritativeHeadline {
+			fmt.Fprintf(os.Stdout, "    %-16s %6d tokens  %.1f%% fewer\n", m.Mode, m.Tokens, m.SavingsRatio*100)
+		} else {
+			fmt.Fprintf(os.Stdout, "    %-16s %6d tokens  (savings withheld — see notes)\n", m.Mode, m.Tokens)
+		}
+	}
+	if !tr.AuthoritativeHeadline {
+		for _, n := range tr.Notes {
+			fmt.Fprintf(os.Stdout, "  NOTE: %s\n", n)
+		}
+	}
+	r := report.Retrieval
+	fmt.Fprintf(os.Stdout, "  accuracy (%d queries): Recall@1=%.3f Recall@5=%.3f MRR=%.3f nDCG@10=%.3f MAP=%.3f\n",
+		r.QueryCount, r.RecallAt[1], r.RecallAt[5], r.MRR, r.NDCGAt10, r.MAP)
+	l := report.Latency
+	fmt.Fprintf(os.Stdout, "  latency (%d searches): p50=%.1fms p95=%.1fms p99=%.1fms max=%.1fms; load-all-tools=%.1fms\n",
+		l.Samples, l.P50ms, l.P95ms, l.P99ms, l.MaxMs, l.LoadAllToolsMs)
+	fmt.Fprintf(os.Stdout, "wrote %s\n", jsonPath)
+}
diff --git a/bench/live.go b/bench/live.go
new file mode 100644
index 00000000..fd82faa7
--- /dev/null
+++ b/bench/live.go
@@ -0,0 +1,168 @@
+package bench
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strconv"
+	"time"
+)
+
+// LiveClient talks to a running mcpproxy instance (e.g. the bench
+// docker-compose substrate on 127.0.0.1:8092) over its REST API. It is used by
+// the live benchmark run to pull the exact tool definitions (with schemas) and
+// to replay the retrieval golden set through the proxy's BM25 search.
+type LiveClient struct {
+	BaseURL string
+	APIKey  string
+	HTTP    *http.Client
+}
+
+// NewLiveClient builds a LiveClient for baseURL (e.g. "http://127.0.0.1:8092")
+// authenticating with apiKey via the X-API-Key header.
+func NewLiveClient(baseURL, apiKey string) *LiveClient {
+	return &LiveClient{
+		BaseURL: baseURL,
+		APIKey:  apiKey,
+		HTTP:    &http.Client{Timeout: 30 * time.Second},
+	}
+}
+
+// successEnvelope is the standard mcpproxy REST response wrapper
+// ({"success":true,"data":{...}}). Data is decoded lazily by each caller.
+type successEnvelope struct {
+	Success bool            `json:"success"`
+	Data    json.RawMessage `json:"data"`
+	Error   string          `json:"error,omitempty"`
+}
+
+// getJSON performs an authenticated GET and unmarshals the envelope's data
+// field into out.
+func (c *LiveClient) getJSON(ctx context.Context, path string, out interface{}) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.BaseURL+path, nil)
+	if err != nil {
+		return fmt.Errorf("build request %q: %w", path, err)
+	}
+	if c.APIKey != "" {
+		req.Header.Set("X-API-Key", c.APIKey)
+	}
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		return fmt.Errorf("GET %q: %w", path, err)
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("read %q: %w", path, err)
+	}
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("GET %q: status %d: %s", path, resp.StatusCode, string(body))
+	}
+	var env successEnvelope
+	if err := json.Unmarshal(body, &env); err != nil {
+		return fmt.Errorf("decode envelope %q: %w", path, err)
+	}
+	if !env.Success {
+		return fmt.Errorf("GET %q: api error: %s", path, env.Error)
+	}
+	if err := json.Unmarshal(env.Data, out); err != nil {
+		return fmt.Errorf("decode data %q: %w", path, err)
+	}
+	return nil
+}
+
+// apiTool mirrors contracts.Tool for the fields the benchmark needs. The schema
+// is kept raw so its exact serialized form is what gets tokenized.
+type apiTool struct {
+	Name        string          `json:"name"`
+	ServerName  string          `json:"server_name"`
+	Description string          `json:"description"`
+	Schema      json.RawMessage `json:"schema,omitempty"`
+}
+
+// FetchUpstreamTools pulls the consolidated tool list (GET /api/v1/tools) and
+// returns every upstream tool with its full JSON input schema, ready to feed
+// into schema-aware token counting for the baseline.
+func (c *LiveClient) FetchUpstreamTools(ctx context.Context) ([]Tool, error) {
+	var resp struct {
+		Tools []apiTool `json:"tools"`
+	}
+	if err := c.getJSON(ctx, "/api/v1/tools", &resp); err != nil {
+		return nil, err
+	}
+	tools := make([]Tool, 0, len(resp.Tools))
+	for _, t := range resp.Tools {
+		tools = append(tools, Tool{
+			ToolID:      t.ServerName + ":" + t.Name,
+			Server:      t.ServerName,
+			Name:        t.Name,
+			Description: t.Description,
+			Schema:      normalizeSchema(t.Schema),
+		})
+	}
+	return tools, nil
+}
+
+// normalizeSchema treats an empty JSON object ("{}") or JSON null the same as an
+// absent schema so a tool with no real parameters does not inflate token counts.
+func normalizeSchema(raw json.RawMessage) json.RawMessage {
+	switch string(raw) {
+	case "", "null", "{}":
+		return nil
+	default:
+		return raw
+	}
+}
+
+// Search replays one query through the proxy's BM25 tool search
+// (GET /api/v1/index/search) and returns the ranked tool IDs (server:tool,
+// best first) plus the client-measured round-trip latency.
+//
+// Latency is measured client-side on purpose: the server's SearchToolsResponse
+// "took" field is currently a hardcoded "0ms" stub (internal/httpapi
+// handleSearchTools), so it cannot be trusted as the proxy-side timing.
+func (c *LiveClient) Search(ctx context.Context, query string, limit int) (ranked []string, latency time.Duration, err error) {
+	q := url.Values{}
+	q.Set("q", query)
+	q.Set("limit", strconv.Itoa(limit))
+	path := "/api/v1/index/search?" + q.Encode()
+
+	var resp struct {
+		Results []struct {
+			Tool  apiTool `json:"tool"`
+			Score float64 `json:"score"`
+		} `json:"results"`
+	}
+	start := time.Now()
+	err = c.getJSON(ctx, path, &resp)
+	latency = time.Since(start)
+	if err != nil {
+		return nil, latency, err
+	}
+	ranked = make([]string, 0, len(resp.Results))
+	for _, r := range resp.Results {
+		ranked = append(ranked, r.Tool.ServerName+":"+r.Tool.Name)
+	}
+	return ranked, latency, nil
+}
+
+// LoadGoldenSet reads the Spec 065 retrieval golden set
+// (retrieval_golden_v1.json) from disk.
+func LoadGoldenSet(path string) (*GoldenSet, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read golden set %q: %w", path, err)
+	}
+	var g GoldenSet
+	if err := json.Unmarshal(data, &g); err != nil {
+		return nil, fmt.Errorf("parse golden set %q: %w", path, err)
+	}
+	if len(g.Queries) == 0 {
+		return nil, fmt.Errorf("golden set %q contains no queries", path)
+	}
+	return &g, nil
+}
diff --git a/bench/live_report.go b/bench/live_report.go
new file mode 100644
index 00000000..3dc65df7
--- /dev/null
+++ b/bench/live_report.go
@@ -0,0 +1,215 @@
+package bench
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"sort"
+	"time"
+)
+
+// LiveModeResult is the per-mode context-token cost from the live run.
+type LiveModeResult struct {
+	Mode         string  `json:"mode"`
+	ContextTools int     `json:"context_tools"`
+	Tokens       int     `json:"tokens"`
+	SavingsRatio float64 `json:"savings_vs_baseline,omitempty"`
+}
+
+// LiveTokenReport is the exact-token comparison from a live proxy, with the
+// baseline upstream tools counted WITH their full JSON input schemas.
+//
+// AuthoritativeHeadline gates the savings percentage: it is only true when the
+// proxy management tools were ALSO counted with their schemas. Counting schemas
+// on the baseline but not the proxy side overstates savings — the exact error
+// corrected in MCP-3161 — so when proxy schemas are absent the savings ratio is
+// withheld and only raw token totals are reported.
+type LiveTokenReport struct {
+	Encoding              string           `json:"encoding"`
+	UpstreamTools         int              `json:"upstream_tools"`
+	BaselineTokens        int              `json:"baseline_tokens"`
+	Modes                 []LiveModeResult `json:"modes"`
+	ProxySchemasCounted   bool             `json:"proxy_schemas_counted"`
+	AuthoritativeHeadline bool             `json:"authoritative_headline"`
+	Notes                 []string         `json:"notes"`
+}
+
+// LatencyReport summarizes the proxy's tool-search latency versus the fixed
+// one-shot cost of loading every tool. Times are client-measured round trips
+// (milliseconds); the server's SearchToolsResponse "took" field is a "0ms" stub.
+type LatencyReport struct {
+	Samples        int     `json:"samples"`
+	P50ms          float64 `json:"p50_ms"`
+	P95ms          float64 `json:"p95_ms"`
+	P99ms          float64 `json:"p99_ms"`
+	MaxMs          float64 `json:"max_ms"`
+	LoadAllToolsMs float64 `json:"load_all_tools_ms"`
+}
+
+// LiveReport is the full live benchmark result: exact-token comparison,
+// retrieval accuracy, and search latency, all gathered from one running proxy.
+type LiveReport struct {
+	Proxy     string            `json:"proxy"`
+	Encoding  string            `json:"encoding"`
+	Tokens    *LiveTokenReport  `json:"tokens"`
+	Retrieval *RetrievalMetrics `json:"retrieval"`
+	Latency   *LatencyReport    `json:"latency"`
+}
+
+// recallCutoffs are the standard Recall@k cutoffs reported (matches Spec 065
+// score-report.schema.json recall_at keys).
+var recallCutoffs = []int{1, 3, 5, 10}
+
+// WriteJSON writes the live report as indented JSON into dir/live_report.json
+// (the dir is gitignored — reports are never committed, per Spec 065 CN-003).
+func (r *LiveReport) WriteJSON(dir string) (string, error) {
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return "", fmt.Errorf("mkdir %q: %w", dir, err)
+	}
+	path := filepath.Join(dir, "live_report.json")
+	data, err := json.MarshalIndent(r, "", "  ")
+	if err != nil {
+		return "", fmt.Errorf("marshal live report: %w", err)
+	}
+	if err := os.WriteFile(path, append(data, '\n'), 0o644); err != nil {
+		return "", fmt.Errorf("write %q: %w", path, err)
+	}
+	return path, nil
+}
+
+// RunLive gathers the full live benchmark from a running proxy: it pulls the
+// exact tool definitions (with schemas) for the token comparison, replays the
+// golden set through the proxy's BM25 search for accuracy, and records the
+// per-query search latency.
+func RunLive(ctx context.Context, client *LiveClient, golden *GoldenSet) (*LiveReport, error) {
+	tk, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		return nil, err
+	}
+
+	// 1. Exact-token: fetch upstream tools with schemas (also times "load all").
+	loadStart := time.Now()
+	upstream, err := client.FetchUpstreamTools(ctx)
+	loadAll := time.Since(loadStart)
+	if err != nil {
+		return nil, fmt.Errorf("fetch upstream tools: %w", err)
+	}
+	tokenRep := buildTokenReport(tk, upstream,
+		ProxyToolsForMode(ModeRetrieveTools), ProxyToolsForMode(ModeCodeExecution))
+
+	// 2. Accuracy + 3. Latency: replay the golden set, capturing search latency.
+	var latencies []time.Duration
+	searchFn := func(query string, limit int) ([]string, error) {
+		ranked, lat, serr := client.Search(ctx, query, limit)
+		latencies = append(latencies, lat)
+		return ranked, serr
+	}
+	metrics, err := ScoreRetrieval(golden, searchFn, recallCutoffs)
+	if err != nil {
+		return nil, fmt.Errorf("score retrieval: %w", err)
+	}
+
+	return &LiveReport{
+		Proxy:     client.BaseURL,
+		Encoding:  tk.encoding,
+		Tokens:    tokenRep,
+		Retrieval: metrics,
+		Latency:   computeLatency(latencies, loadAll),
+	}, nil
+}
+
+// buildTokenReport counts the baseline upstream tools WITH schemas against each
+// proxy routing mode (rt = retrieve_tools, ce = code_execution), also counted
+// with schemas. The headline savings is only emitted when EVERY proxy tool
+// carries a schema; otherwise counting schemas on the baseline alone would
+// overstate savings (MCP-3161), so the ratio is withheld and only raw token
+// totals are reported.
+func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport {
+	baseTokens := tk.countToolsWithSchema(upstream)
+
+	proxySchemasCounted := allHaveSchema(rt) && allHaveSchema(ce)
+
+	rep := &LiveTokenReport{
+		Encoding:            tk.encoding,
+		UpstreamTools:       len(upstream),
+		BaselineTokens:      baseTokens,
+		ProxySchemasCounted: proxySchemasCounted,
+		Modes: []LiveModeResult{
+			{Mode: ModeBaseline, ContextTools: len(upstream), Tokens: baseTokens},
+			{Mode: ModeRetrieveTools, ContextTools: len(rt), Tokens: tk.countToolsWithSchema(rt)},
+			{Mode: ModeCodeExecution, ContextTools: len(ce), Tokens: tk.countToolsWithSchema(ce)},
+		},
+	}
+	rep.AuthoritativeHeadline = proxySchemasCounted
+	if proxySchemasCounted {
+		for i := range rep.Modes {
+			m := &rep.Modes[i]
+			if m.Mode != ModeBaseline && baseTokens > 0 {
+				m.SavingsRatio = 1.0 - float64(m.Tokens)/float64(baseTokens)
+			}
+		}
+		rep.Notes = []string{
+			"Baseline counts upstream tools with full JSON input schemas from GET /api/v1/tools; proxy modes count the management tools with their schemas. Headline savings is authoritative.",
+		}
+	} else {
+		rep.Notes = []string{
+			"HEADLINE SAVINGS WITHHELD: the baseline upstream tools are counted with full schemas, but at least one proxy routing mode has a management tool without an input schema (or has no tools at all). Reporting savings now would count schemas on one side only and OVERSTATE the reduction — the exact error corrected in MCP-3161. Token totals are shown for transparency; the authoritative headline requires every proxy management tool to carry its live schema.",
+		}
+	}
+	return rep
+}
+
+func allHaveSchema(tools []Tool) bool {
+	if len(tools) == 0 {
+		return false
+	}
+	for _, t := range tools {
+		if len(t.Schema) == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// computeLatency summarizes search-call latencies with nearest-rank
+// percentiles, plus the fixed one-shot cost of loading all tools.
+func computeLatency(samples []time.Duration, loadAll time.Duration) *LatencyReport {
+	rep := &LatencyReport{
+		Samples:        len(samples),
+		LoadAllToolsMs: ms(loadAll),
+	}
+	if len(samples) == 0 {
+		return rep
+	}
+	sorted := make([]time.Duration, len(samples))
+	copy(sorted, samples)
+	sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
+	rep.P50ms = ms(percentile(sorted, 50))
+	rep.P95ms = ms(percentile(sorted, 95))
+	rep.P99ms = ms(percentile(sorted, 99))
+	rep.MaxMs = ms(sorted[len(sorted)-1])
+	return rep
+}
+
+// percentile returns the nearest-rank percentile p (0-100) of a sorted slice.
+func percentile(sorted []time.Duration, p float64) time.Duration {
+	if len(sorted) == 0 {
+		return 0
+	}
+	rank := int(math.Ceil(p / 100.0 * float64(len(sorted))))
+	if rank < 1 {
+		rank = 1
+	}
+	if rank > len(sorted) {
+		rank = len(sorted)
+	}
+	return sorted[rank-1]
+}
+
+// ms converts a duration to milliseconds as a float.
+func ms(d time.Duration) float64 {
+	return float64(d.Microseconds()) / 1000.0
+}
diff --git a/bench/live_report_test.go b/bench/live_report_test.go
new file mode 100644
index 00000000..92886737
--- /dev/null
+++ b/bench/live_report_test.go
@@ -0,0 +1,134 @@
+package bench
+
+import (
+	"context"
+	"encoding/json"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// goldenPath locates the committed Spec 065 golden set relative to the bench/
+// package dir (the working directory when `go test` runs this package).
+func goldenPath() string {
+	return filepath.Join("..", "specs", "065-evaluation-foundation", "datasets", "retrieval_golden_v1.json")
+}
+
+func TestLoadGoldenSetReal(t *testing.T) {
+	g, err := LoadGoldenSet(goldenPath())
+	if err != nil {
+		t.Fatalf("LoadGoldenSet: %v", err)
+	}
+	if g.CorpusVersion == "" {
+		t.Error("corpus_version empty")
+	}
+	if len(g.Queries) < 10 {
+		t.Errorf("expected a substantial golden set, got %d queries", len(g.Queries))
+	}
+	for _, q := range g.Queries {
+		if q.ID == "" || q.Query == "" {
+			t.Errorf("query missing id/text: %+v", q)
+		}
+		if relevantCount(q.Labels) == 0 {
+			t.Errorf("query %q has no relevant labels", q.ID)
+		}
+	}
+}
+
+func TestPercentiles(t *testing.T) {
+	ds := []time.Duration{
+		10 * time.Millisecond, 20 * time.Millisecond, 30 * time.Millisecond,
+		40 * time.Millisecond, 50 * time.Millisecond, 60 * time.Millisecond,
+		70 * time.Millisecond, 80 * time.Millisecond, 90 * time.Millisecond,
+		100 * time.Millisecond,
+	}
+	lat := computeLatency(ds, 5*time.Millisecond)
+	if lat.Samples != 10 {
+		t.Errorf("Samples = %d, want 10", lat.Samples)
+	}
+	// nearest-rank: p50 -> ceil(0.5*10)=5th value (50ms); p95 -> 10th (100ms)
+	if lat.P50ms != 50 {
+		t.Errorf("P50ms = %v, want 50", lat.P50ms)
+	}
+	if lat.P95ms != 100 {
+		t.Errorf("P95ms = %v, want 100", lat.P95ms)
+	}
+	if lat.MaxMs != 100 {
+		t.Errorf("MaxMs = %v, want 100", lat.MaxMs)
+	}
+	if lat.LoadAllToolsMs != 5 {
+		t.Errorf("LoadAllToolsMs = %v, want 5", lat.LoadAllToolsMs)
+	}
+}
+
+func TestRunLiveAuthoritativeHeadline(t *testing.T) {
+	srv := stubProxy(t)
+	defer srv.Close()
+
+	c := NewLiveClient(srv.URL, "test-key")
+	golden := &GoldenSet{
+		CorpusVersion: "corpus_v1",
+		Queries: []GoldenQuery{
+			{ID: "q1", Query: "read a file", Labels: []Label{{ToolID: "filesystem:read_text_file", Relevance: 2}}},
+		},
+	}
+	rep, err := RunLive(context.Background(), c, golden)
+	if err != nil {
+		t.Fatalf("RunLive: %v", err)
+	}
+	// Token report: baseline counted with schemas AND proxy tools carry their
+	// live schemas (from server.ProxyModeToolDefs), so the headline is
+	// authoritative — schemas on BOTH sides, no MCP-3161 overstatement.
+	if rep.Tokens == nil || rep.Tokens.UpstreamTools != 2 {
+		t.Fatalf("expected 2 upstream tools, got %+v", rep.Tokens)
+	}
+	if !rep.Tokens.ProxySchemasCounted {
+		t.Error("proxy tools should carry schemas from the live builders")
+	}
+	if !rep.Tokens.AuthoritativeHeadline {
+		t.Error("headline should be authoritative when both sides count schemas")
+	}
+	if rep.Tokens.BaselineTokens <= 0 {
+		t.Error("baseline tokens should be counted with schemas")
+	}
+	// A savings ratio must be present for the proxy modes.
+	for _, m := range rep.Tokens.Modes {
+		if m.Mode != ModeBaseline && m.SavingsRatio == 0 {
+			t.Errorf("expected a savings ratio for mode %q", m.Mode)
+		}
+	}
+	// Accuracy: perfect ranking for the one query.
+	if rep.Retrieval == nil || rep.Retrieval.RecallAt[1] != 1.0 {
+		t.Errorf("expected Recall@1=1.0, got %+v", rep.Retrieval)
+	}
+	// Latency populated.
+	if rep.Latency == nil || rep.Latency.Samples != 1 {
+		t.Errorf("expected 1 latency sample, got %+v", rep.Latency)
+	}
+}
+
+// TestBuildTokenReportWithholdsWhenProxySchemasMissing guards the MCP-3161
+// safety valve: if any proxy tool lacks a schema, counting the baseline's
+// schemas alone would overstate savings, so the headline is withheld.
+func TestBuildTokenReportWithholdsWhenProxySchemasMissing(t *testing.T) {
+	tk, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		t.Fatalf("tokenizer: %v", err)
+	}
+	upstream := []Tool{{Name: "big", Description: "d", Schema: json.RawMessage(`{"type":"object","properties":{"x":{"type":"string"}}}`)}}
+	rtSchemaless := []Tool{{Name: "retrieve_tools", Description: "d"}} // no schema
+	ce := []Tool{{Name: "code_execution", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}}
+
+	rep := buildTokenReport(tk, upstream, rtSchemaless, ce)
+	if rep.AuthoritativeHeadline {
+		t.Error("headline must be withheld when a proxy tool lacks a schema")
+	}
+	for _, m := range rep.Modes {
+		if m.SavingsRatio != 0 {
+			t.Errorf("savings ratio must be withheld (0), got %v for %q", m.SavingsRatio, m.Mode)
+		}
+	}
+	if rep.BaselineTokens <= 0 {
+		t.Error("baseline tokens should still be reported for transparency")
+	}
+}
diff --git a/bench/live_test.go b/bench/live_test.go
new file mode 100644
index 00000000..437370bc
--- /dev/null
+++ b/bench/live_test.go
@@ -0,0 +1,128 @@
+package bench
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+// stubProxy returns an httptest server that mimics the two mcpproxy REST
+// endpoints the live benchmark uses, wrapping payloads in the standard
+// {success, data} envelope.
+func stubProxy(t *testing.T) *httptest.Server {
+	t.Helper()
+	mux := http.NewServeMux()
+	mux.HandleFunc("/api/v1/tools", func(w http.ResponseWriter, _ *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"success": true,
+			"data": map[string]any{
+				"tools": []map[string]any{
+					{
+						"name":        "read_text_file",
+						"server_name": "filesystem",
+						"description": "Read a file as text",
+						"schema": map[string]any{
+							"type":       "object",
+							"properties": map[string]any{"path": map[string]any{"type": "string"}},
+							"required":   []string{"path"},
+						},
+					},
+					{
+						"name":        "echo",
+						"server_name": "memory",
+						"description": "Echo input",
+					},
+				},
+			},
+		})
+	})
+	mux.HandleFunc("/api/v1/index/search", func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Query().Get("q") == "" {
+			http.Error(w, "missing q", http.StatusBadRequest)
+			return
+		}
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"success": true,
+			"data": map[string]any{
+				"query": r.URL.Query().Get("q"),
+				"results": []map[string]any{
+					{"tool": map[string]any{"name": "read_text_file", "server_name": "filesystem"}, "score": 0.9},
+					{"tool": map[string]any{"name": "echo", "server_name": "memory"}, "score": 0.1},
+				},
+				"total": 2,
+				"took":  "0ms",
+			},
+		})
+	})
+	return httptest.NewServer(mux)
+}
+
+func TestLiveClientFetchUpstreamTools(t *testing.T) {
+	srv := stubProxy(t)
+	defer srv.Close()
+
+	c := NewLiveClient(srv.URL, "test-key")
+	tools, err := c.FetchUpstreamTools(context.Background())
+	if err != nil {
+		t.Fatalf("FetchUpstreamTools: %v", err)
+	}
+	if len(tools) != 2 {
+		t.Fatalf("got %d tools, want 2", len(tools))
+	}
+	if tools[0].ToolID != "filesystem:read_text_file" {
+		t.Errorf("ToolID = %q, want filesystem:read_text_file", tools[0].ToolID)
+	}
+	if len(tools[0].Schema) == 0 {
+		t.Errorf("expected schema captured for tool with input schema, got none")
+	}
+	if len(tools[1].Schema) != 0 {
+		t.Errorf("expected no schema for schemaless tool, got %s", tools[1].Schema)
+	}
+}
+
+func TestLiveClientSearch(t *testing.T) {
+	srv := stubProxy(t)
+	defer srv.Close()
+
+	c := NewLiveClient(srv.URL, "test-key")
+	ranked, latency, err := c.Search(context.Background(), "read a file", 10)
+	if err != nil {
+		t.Fatalf("Search: %v", err)
+	}
+	want := []string{"filesystem:read_text_file", "memory:echo"}
+	if len(ranked) != len(want) {
+		t.Fatalf("ranked = %v, want %v", ranked, want)
+	}
+	for i := range want {
+		if ranked[i] != want[i] {
+			t.Errorf("ranked[%d] = %q, want %q", i, ranked[i], want[i])
+		}
+	}
+	if latency < 0 {
+		t.Errorf("latency should be non-negative, got %v", latency)
+	}
+}
+
+func TestSchemaAwareTokenCountExceedsDescOnly(t *testing.T) {
+	tk, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		t.Fatalf("tokenizer: %v", err)
+	}
+	withSchema := Tool{
+		Name:        "read_text_file",
+		Description: "Read a file as text",
+		Schema:      json.RawMessage(`{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}`),
+	}
+	descOnly := Tool{Name: withSchema.Name, Description: withSchema.Description}
+	if tk.CountToolWithSchema(withSchema) <= tk.CountTool(descOnly) {
+		t.Errorf("schema-aware count (%d) must exceed desc-only count (%d)",
+			tk.CountToolWithSchema(withSchema), tk.CountTool(descOnly))
+	}
+	// A schemaless tool must count identically under both methods.
+	if tk.CountToolWithSchema(descOnly) != tk.CountTool(descOnly) {
+		t.Errorf("schemaless tool should count identically: %d vs %d",
+			tk.CountToolWithSchema(descOnly), tk.CountTool(descOnly))
+	}
+}
diff --git a/bench/metrics.go b/bench/metrics.go
new file mode 100644
index 00000000..ddfc1f3b
--- /dev/null
+++ b/bench/metrics.go
@@ -0,0 +1,212 @@
+package bench
+
+import (
+	"fmt"
+	"math"
+)
+
+// Label is a graded relevance judgement for one tool against one query, taken
+// from the Spec 065 retrieval golden set (relevance 2 = primary, 1 = related,
+// 0 / absent = irrelevant).
+type Label struct {
+	ToolID    string `json:"tool_id"`
+	Relevance int    `json:"relevance"`
+}
+
+// GoldenQuery is one labelled query -> relevant-tool(s) judgement.
+type GoldenQuery struct {
+	ID     string  `json:"id"`
+	Query  string  `json:"query"`
+	Labels []Label `json:"labels"`
+}
+
+// GoldenSet is the frozen Spec 065 retrieval golden set
+// (retrieval_golden_v1.json).
+type GoldenSet struct {
+	CorpusVersion string        `json:"corpus_version"`
+	Queries       []GoldenQuery `json:"queries"`
+}
+
+// relevanceOf returns the Relevance of the first label whose ToolID equals
+// toolID, or 0 when no label matches (an unlabelled tool counts as irrelevant).
+func relevanceOf(toolID string, labels []Label) int {
+	for _, l := range labels {
+		if l.ToolID == toolID {
+			return l.Relevance
+		}
+	}
+	return 0
+}
+
+// relevantCount is the number of labels with relevance >= 1 (not deduplicated).
+func relevantCount(labels []Label) int {
+	n := 0
+	for _, l := range labels {
+		if l.Relevance >= 1 {
+			n++
+		}
+	}
+	return n
+}
+
+// RecallAtK is the fraction of the query's relevant labels (relevance >= 1) that
+// appear among the first k entries of ranked. Returns 0 when no label is relevant;
+// a relevant ID repeated within the top-k is counted once per occurrence.
+func RecallAtK(ranked []string, labels []Label, k int) float64 {
+	total := relevantCount(labels)
+	if total == 0 {
+		return 0
+	}
+	hits := 0
+	for i, id := range ranked {
+		if i >= k {
+			break
+		}
+		if relevanceOf(id, labels) >= 1 {
+			hits++
+		}
+	}
+	return float64(hits) / float64(total)
+}
+
+// ReciprocalRank is 1/rank (rank is 1-based) of the first entry in ranked whose
+// relevance is >= 1, or 0 if none of the ranked tools are relevant.
+func ReciprocalRank(ranked []string, labels []Label) float64 {
+	for i, id := range ranked {
+		if relevanceOf(id, labels) >= 1 {
+			return 1.0 / float64(i+1)
+		}
+	}
+	return 0
+}
+
+// NDCGAtK is the normalized discounted cumulative gain over the first k entries of
+// ranked: each hit adds relevance/log2(position+1), divided by idealDCG(labels, k).
+// 1.0 means an ideal ordering; 0 means no gain in top-k or a zero ideal DCG.
+func NDCGAtK(ranked []string, labels []Label, k int) float64 {
+	dcg := 0.0
+	for i, id := range ranked {
+		if i >= k {
+			break
+		}
+		rel := relevanceOf(id, labels)
+		if rel == 0 {
+			continue
+		}
+		dcg += float64(rel) / math.Log2(float64(i+2)) // position i (0-based) -> log2(i+2)
+	}
+	idcg := idealDCG(labels, k)
+	if idcg == 0 {
+		return 0
+	}
+	return dcg / idcg
+}
+
+// idealDCG is the DCG of the best possible ordering: the relevances >= 1 from
+// labels sorted descending, summed over at most the first k positions.
+func idealDCG(labels []Label, k int) float64 {
+	rels := make([]int, 0, len(labels))
+	for _, l := range labels {
+		if l.Relevance >= 1 {
+			rels = append(rels, l.Relevance)
+		}
+	}
+	// descending sort (small slice; insertion sort keeps deps minimal)
+	for i := 1; i < len(rels); i++ {
+		for j := i; j > 0 && rels[j] > rels[j-1]; j-- {
+			rels[j], rels[j-1] = rels[j-1], rels[j]
+		}
+	}
+	idcg := 0.0
+	for i, rel := range rels {
+		if i >= k {
+			break
+		}
+		idcg += float64(rel) / math.Log2(float64(i+2))
+	}
+	return idcg
+}
+
+// AveragePrecision is the sum of the precision values taken at each rank where a
+// relevant tool is retrieved, divided by the total number of relevant tools (so
+// unretrieved relevant tools lower the score). Relevance is binary (>= 1), as in
+// the standard MAP definition; returns 0 when no label is relevant.
+func AveragePrecision(ranked []string, labels []Label) float64 {
+	total := relevantCount(labels)
+	if total == 0 {
+		return 0
+	}
+	hits := 0
+	sumPrec := 0.0
+	for i, id := range ranked {
+		if relevanceOf(id, labels) >= 1 {
+			hits++
+			sumPrec += float64(hits) / float64(i+1)
+		}
+	}
+	return sumPrec / float64(total)
+}
+
+// SearchFunc replays one query through the retrieval system under test and
+// returns the ranked tool IDs (most relevant first), limited to `limit`.
+type SearchFunc func(query string, limit int) (ranked []string, err error)
+
+// RetrievalMetrics is the aggregated retrieval-quality report over a golden
+// set. Field names mirror the Spec 065 score-report.schema.json `retrieval`
+// block so the report can be emitted to that contract.
+type RetrievalMetrics struct {
+	CorpusVersion string          `json:"corpus_version"`
+	GoldenVersion string          `json:"golden_version,omitempty"`
+	RunsAveraged  int             `json:"runs_averaged"`
+	QueryCount    int             `json:"query_count"`
+	RecallAt      map[int]float64 `json:"recall_at"`
+	MRR           float64         `json:"mrr"`
+	NDCGAt10      float64         `json:"ndcg_at_10"`
+	MAP           float64         `json:"map"`
+}
+
+// ScoreRetrieval replays every golden query through search and aggregates
+// Recall@k (for each k in ks), MRR, nDCG@10 and MAP as the mean over all
+// queries. The search is deterministic (BM25), so a single run is averaged.
+func ScoreRetrieval(golden *GoldenSet, search SearchFunc, ks []int) (*RetrievalMetrics, error) {
+	if golden == nil || len(golden.Queries) == 0 {
+		return nil, fmt.Errorf("golden set is empty")
+	}
+	// The largest k we must retrieve to score every requested cutoff and nDCG@10.
+	maxK := 10
+	for _, k := range ks {
+		if k > maxK {
+			maxK = k
+		}
+	}
+
+	recallSum := make(map[int]float64, len(ks))
+	var mrrSum, ndcgSum, mapSum float64
+	for _, q := range golden.Queries {
+		ranked, err := search(q.Query, maxK)
+		if err != nil {
+			return nil, fmt.Errorf("search %q: %w", q.ID, err)
+		}
+		for _, k := range ks {
+			recallSum[k] += RecallAtK(ranked, q.Labels, k)
+		}
+		mrrSum += ReciprocalRank(ranked, q.Labels)
+		ndcgSum += NDCGAtK(ranked, q.Labels, 10)
+		mapSum += AveragePrecision(ranked, q.Labels)
+	}
+
+	n := float64(len(golden.Queries))
+	recallAt := make(map[int]float64, len(ks))
+	for _, k := range ks {
+		recallAt[k] = recallSum[k] / n
+	}
+	return &RetrievalMetrics{
+		CorpusVersion: golden.CorpusVersion,
+		RunsAveraged:  1,
+		QueryCount:    len(golden.Queries),
+		RecallAt:      recallAt,
+		MRR:           mrrSum / n,
+		NDCGAt10:      ndcgSum / n,
+		MAP:           mapSum / n,
+	}, nil
+}
diff --git a/bench/metrics_test.go b/bench/metrics_test.go
new file mode 100644
index 00000000..48ef34e9
--- /dev/null
+++ b/bench/metrics_test.go
@@ -0,0 +1,116 @@
+package bench
+
+import (
+	"math"
+	"testing"
+)
+
+// almostEqual compares floats within a small tolerance (metric math involves
+// log2 divisions, so exact equality is brittle).
+func almostEqual(a, b float64) bool {
+	return math.Abs(a-b) < 1e-6
+}
+
+// worked example reused across the metric tests:
+//
+//	relevant labels: A(rel 2), B(rel 1), C(rel 1)  -> 3 relevant tools
+//	ranking returned: [A, X, B, Y]                 -> X, Y are irrelevant
+var (
+	exLabels = []Label{
+		{ToolID: "A", Relevance: 2},
+		{ToolID: "B", Relevance: 1},
+		{ToolID: "C", Relevance: 1},
+	}
+	exRanked = []string{"A", "X", "B", "Y"}
+)
+
+func TestRecallAtK(t *testing.T) {
+	cases := []struct {
+		k    int
+		want float64
+	}{
+		{1, 1.0 / 3.0}, // top-1 {A}: 1 of 3 relevant
+		{3, 2.0 / 3.0}, // top-3 {A,X,B}: 2 of 3 relevant
+		{5, 2.0 / 3.0}, // only 4 results; {A,B} retrieved: 2 of 3
+	}
+	for _, c := range cases {
+		got := RecallAtK(exRanked, exLabels, c.k)
+		if !almostEqual(got, c.want) {
+			t.Errorf("RecallAtK(k=%d) = %v, want %v", c.k, got, c.want)
+		}
+	}
+}
+
+func TestReciprocalRank(t *testing.T) {
+	// First relevant (A) is at rank 1 -> RR = 1.0
+	if got := ReciprocalRank(exRanked, exLabels); !almostEqual(got, 1.0) {
+		t.Errorf("ReciprocalRank = %v, want 1.0", got)
+	}
+	// First relevant (B) at rank 2 -> RR = 0.5
+	if got := ReciprocalRank([]string{"Z", "B", "A"}, exLabels); !almostEqual(got, 0.5) {
+		t.Errorf("ReciprocalRank(B@2) = %v, want 0.5", got)
+	}
+	// No relevant retrieved -> RR = 0
+	if got := ReciprocalRank([]string{"Z", "Y"}, exLabels); !almostEqual(got, 0.0) {
+		t.Errorf("ReciprocalRank(none) = %v, want 0", got)
+	}
+}
+
+func TestNDCGAtK(t *testing.T) {
+	// DCG  = 2/log2(2) + 0 + 1/log2(4)          = 2 + 0.5      = 2.5
+	// IDCG = 2/log2(2) + 1/log2(3) + 1/log2(4)  = 2 + 0.63093 + 0.5 = 3.13093
+	// nDCG = 2.5 / 3.13093 = 0.798486
+	want := 2.5 / (2.0 + 1.0/math.Log2(3) + 0.5)
+	if got := NDCGAtK(exRanked, exLabels, 10); !almostEqual(got, want) {
+		t.Errorf("NDCGAtK(10) = %v, want %v", got, want)
+	}
+	// Perfect ranking -> nDCG = 1.0
+	if got := NDCGAtK([]string{"A", "B", "C"}, exLabels, 10); !almostEqual(got, 1.0) {
+		t.Errorf("NDCGAtK(perfect) = %v, want 1.0", got)
+	}
+}
+
+func TestAveragePrecision(t *testing.T) {
+	// A@1 -> precision 1/1 = 1.0 ; B@3 -> precision 2/3 ; C not retrieved -> 0
+	// AP = (1.0 + 0.6667 + 0) / 3 = 0.555556
+	want := (1.0 + 2.0/3.0) / 3.0
+	if got := AveragePrecision(exRanked, exLabels); !almostEqual(got, want) {
+		t.Errorf("AveragePrecision = %v, want %v", got, want)
+	}
+}
+
+func TestScoreRetrieval(t *testing.T) {
+	golden := &GoldenSet{
+		CorpusVersion: "corpus_v1",
+		Queries: []GoldenQuery{
+			{ID: "q1", Query: "find A", Labels: exLabels},
+			{ID: "q2", Query: "find D", Labels: []Label{{ToolID: "D", Relevance: 2}}},
+		},
+	}
+	// Deterministic fake search: q1 -> exRanked, q2 -> perfect [D]
+	search := func(query string, _ int) ([]string, error) {
+		if query == "find A" {
+			return exRanked, nil
+		}
+		return []string{"D"}, nil
+	}
+	m, err := ScoreRetrieval(golden, search, []int{1, 3, 5, 10})
+	if err != nil {
+		t.Fatalf("ScoreRetrieval error: %v", err)
+	}
+	if m.RunsAveraged != 1 {
+		t.Errorf("RunsAveraged = %d, want 1", m.RunsAveraged)
+	}
+	// Recall@1: q1=1/3, q2=1 -> mean = (0.3333+1)/2 = 0.66667
+	wantR1 := (1.0/3.0 + 1.0) / 2.0
+	if !almostEqual(m.RecallAt[1], wantR1) {
+		t.Errorf("mean Recall@1 = %v, want %v", m.RecallAt[1], wantR1)
+	}
+	// MRR: q1=1.0, q2=1.0 -> 1.0
+	if !almostEqual(m.MRR, 1.0) {
+		t.Errorf("MRR = %v, want 1.0", m.MRR)
+	}
+	if m.QueryCount != 2 {
+		t.Errorf("QueryCount = %d, want 2", m.QueryCount)
+	}
+}
diff --git a/bench/proxytools.go b/bench/proxytools.go
index dda5edd4..555c9f3e 100644
--- a/bench/proxytools.go
+++ b/bench/proxytools.go
@@ -34,6 +34,7 @@ func ProxyToolsForMode(mode string) []Tool {
 			ToolID:      "mcpproxy:" + d.Name,
 			Name:        d.Name,
 			Description: d.Description,
+			Schema:      d.Schema,
 		})
 	}
 	return out
diff --git a/bench/tokens.go b/bench/tokens.go
index e61b3ed4..1fa74bad 100644
--- a/bench/tokens.go
+++ b/bench/tokens.go
@@ -46,12 +46,15 @@ const (
 
 // Tool is a single tool definition the benchmark scores token cost over. It
 // matches the shape of both the Spec 065 corpus snapshot and the embedded
-// proxy-tool fixture.
+// proxy-tool fixture. Schema is optional: the committed corpus snapshot is
+// description-only (nil schema), while the live run (live.go) populates it with
+// each tool's full JSON input schema for the exact-token headline.
 type Tool struct {
-	ToolID      string `json:"tool_id"`
-	Server      string `json:"server"`
-	Name        string `json:"tool"`
-	Description string `json:"description"`
+	ToolID      string          `json:"tool_id"`
+	Server      string          `json:"server"`
+	Name        string          `json:"tool"`
+	Description string          `json:"description"`
+	Schema      json.RawMessage `json:"schema,omitempty"`
 }
 
 // Corpus is a frozen, versioned set of tool definitions.
@@ -111,6 +114,22 @@ func (t *Tokenizer) CountTool(tl Tool) int {
 	return t.Count(tl.Name + "\n" + tl.Description)
 }
 
+// CountToolWithSchema returns the context-token cost of a tool definition
+// INCLUDING its JSON input schema (name + description + schema). This is the
+// authoritative per-tool context cost an agent actually pays. A tool with no
+// schema counts identically to CountTool, so mixing schema-bearing (live) and
+// schemaless tools in one report is well-defined. Used by the live run, where
+// both the baseline upstream tools AND the proxy management tools carry their
+// real schemas — counting schemas on BOTH sides is what keeps the headline
+// savings honest rather than overstated.
+func (t *Tokenizer) CountToolWithSchema(tl Tool) int {
+	s := tl.Name + "\n" + tl.Description
+	if len(tl.Schema) > 0 {
+		s += "\n" + string(tl.Schema)
+	}
+	return t.Count(s)
+}
+
 func (t *Tokenizer) countTools(tools []Tool) int {
 	total := 0
 	for _, tl := range tools {
@@ -119,6 +138,15 @@ func (t *Tokenizer) countTools(tools []Tool) int {
 	return total
 }
 
+// countToolsWithSchema sums CountToolWithSchema over tools.
+func (t *Tokenizer) countToolsWithSchema(tools []Tool) int {
+	total := 0
+	for _, tl := range tools {
+		total += t.CountToolWithSchema(tl)
+	}
+	return total
+}
+
 // ModeResult is the per-mode context-cost outcome.
 type ModeResult struct {
 	Mode         string  `json:"mode"`
diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go
index 95987195..7020bb4d 100644
--- a/internal/server/bench_export.go
+++ b/internal/server/bench_export.go
@@ -1,6 +1,8 @@
 package server
 
 import (
+	"encoding/json"
+
 	mcpserver "github.com/mark3labs/mcp-go/server"
 	"go.uber.org/zap"
 
@@ -8,7 +10,8 @@ import (
 )
 
 // BenchProxyToolDef is a static built-in proxy/management tool definition
-// (name + description) exposed for the in-repo benchmark harness (bench/).
+// (name + description + JSON input schema) exposed for the in-repo benchmark
+// harness (bench/).
 //
 // The benchmark scores the per-mode context cost an agent pays for mcpproxy's
 // own tools. That cost MUST reflect every tool the live routing-mode servers
@@ -16,9 +19,16 @@ import (
 // quarantine_security, search_servers, list_registries) that both modes append
 // via buildManagementTools — or the benchmark overstates the token savings
 // (MCP-3161 / Codex finding on PR #747).
+//
+// Schema is the exact JSON input schema the proxy advertises via tools/list,
+// captured from the live builder. The benchmark's exact-token headline counts
+// schemas on BOTH the baseline upstream tools and these proxy tools; omitting
+// the proxy schemas while counting the baseline's would overstate savings (the
+// MCP-3161 error), so Schema makes the headline honest without drift.
 type BenchProxyToolDef struct {
 	Name        string
 	Description string
+	Schema      json.RawMessage
 }
 
 // ProxyModeToolDefs returns the static built-in proxy + management tool
@@ -48,10 +58,17 @@ func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef {
 
 	defs := make([]BenchProxyToolDef, 0, len(serverTools))
 	for _, st := range serverTools {
-		defs = append(defs, BenchProxyToolDef{
+		def := BenchProxyToolDef{
 			Name:        st.Tool.Name,
 			Description: st.Tool.Description,
-		})
+		}
+		// InputSchema marshals to the exact {"type":"object","properties":...}
+		// an agent receives via tools/list. A marshal failure leaves Schema nil
+		// (the benchmark then withholds the headline rather than undercount).
+		if raw, err := json.Marshal(st.Tool.InputSchema); err == nil {
+			def.Schema = raw
+		}
+		defs = append(defs, def)
 	}
 	return defs
 }

From d48c2b7bc2866d1a5724aee7abefe51a8f024fa4 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 21:18:37 +0300
Subject: [PATCH 2/3] fix(bench): preserve upstream schemas through
 /api/v1/tools baseline

ConvertGenericToolsToTyped read generic["schema"], but every producer
of the generic tool map (runtime/server GetServerTools, mcp.go) emits the
upstream input schema under "inputSchema". The /api/v1/tools response
therefore dropped every schema, so the MCP-42a live benchmark baseline was
silently a description-only token count instead of the required full-schema
count, while the report could still emit authoritative_headline=true.

- Read "inputSchema" first in the converter, keep "schema" as a legacy fallback.
- Gate the live headline on baseline schemas too (BaselineSchemasCounted via
  anyHaveSchema): a systematically schema-less baseline now withholds the
  headline instead of claiming a full-schema baseline it never had.
- Tests: converter preserves inputSchema (+legacy schema fallback); headline
  withheld when the baseline carries no schemas.

Related #748
---
 bench/live_report.go                  | 74 +++++++++++++++++++--------
 bench/live_report_test.go             | 31 +++++++++++
 internal/contracts/converters.go      | 10 +++-
 internal/contracts/converters_test.go | 44 ++++++++++++++++
 4 files changed, 135 insertions(+), 24 deletions(-)

diff --git a/bench/live_report.go b/bench/live_report.go
index 3dc65df7..be8a4131 100644
--- a/bench/live_report.go
+++ b/bench/live_report.go
@@ -22,19 +22,23 @@ type LiveModeResult struct {
 // LiveTokenReport is the exact-token comparison from a live proxy, with the
 // baseline upstream tools counted WITH their full JSON input schemas.
 //
-// AuthoritativeHeadline gates the savings percentage: it is only true when the
-// proxy management tools were ALSO counted with their schemas. Counting schemas
-// on the baseline but not the proxy side overstates savings — the exact error
-// corrected in MCP-3161 — so when proxy schemas are absent the savings ratio is
-// withheld and only raw token totals are reported.
+// AuthoritativeHeadline gates the savings percentage: it is only true when
+// schemas were counted on BOTH sides — the proxy management tools carry schemas
+// (ProxySchemasCounted) AND the baseline upstream tools carry schemas
+// (BaselineSchemasCounted). Counting schemas on one side only overstates or
+// distorts savings — the exact error corrected in MCP-3161 — so when either side
+// is schema-less the savings ratio is withheld and only raw token totals are
+// reported. BaselineSchemasCounted also guards against a /api/v1/tools response
+// that silently dropped upstream schemas (MCP-3167).
 type LiveTokenReport struct {
-	Encoding              string           `json:"encoding"`
-	UpstreamTools         int              `json:"upstream_tools"`
-	BaselineTokens        int              `json:"baseline_tokens"`
-	Modes                 []LiveModeResult `json:"modes"`
-	ProxySchemasCounted   bool             `json:"proxy_schemas_counted"`
-	AuthoritativeHeadline bool             `json:"authoritative_headline"`
-	Notes                 []string         `json:"notes"`
+	Encoding               string           `json:"encoding"`
+	UpstreamTools          int              `json:"upstream_tools"`
+	BaselineTokens         int              `json:"baseline_tokens"`
+	Modes                  []LiveModeResult `json:"modes"`
+	ProxySchemasCounted    bool             `json:"proxy_schemas_counted"`
+	BaselineSchemasCounted bool             `json:"baseline_schemas_counted"`
+	AuthoritativeHeadline  bool             `json:"authoritative_headline"`
+	Notes                  []string         `json:"notes"`
 }
 
 // LatencyReport summarizes proxy-side retrieve_tools search latency versus the
@@ -123,28 +127,38 @@ func RunLive(ctx context.Context, client *LiveClient, golden *GoldenSet) (*LiveR
 
 // buildTokenReport counts the baseline upstream tools WITH schemas against each
 // proxy routing mode (rt = retrieve_tools, ce = code_execution), also counted
-// with schemas. The headline savings is only emitted when EVERY proxy tool
-// carries a schema; otherwise counting schemas on the baseline alone would
-// overstate savings (MCP-3161), so the ratio is withheld and only raw token
-// totals are reported.
+// with schemas. The headline savings is only emitted when schemas were counted
+// on BOTH sides: every proxy tool carries a schema AND the baseline upstream
+// tools actually carry schemas. Counting schemas on only one side overstates (or
+// distorts) savings — the exact error corrected in MCP-3161 — so otherwise the
+// ratio is withheld and only raw token totals are reported. The baseline guard
+// also catches a silently schema-less /api/v1/tools response (MCP-3167): if the
+// management endpoint drops upstream schemas, no upstream tool has one and the
+// headline is withheld rather than claiming a full-schema baseline it never had.
 func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport {
 	baseTokens := tk.countToolsWithSchema(upstream)
 
 	proxySchemasCounted := allHaveSchema(rt) && allHaveSchema(ce)
+	// A correct full-schema baseline has schemas on at least some upstream tools.
+	// Requiring ALL would wrongly fail on legitimately parameter-less tools, so
+	// "any" is the signal that schemas were not systematically dropped.
+	baselineSchemasCounted := anyHaveSchema(upstream)
+	authoritative := proxySchemasCounted && baselineSchemasCounted
 
 	rep := &LiveTokenReport{
-		Encoding:            tk.encoding,
-		UpstreamTools:       len(upstream),
-		BaselineTokens:      baseTokens,
-		ProxySchemasCounted: proxySchemasCounted,
+		Encoding:               tk.encoding,
+		UpstreamTools:          len(upstream),
+		BaselineTokens:         baseTokens,
+		ProxySchemasCounted:    proxySchemasCounted,
+		BaselineSchemasCounted: baselineSchemasCounted,
 		Modes: []LiveModeResult{
 			{Mode: ModeBaseline, ContextTools: len(upstream), Tokens: baseTokens},
 			{Mode: ModeRetrieveTools, ContextTools: len(rt), Tokens: tk.countToolsWithSchema(rt)},
 			{Mode: ModeCodeExecution, ContextTools: len(ce), Tokens: tk.countToolsWithSchema(ce)},
 		},
 	}
-	rep.AuthoritativeHeadline = proxySchemasCounted
-	if proxySchemasCounted {
+	rep.AuthoritativeHeadline = authoritative
+	if authoritative {
 		for i := range rep.Modes {
 			m := &rep.Modes[i]
 			if m.Mode != ModeBaseline && baseTokens > 0 {
@@ -154,6 +168,10 @@ func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport {
 		rep.Notes = []string{
 			"Baseline counts upstream tools with full JSON input schemas from GET /api/v1/tools; proxy modes count the management tools with their schemas. Headline savings is authoritative.",
 		}
+	} else if !baselineSchemasCounted {
+		rep.Notes = []string{
+			"HEADLINE SAVINGS WITHHELD: no upstream baseline tool carried a JSON input schema, so the baseline is NOT the required full-schema token count — typically the /api/v1/tools response dropped upstream schemas (MCP-3167). Reporting savings now would compare a schema-less baseline against schema-counted proxy tools and DISTORT the reduction. Token totals are shown for transparency; the authoritative headline lands once the management endpoint emits upstream schemas.",
+		}
 	} else {
 		rep.Notes = []string{
 			"HEADLINE SAVINGS WITHHELD: the baseline upstream tools are counted with full schemas, but the proxy management tools (proxy_tools_v1.json) are description-only. Reporting savings now would count schemas on one side only and OVERSTATE the reduction — the exact error corrected in MCP-3161. Token totals are shown for transparency; the authoritative headline lands once proxy-tool schemas are captured live via MCP tools/list.",
@@ -162,6 +180,18 @@ func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport {
 	return rep
 }
 
+// anyHaveSchema reports whether at least one tool carries a non-empty schema
+// (false for an empty slice). It distinguishes a systematically schema-less
+// baseline (every schema dropped) from one that merely has some schemaless tools.
+func anyHaveSchema(tools []Tool) bool {
+	for _, t := range tools {
+		if len(t.Schema) > 0 {
+			return true
+		}
+	}
+	return false
+}
+
 func allHaveSchema(tools []Tool) bool {
 	if len(tools) == 0 {
 		return false
diff --git a/bench/live_report_test.go b/bench/live_report_test.go
index 92886737..9ad1bb4e 100644
--- a/bench/live_report_test.go
+++ b/bench/live_report_test.go
@@ -132,3 +132,34 @@ func TestBuildTokenReportWithholdsWhenProxySchemasMissing(t *testing.T) {
 		t.Error("baseline tokens should still be reported for transparency")
 	}
 }
+
+// TestBuildTokenReportWithholdsWhenBaselineSchemasMissing guards the other half
+// of the MCP-3161 valve: if the baseline upstream tools were counted WITHOUT
+// schemas (e.g. the /api/v1/tools converter dropped them), the headline must be
+// withheld even when the proxy management tools do carry schemas — otherwise the
+// report falsely claims a full-schema baseline (MCP-3132/MCP-3167).
+func TestBuildTokenReportWithholdsWhenBaselineSchemasMissing(t *testing.T) {
+	tk, err := NewTokenizer(DefaultEncoding)
+	if err != nil {
+		t.Fatalf("tokenizer: %v", err)
+	}
+	upstreamSchemaless := []Tool{{Name: "big", Description: "d"}} // schema dropped
+	rt := []Tool{{Name: "retrieve_tools", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}}
+	ce := []Tool{{Name: "code_execution", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}}
+
+	rep := buildTokenReport(tk, upstreamSchemaless, rt, ce)
+	if rep.AuthoritativeHeadline {
+		t.Error("headline must be withheld when the baseline upstream tools carry no schemas")
+	}
+	if rep.BaselineSchemasCounted {
+		t.Error("BaselineSchemasCounted must be false when no upstream tool has a schema")
+	}
+	for _, m := range rep.Modes {
+		if m.SavingsRatio != 0 {
+			t.Errorf("savings ratio must be withheld (0), got %v for %q", m.SavingsRatio, m.Mode)
+		}
+	}
+	if rep.BaselineTokens <= 0 {
+		t.Error("baseline tokens should still be reported for transparency")
+	}
+}
diff --git a/internal/contracts/converters.go b/internal/contracts/converters.go
index 9ca45805..a99ab6e3 100644
--- a/internal/contracts/converters.go
+++ b/internal/contracts/converters.go
@@ -374,8 +374,14 @@ func ConvertGenericToolsToTyped(genericTools []map[string]interface{}) []Tool {
 			tool.Usage = usage
 		}
 
-		// Extract schema
-		if schema, ok := generic["schema"].(map[string]interface{}); ok {
+		// Extract schema. Every generic-map producer (runtime.GetServerTools,
+		// server.GetServerTools, mcp.go) emits the upstream input schema under the
+		// "inputSchema" key, so that is the authoritative source; "schema" is kept
+		// as a legacy fallback. Reading only "schema" silently dropped every schema
+		// from the /api/v1/tools response (MCP-3132/MCP-3167).
+		if schema, ok := generic["inputSchema"].(map[string]interface{}); ok {
+			tool.Schema = schema
+		} else if schema, ok := generic["schema"].(map[string]interface{}); ok {
 			tool.Schema = schema
 		}
 
diff --git a/internal/contracts/converters_test.go b/internal/contracts/converters_test.go
index 58afc995..41db0c7e 100644
--- a/internal/contracts/converters_test.go
+++ b/internal/contracts/converters_test.go
@@ -256,3 +256,47 @@ func TestConvertGenericServersToTyped_NoOAuth(t *testing.T) {
 	require.Len(t, servers, 1)
 	assert.Nil(t, servers[0].OAuth, "Servers without OAuth config should have nil OAuth field")
 }
+
+// TestConvertGenericToolsToTyped_PreservesInputSchema asserts the management
+// tool-list conversion keeps the upstream input schema. Every producer of the
+// generic tool map emits the schema under the "inputSchema" key (e.g.
+// internal/runtime/runtime.go:2141 GetServerTools, internal/server/server.go:2367),
+// so a converter that only reads "schema" silently drops every schema on the
+// /api/v1/tools response. Regression guard for MCP-3132/MCP-3167: without real
+// schemas the live benchmark baseline is no longer a full-schema token count.
+func TestConvertGenericToolsToTyped_PreservesInputSchema(t *testing.T) {
+	inputSchema := map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"path": map[string]interface{}{"type": "string"},
+		},
+	}
+	generic := []map[string]interface{}{
+		{
+			"name":        "read_file",
+			"server_name": "fs",
+			"description": "Read a file",
+			"inputSchema": inputSchema, // key the runtime/server map builders actually emit
+		},
+	}
+
+	typed := ConvertGenericToolsToTyped(generic)
+
+	require.Len(t, typed, 1)
+	assert.Equal(t, inputSchema, typed[0].Schema, "input schema must survive conversion to the /api/v1/tools response")
+}
+
+// TestConvertGenericToolsToTyped_SchemaLegacyFallback keeps the historical
+// "schema" key working so any in-process caller that still emits it is not
+// regressed by the inputSchema fix.
+func TestConvertGenericToolsToTyped_SchemaLegacyFallback(t *testing.T) {
+	schema := map[string]interface{}{"type": "object"}
+	generic := []map[string]interface{}{
+		{"name": "t", "server_name": "s", "description": "d", "schema": schema},
+	}
+
+	typed := ConvertGenericToolsToTyped(generic)
+
+	require.Len(t, typed, 1)
+	assert.Equal(t, schema, typed[0].Schema, "legacy schema key must still be honored")
+}

From 06027863ed5fd5e074714729497b80076a9abe9e Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 22 Jun 2026 21:46:17 +0300
Subject: [PATCH 3/3] fix(bench): conform live retrieval report to Spec 065
 score-report schema

Addresses CodexReviewer finding on PR #748 / MCP-3167: the live `retrieval`
payload emitted flat metric fields, but score-report.schema.json requires
nested `retrieval.metrics` + `retrieval.gate`. Restructure RetrievalMetrics into
{metrics, gate} so live_report.json validates against the contract, proven by a
new jsonschema-validation test (TestRetrievalMetricsConformsToScoreReportSchema).

A standalone live run has no stored baseline, so gate.passed is true by
construction (CI regression-gating against a committed baseline is MCP-3133).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 bench/README.md              |  7 +++--
 bench/cmd/bench/main.go      |  2 +-
 bench/live_report_test.go    |  2 +-
 bench/metrics.go             | 55 ++++++++++++++++++++++++---------
 bench/metrics_schema_test.go | 60 ++++++++++++++++++++++++++++++++++++
 bench/metrics_test.go        | 11 ++++---
 6 files changed, 114 insertions(+), 23 deletions(-)
 create mode 100644 bench/metrics_schema_test.go

diff --git a/bench/README.md b/bench/README.md
index 24188f42..73323f51 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -122,8 +122,11 @@ What it adds over the offline token run:
 - **Accuracy.** Replays `retrieval_golden_v1.json` through the proxy's BM25
   search (`GET /api/v1/index/search`) and scores **Recall@{1,3,5,10}, MRR,
   nDCG@10, MAP** against the graded labels. Deterministic (BM25), so a single
-  run is reported (`runs_averaged: 1`). Metric field names mirror the Spec 065
-  `score-report.schema.json` `retrieval` block.
+  run is reported (`runs_averaged: 1`). The emitted `retrieval` block **conforms
+  to** the Spec 065 `score-report.schema.json` shape — nested `metrics` + `gate`
+  (verified by a schema-validation test). A standalone live run has no stored
+  baseline to regress against, so `gate.passed` is `true` by construction;
+  CI regression-gating against a committed baseline is the MCP-3133 lane.
 - **Latency.** Client-measured per-query search latency (p50/p95/p99/max) vs.
   the one-shot cost of loading all tools. Measured client-side on purpose: the
   server's `SearchToolsResponse.took` field is currently a `"0ms"` stub.
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go
index 7ffb0122..4bfeebbb 100644
--- a/bench/cmd/bench/main.go
+++ b/bench/cmd/bench/main.go
@@ -104,7 +104,7 @@ func runLive(proxy, apiKey, goldenPath, outDir string) {
 	}
 	r := report.Retrieval
 	fmt.Fprintf(os.Stdout, "  accuracy (%d queries): Recall@1=%.3f Recall@5=%.3f MRR=%.3f nDCG@10=%.3f MAP=%.3f\n",
-		r.QueryCount, r.RecallAt[1], r.RecallAt[5], r.MRR, r.NDCGAt10, r.MAP)
+		r.QueryCount, r.Metrics.RecallAt[1], r.Metrics.RecallAt[5], r.Metrics.MRR, r.Metrics.NDCGAt10, r.Metrics.MAP)
 	l := report.Latency
 	fmt.Fprintf(os.Stdout, "  latency (%d searches): p50=%.1fms p95=%.1fms p99=%.1fms max=%.1fms; load-all-tools=%.1fms\n",
 		l.Samples, l.P50ms, l.P95ms, l.P99ms, l.MaxMs, l.LoadAllToolsMs)
diff --git a/bench/live_report_test.go b/bench/live_report_test.go
index 9ad1bb4e..618d5fa4 100644
--- a/bench/live_report_test.go
+++ b/bench/live_report_test.go
@@ -98,7 +98,7 @@ func TestRunLiveAuthoritativeHeadline(t *testing.T) {
 		}
 	}
 	// Accuracy: perfect ranking for the one query.
-	if rep.Retrieval == nil || rep.Retrieval.RecallAt[1] != 1.0 {
+	if rep.Retrieval == nil || rep.Retrieval.Metrics.RecallAt[1] != 1.0 {
 		t.Errorf("expected Recall@1=1.0, got %+v", rep.Retrieval)
 	}
 	// Latency populated.
diff --git a/bench/metrics.go b/bench/metrics.go
index ddfc1f3b..ba70e74b 100644
--- a/bench/metrics.go
+++ b/bench/metrics.go
@@ -151,18 +151,38 @@ func AveragePrecision(ranked []string, labels []Label) float64 {
 // returns the ranked tool IDs (most relevant first), limited to `limit`.
 type SearchFunc func(query string, limit int) (ranked []string, err error)
 
-// RetrievalMetrics is the aggregated retrieval-quality report over a golden
-// set. Field names mirror the Spec 065 score-report.schema.json `retrieval`
-// block so the report can be emitted to that contract.
+// RetrievalMetricValues holds the aggregated metric numbers. It is the
+// `retrieval.metrics` object of the Spec 065 score-report.schema.json contract.
+type RetrievalMetricValues struct {
+	RecallAt map[int]float64 `json:"recall_at"`
+	MRR      float64         `json:"mrr"`
+	NDCGAt10 float64         `json:"ndcg_at_10"`
+	MAP      float64         `json:"map"`
+}
+
+// RetrievalGate is the `retrieval.gate` object of the score-report contract.
+//
+// A standalone live run has no stored baseline to regress against, so its gate
+// cannot fail: Passed is true, and Metric/Tolerance are left zero-valued, so
+// omitempty drops them from the JSON. Gating against a committed baseline is the
+// CI lane's job (MCP-3133) — that run fills Metric/Tolerance and can set Passed=false.
+type RetrievalGate struct {
+	Passed    bool    `json:"passed"`
+	Metric    string  `json:"metric,omitempty"`
+	Tolerance float64 `json:"tolerance,omitempty"`
+}
+
+// RetrievalMetrics is the aggregated retrieval-quality report over a golden set.
+// Its JSON shape IS the Spec 065 score-report.schema.json `retrieval` block
+// (nested `metrics` + `gate`), so a live report's retrieval payload validates
+// against that contract directly.
 type RetrievalMetrics struct {
-	CorpusVersion string          `json:"corpus_version"`
-	GoldenVersion string          `json:"golden_version,omitempty"`
-	RunsAveraged  int             `json:"runs_averaged"`
-	QueryCount    int             `json:"query_count"`
-	RecallAt      map[int]float64 `json:"recall_at"`
-	MRR           float64         `json:"mrr"`
-	NDCGAt10      float64         `json:"ndcg_at_10"`
-	MAP           float64         `json:"map"`
+	CorpusVersion string                `json:"corpus_version"`
+	GoldenVersion string                `json:"golden_version,omitempty"`
+	RunsAveraged  int                   `json:"runs_averaged"`
+	QueryCount    int                   `json:"query_count,omitempty"`
+	Metrics       RetrievalMetricValues `json:"metrics"`
+	Gate          RetrievalGate         `json:"gate"`
 }
 
 // ScoreRetrieval replays every golden query through search and aggregates
@@ -204,9 +224,14 @@ func ScoreRetrieval(golden *GoldenSet, search SearchFunc, ks []int) (*RetrievalM
 		CorpusVersion: golden.CorpusVersion,
 		RunsAveraged:  1,
 		QueryCount:    len(golden.Queries),
-		RecallAt:      recallAt,
-		MRR:           mrrSum / n,
-		NDCGAt10:      ndcgSum / n,
-		MAP:           mapSum / n,
+		Metrics: RetrievalMetricValues{
+			RecallAt: recallAt,
+			MRR:      mrrSum / n,
+			NDCGAt10: ndcgSum / n,
+			MAP:      mapSum / n,
+		},
+		// No baseline is compared in a standalone live run, so the regression gate
+		// cannot fail (see RetrievalGate). CI fills this in against a baseline.
+		Gate: RetrievalGate{Passed: true},
 	}, nil
 }
diff --git a/bench/metrics_schema_test.go b/bench/metrics_schema_test.go
new file mode 100644
index 00000000..a8a5eb60
--- /dev/null
+++ b/bench/metrics_schema_test.go
@@ -0,0 +1,60 @@
+package bench
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/santhosh-tekuri/jsonschema/v6"
+)
+
+// TestRetrievalMetricsConformsToScoreReportSchema proves the live retrieval
+// payload validates against the Spec 065 score-report contract — i.e. the
+// `retrieval` object carries the required nested `metrics` and `gate`
+// sub-objects, not flat fields (CodexReviewer finding on PR #748 / MCP-3167).
+func TestRetrievalMetricsConformsToScoreReportSchema(t *testing.T) {
+	golden := &GoldenSet{
+		CorpusVersion: "corpus_v1",
+		Queries: []GoldenQuery{
+			{ID: "q1", Query: "x", Labels: []Label{{ToolID: "A", Relevance: 2}}},
+		},
+	}
+	search := func(_ string, _ int) ([]string, error) { return []string{"A"}, nil }
+	m, err := ScoreRetrieval(golden, search, []int{1, 3, 5, 10})
+	if err != nil {
+		t.Fatalf("ScoreRetrieval: %v", err)
+	}
+
+	// A score report may hold the retrieval block alone (security is optional).
+	raw, err := json.Marshal(map[string]any{"retrieval": m})
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	inst, err := jsonschema.UnmarshalJSON(strings.NewReader(string(raw)))
+	if err != nil {
+		t.Fatalf("parse instance: %v", err)
+	}
+
+	schemaFile := filepath.Join("..", "specs", "065-evaluation-foundation", "contracts", "score-report.schema.json")
+	schemaRaw, err := os.ReadFile(schemaFile)
+	if err != nil {
+		t.Fatalf("read schema: %v", err)
+	}
+	schemaDoc, err := jsonschema.UnmarshalJSON(strings.NewReader(string(schemaRaw)))
+	if err != nil {
+		t.Fatalf("parse schema: %v", err)
+	}
+	c := jsonschema.NewCompiler()
+	if err := c.AddResource("score-report.schema.json", schemaDoc); err != nil {
+		t.Fatalf("add schema: %v", err)
+	}
+	sch, err := c.Compile("score-report.schema.json")
+	if err != nil {
+		t.Fatalf("compile schema: %v", err)
+	}
+	if err := sch.Validate(inst); err != nil {
+		t.Fatalf("live retrieval payload fails score-report.schema.json: %v", err)
+	}
+}
diff --git a/bench/metrics_test.go b/bench/metrics_test.go
index 48ef34e9..bee64439 100644
--- a/bench/metrics_test.go
+++ b/bench/metrics_test.go
@@ -103,12 +103,15 @@ func TestScoreRetrieval(t *testing.T) {
 	}
 	// Recall@1: q1=1/3, q2=1 -> mean = (0.3333+1)/2 = 0.66667
 	wantR1 := (1.0/3.0 + 1.0) / 2.0
-	if !almostEqual(m.RecallAt[1], wantR1) {
-		t.Errorf("mean Recall@1 = %v, want %v", m.RecallAt[1], wantR1)
+	if !almostEqual(m.Metrics.RecallAt[1], wantR1) {
+		t.Errorf("mean Recall@1 = %v, want %v", m.Metrics.RecallAt[1], wantR1)
 	}
 	// MRR: q1=1.0, q2=1.0 -> 1.0
-	if !almostEqual(m.MRR, 1.0) {
-		t.Errorf("MRR = %v, want 1.0", m.MRR)
+	if !almostEqual(m.Metrics.MRR, 1.0) {
+		t.Errorf("MRR = %v, want 1.0", m.Metrics.MRR)
+	}
+	if !m.Gate.Passed {
+		t.Error("Gate.Passed should be true for a baseline-free run")
 	}
 	if m.QueryCount != 2 {
 		t.Errorf("QueryCount = %d, want 2", m.QueryCount)