From 4778afc1f8200c9dfe863bc717c1215b05965f47 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 21:08:18 +0300 Subject: [PATCH 1/3] =?UTF-8?q?feat(bench):=20live=20benchmark=20run=20?= =?UTF-8?q?=E2=80=94=20full=20schemas=20+=20Recall@k=20+=20latency=20(MCP-?= =?UTF-8?q?42a)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the bench/ harness (PR #747) with a live run against a running proxy: - Exact token number: GET /api/v1/tools pulls upstream tools WITH full JSON input schemas; proxy-mode tools carry their live schemas via the extended server.ProxyModeToolDefs (BenchProxyToolDef.Schema). Schemas counted on BOTH sides so the headline savings is authoritative — and withheld (authoritative_headline=false) if any proxy tool lacks a schema, the MCP-3161 overstatement guard. - Accuracy: replays the Spec 065 retrieval golden set through the proxy BM25 search (GET /api/v1/index/search) and scores Recall@{1,3,5,10}/MRR/nDCG@10/MAP against graded labels (deterministic, no LLM). Field names mirror Spec 065 score-report.schema.json. - Latency: client-measured per-query search latency (p50/p95/p99/max) vs. the one-shot load-all-tools cost (server "took" is a 0ms stub). CLI: `go run ./bench/cmd/bench -live -proxy URL -api-key KEY`. Reports stay gitignored (CN-003). All metric math + the live client are unit-tested with httptest stubs; the docker-compose substrate is the live-reproduction path. 
Co-Authored-By: Paperclip --- bench/README.md | 50 ++++++-- bench/cmd/bench/main.go | 80 ++++++++++-- bench/live.go | 168 +++++++++++++++++++++++++ bench/live_report.go | 215 ++++++++++++++++++++++++++++++++ bench/live_report_test.go | 134 ++++++++++++++++++++ bench/live_test.go | 128 +++++++++++++++++++ bench/metrics.go | 212 +++++++++++++++++++++++++++++++ bench/metrics_test.go | 116 +++++++++++++++++ bench/proxytools.go | 1 + bench/tokens.go | 38 +++++- internal/server/bench_export.go | 23 +++- 11 files changed, 1136 insertions(+), 29 deletions(-) create mode 100644 bench/live.go create mode 100644 bench/live_report.go create mode 100644 bench/live_report_test.go create mode 100644 bench/live_test.go create mode 100644 bench/metrics.go create mode 100644 bench/metrics_test.go diff --git a/bench/README.md b/bench/README.md index c99a3868..24188f42 100644 --- a/bench/README.md +++ b/bench/README.md @@ -94,19 +94,45 @@ corpus size alongside a percentage. Reproduce with `go run ./bench/cmd/bench`. - **`cl100k_base` ≠ the pinned model's tokenizer.** Pinning the exact tokenizer for the headline model is tracked as a follow-up (see "Roadmap"). +## Live run — full schemas + accuracy + latency + +The live run boots mcpproxy over the Spec 065 reference-server config and +measures the three headline claims against a *running* proxy. Everything here is +still deterministic and LLM-free. + +```bash +# 1. Boot the reproducible substrate (proxy + 7 no-auth reference servers) +docker compose -f bench/docker-compose.yml up --build -d + +# 2. 
Score against the running proxy (writes bench/results/live_report.json) +go run ./bench/cmd/bench -live -proxy http://127.0.0.1:8092 -api-key eval-corpus-snapshot +``` + +What it adds over the offline token run: + +- **Exact token number (full schemas).** Pulls `GET /api/v1/tools` for the + upstream tools *with their full JSON input schemas* and counts them against + the proxy modes — whose management-tool schemas come from the same live + builders as the offline run (`server.ProxyModeToolDefs`). Because schemas are + counted on **both** sides, the savings is authoritative. + - **Safety valve (MCP-3161):** if any proxy tool is missing a schema, counting + the baseline's schemas alone would *overstate* savings, so the run + **withholds the headline %** and reports raw token totals only + (`authoritative_headline: false`). Never quote a withheld run. +- **Accuracy.** Replays `retrieval_golden_v1.json` through the proxy's BM25 + search (`GET /api/v1/index/search`) and scores **Recall@{1,3,5,10}, MRR, + nDCG@10, MAP** against the graded labels. Deterministic (BM25), so a single + run is reported (`runs_averaged: 1`). Metric field names mirror the Spec 065 + `score-report.schema.json` `retrieval` block. +- **Latency.** Client-measured per-query search latency (p50/p95/p99/max) vs. + the one-shot cost of loading all tools. Measured client-side on purpose: the + server's `SearchToolsResponse.took` field is currently a `"0ms"` stub. 
+ ## What is scoped but not yet built (follow-ups) These require decisions and/or other roles, so they are tracked as child issues rather than landed here: -- **Live run with full schemas + accuracy + latency** — boot mcpproxy over the - Spec 065 `snapshot-servers.config.json` (see `docker-compose.yml`), pull - `GET /api/v1/tools` for exact schemas, and: - - **Accuracy**: replay the Spec 065 retrieval golden set - (`retrieval_golden_v1.json`) through `retrieve_tools` and score Recall@k / - MRR / nDCG (deterministic, no LLM) — reuses the D1 scorer. - - **Latency**: measure proxy-side `retrieve_tools` search latency vs. the - fixed cost of loading all tools. - **End-to-end task success with a pinned LLM** — requires a pinned model + an LLM-call budget; this is the only part that costs spend. - **CI publish-on-release-tag → public static dashboard** — Release/DevOps lane. @@ -123,11 +149,13 @@ rather than landed here: `internal/server.ProxyModeToolDefs`). No hand-maintained fixture — the benchmark cannot drift from the tools the proxy actually serves. -## Reproducible live run (skeleton) +## Reproducible live run `docker-compose.yml` boots mcpproxy over the frozen reference-server config so -the corpus and live tool list are reproducible across machines. Wiring the live -accuracy/latency scorers into it is the follow-up above. +the corpus and live tool list are reproducible across machines. The live +accuracy/latency/full-schema scorers attach to it via `-live` (see "Live run" +above). Pin the upstream-server images before publishing headline numbers +(image drift can change the tool corpus). ## Reviewer contact diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go index a5e924b2..7ffb0122 100644 --- a/bench/cmd/bench/main.go +++ b/bench/cmd/bench/main.go @@ -1,16 +1,22 @@ -// Command bench runs the mcpproxy token-reduction benchmark over a frozen tool -// corpus and writes a JSON report plus a static HTML dashboard. 
+// Command bench runs the mcpproxy benchmark. // -// Usage: +// Default (offline) mode scores the committed Spec 065 frozen corpus for +// token reduction and writes a JSON report plus a static HTML dashboard: // // go run ./bench/cmd/bench [-corpus PATH] [-out DIR] [-encoding NAME] // -// With no flags it scores the committed Spec 065 frozen corpus and writes the -// reports to bench/results/ (gitignored — reports are never committed, per the -// Spec 065 CN-003 repo rule). +// Live mode boots against a running proxy (see bench/docker-compose.yml) to add +// the exact-token comparison (full schemas), retrieval accuracy (Recall@k / MRR +// / nDCG over the golden set), and search latency: +// +// go run ./bench/cmd/bench -live [-proxy URL] [-api-key KEY] [-golden PATH] +// +// Reports land in bench/results/ (gitignored — reports are never committed, per +// the Spec 065 CN-003 repo rule). package main import ( + "context" "flag" "fmt" "log" @@ -21,21 +27,33 @@ import ( func main() { corpusPath := flag.String("corpus", "specs/065-evaluation-foundation/datasets/corpus_v1.tools.json", "path to the frozen tool corpus snapshot") - outDir := flag.String("out", "bench/results", "output directory for report.json and dashboard.html") + outDir := flag.String("out", "bench/results", "output directory for reports") encoding := flag.String("encoding", bench.DefaultEncoding, "tiktoken encoding name") + live := flag.Bool("live", false, "run the live benchmark against a running proxy (full schemas + accuracy + latency)") + proxy := flag.String("proxy", "http://127.0.0.1:8092", "live proxy base URL") + apiKey := flag.String("api-key", "eval-corpus-snapshot", "live proxy API key (X-API-Key)") + goldenPath := flag.String("golden", "specs/065-evaluation-foundation/datasets/retrieval_golden_v1.json", "path to the retrieval golden set") flag.Parse() - tk, err := bench.NewTokenizer(*encoding) + if *live { + runLive(*proxy, *apiKey, *goldenPath, *outDir) + return + } + 
runOffline(*corpusPath, *encoding, *outDir) +} + +func runOffline(corpusPath, encoding, outDir string) { + tk, err := bench.NewTokenizer(encoding) if err != nil { log.Fatalf("bench: %v", err) } - corpus, err := bench.LoadCorpus(*corpusPath) + corpus, err := bench.LoadCorpus(corpusPath) if err != nil { log.Fatalf("bench: %v", err) } report := bench.ComputeReport(tk, corpus) - jsonPath, htmlPath, err := report.WriteReports(*outDir) + jsonPath, htmlPath, err := report.WriteReports(outDir) if err != nil { log.Fatalf("bench: %v", err) } @@ -50,3 +68,45 @@ func main() { } fmt.Fprintf(os.Stdout, "wrote %s and %s\n", jsonPath, htmlPath) } + +func runLive(proxy, apiKey, goldenPath, outDir string) { + golden, err := bench.LoadGoldenSet(goldenPath) + if err != nil { + log.Fatalf("bench: %v", err) + } + client := bench.NewLiveClient(proxy, apiKey) + report, err := bench.RunLive(context.Background(), client, golden) + if err != nil { + log.Fatalf("bench: %v", err) + } + jsonPath, err := report.WriteJSON(outDir) + if err != nil { + log.Fatalf("bench: %v", err) + } + + fmt.Fprintf(os.Stdout, "mcpproxy LIVE benchmark (proxy %s, %s)\n", report.Proxy, report.Encoding) + tr := report.Tokens + fmt.Fprintf(os.Stdout, " tokens: %d upstream tools, baseline %d tokens (with full schemas)\n", tr.UpstreamTools, tr.BaselineTokens) + for _, m := range tr.Modes { + if m.Mode == bench.ModeBaseline { + continue + } + if tr.AuthoritativeHeadline { + fmt.Fprintf(os.Stdout, " %-16s %6d tokens %.1f%% fewer\n", m.Mode, m.Tokens, m.SavingsRatio*100) + } else { + fmt.Fprintf(os.Stdout, " %-16s %6d tokens (savings withheld — see notes)\n", m.Mode, m.Tokens) + } + } + if !tr.AuthoritativeHeadline { + for _, n := range tr.Notes { + fmt.Fprintf(os.Stdout, " NOTE: %s\n", n) + } + } + r := report.Retrieval + fmt.Fprintf(os.Stdout, " accuracy (%d queries): Recall@1=%.3f Recall@5=%.3f MRR=%.3f nDCG@10=%.3f MAP=%.3f\n", + r.QueryCount, r.RecallAt[1], r.RecallAt[5], r.MRR, r.NDCGAt10, r.MAP) + l := report.Latency 
+ fmt.Fprintf(os.Stdout, " latency (%d searches): p50=%.1fms p95=%.1fms p99=%.1fms max=%.1fms; load-all-tools=%.1fms\n", + l.Samples, l.P50ms, l.P95ms, l.P99ms, l.MaxMs, l.LoadAllToolsMs) + fmt.Fprintf(os.Stdout, "wrote %s\n", jsonPath) +} diff --git a/bench/live.go b/bench/live.go new file mode 100644 index 00000000..fd82faa7 --- /dev/null +++ b/bench/live.go @@ -0,0 +1,168 @@ +package bench + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strconv" + "time" +) + +// LiveClient talks to a running mcpproxy instance (e.g. the bench +// docker-compose substrate on 127.0.0.1:8092) over its REST API. It is used by +// the live benchmark run to pull the exact tool definitions (with schemas) and +// to replay the retrieval golden set through the proxy's BM25 search. +type LiveClient struct { + BaseURL string + APIKey string + HTTP *http.Client +} + +// NewLiveClient builds a LiveClient for baseURL (e.g. "http://127.0.0.1:8092") +// authenticating with apiKey via the X-API-Key header. +func NewLiveClient(baseURL, apiKey string) *LiveClient { + return &LiveClient{ + BaseURL: baseURL, + APIKey: apiKey, + HTTP: &http.Client{Timeout: 30 * time.Second}, + } +} + +// successEnvelope is the standard mcpproxy REST response wrapper +// ({"success":true,"data":{...}}). Data is decoded lazily by each caller. +type successEnvelope struct { + Success bool `json:"success"` + Data json.RawMessage `json:"data"` + Error string `json:"error,omitempty"` +} + +// getJSON performs an authenticated GET and unmarshals the envelope's data +// field into out. 
+func (c *LiveClient) getJSON(ctx context.Context, path string, out interface{}) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.BaseURL+path, nil) + if err != nil { + return fmt.Errorf("build request %q: %w", path, err) + } + if c.APIKey != "" { + req.Header.Set("X-API-Key", c.APIKey) + } + resp, err := c.HTTP.Do(req) + if err != nil { + return fmt.Errorf("GET %q: %w", path, err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("read %q: %w", path, err) + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("GET %q: status %d: %s", path, resp.StatusCode, string(body)) + } + var env successEnvelope + if err := json.Unmarshal(body, &env); err != nil { + return fmt.Errorf("decode envelope %q: %w", path, err) + } + if !env.Success { + return fmt.Errorf("GET %q: api error: %s", path, env.Error) + } + if err := json.Unmarshal(env.Data, out); err != nil { + return fmt.Errorf("decode data %q: %w", path, err) + } + return nil +} + +// apiTool mirrors contracts.Tool for the fields the benchmark needs. The schema +// is kept raw so its exact serialized form is what gets tokenized. +type apiTool struct { + Name string `json:"name"` + ServerName string `json:"server_name"` + Description string `json:"description"` + Schema json.RawMessage `json:"schema,omitempty"` +} + +// FetchUpstreamTools pulls the consolidated tool list (GET /api/v1/tools) and +// returns every upstream tool with its full JSON input schema, ready to feed +// into schema-aware token counting for the baseline. 
+func (c *LiveClient) FetchUpstreamTools(ctx context.Context) ([]Tool, error) { + var resp struct { + Tools []apiTool `json:"tools"` + } + if err := c.getJSON(ctx, "/api/v1/tools", &resp); err != nil { + return nil, err + } + tools := make([]Tool, 0, len(resp.Tools)) + for _, t := range resp.Tools { + tools = append(tools, Tool{ + ToolID: t.ServerName + ":" + t.Name, + Server: t.ServerName, + Name: t.Name, + Description: t.Description, + Schema: normalizeSchema(t.Schema), + }) + } + return tools, nil +} + +// normalizeSchema treats an empty JSON object ("{}") or JSON null the same as an +// absent schema so a tool with no real parameters does not inflate token counts. +func normalizeSchema(raw json.RawMessage) json.RawMessage { + switch string(raw) { + case "", "null", "{}": + return nil + default: + return raw + } +} + +// Search replays one query through the proxy's BM25 tool search +// (GET /api/v1/index/search) and returns the ranked tool IDs (server:tool, +// best first) plus the client-measured round-trip latency. +// +// Latency is measured client-side on purpose: the server's SearchToolsResponse +// "took" field is currently a hardcoded "0ms" stub (internal/httpapi +// handleSearchTools), so it cannot be trusted as the proxy-side timing. +func (c *LiveClient) Search(ctx context.Context, query string, limit int) (ranked []string, latency time.Duration, err error) { + q := url.Values{} + q.Set("q", query) + q.Set("limit", strconv.Itoa(limit)) + path := "/api/v1/index/search?" 
+ q.Encode() + + var resp struct { + Results []struct { + Tool apiTool `json:"tool"` + Score float64 `json:"score"` + } `json:"results"` + } + start := time.Now() + err = c.getJSON(ctx, path, &resp) + latency = time.Since(start) + if err != nil { + return nil, latency, err + } + ranked = make([]string, 0, len(resp.Results)) + for _, r := range resp.Results { + ranked = append(ranked, r.Tool.ServerName+":"+r.Tool.Name) + } + return ranked, latency, nil +} + +// LoadGoldenSet reads the Spec 065 retrieval golden set +// (retrieval_golden_v1.json) from disk. +func LoadGoldenSet(path string) (*GoldenSet, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read golden set %q: %w", path, err) + } + var g GoldenSet + if err := json.Unmarshal(data, &g); err != nil { + return nil, fmt.Errorf("parse golden set %q: %w", path, err) + } + if len(g.Queries) == 0 { + return nil, fmt.Errorf("golden set %q contains no queries", path) + } + return &g, nil +} diff --git a/bench/live_report.go b/bench/live_report.go new file mode 100644 index 00000000..3dc65df7 --- /dev/null +++ b/bench/live_report.go @@ -0,0 +1,215 @@ +package bench + +import ( + "context" + "encoding/json" + "fmt" + "math" + "os" + "path/filepath" + "sort" + "time" +) + +// LiveModeResult is the per-mode context-token cost from the live run. +type LiveModeResult struct { + Mode string `json:"mode"` + ContextTools int `json:"context_tools"` + Tokens int `json:"tokens"` + SavingsRatio float64 `json:"savings_vs_baseline,omitempty"` +} + +// LiveTokenReport is the exact-token comparison from a live proxy, with the +// baseline upstream tools counted WITH their full JSON input schemas. +// +// AuthoritativeHeadline gates the savings percentage: it is only true when the +// proxy management tools were ALSO counted with their schemas. 
Counting schemas +// on the baseline but not the proxy side overstates savings — the exact error +// corrected in MCP-3161 — so when proxy schemas are absent the savings ratio is +// withheld and only raw token totals are reported. +type LiveTokenReport struct { + Encoding string `json:"encoding"` + UpstreamTools int `json:"upstream_tools"` + BaselineTokens int `json:"baseline_tokens"` + Modes []LiveModeResult `json:"modes"` + ProxySchemasCounted bool `json:"proxy_schemas_counted"` + AuthoritativeHeadline bool `json:"authoritative_headline"` + Notes []string `json:"notes"` +} + +// LatencyReport summarizes proxy-side retrieve_tools search latency versus the +// fixed one-shot cost of loading every tool. Times are client-measured +// (milliseconds); the server's SearchToolsResponse "took" field is a "0ms" stub. +type LatencyReport struct { + Samples int `json:"samples"` + P50ms float64 `json:"p50_ms"` + P95ms float64 `json:"p95_ms"` + P99ms float64 `json:"p99_ms"` + MaxMs float64 `json:"max_ms"` + LoadAllToolsMs float64 `json:"load_all_tools_ms"` +} + +// LiveReport is the full live benchmark result: exact-token comparison, +// retrieval accuracy, and search latency, all gathered from one running proxy. +type LiveReport struct { + Proxy string `json:"proxy"` + Encoding string `json:"encoding"` + Tokens *LiveTokenReport `json:"tokens"` + Retrieval *RetrievalMetrics `json:"retrieval"` + Latency *LatencyReport `json:"latency"` +} + +// recallCutoffs are the standard Recall@k cutoffs reported (matches Spec 065 +// score-report.schema.json recall_at keys). +var recallCutoffs = []int{1, 3, 5, 10} + +// WriteJSON writes the live report as indented JSON into dir/live_report.json +// (the dir is gitignored — reports are never committed, per Spec 065 CN-003). 
+func (r *LiveReport) WriteJSON(dir string) (string, error) { + if err := os.MkdirAll(dir, 0o755); err != nil { + return "", fmt.Errorf("mkdir %q: %w", dir, err) + } + path := filepath.Join(dir, "live_report.json") + data, err := json.MarshalIndent(r, "", " ") + if err != nil { + return "", fmt.Errorf("marshal live report: %w", err) + } + if err := os.WriteFile(path, append(data, '\n'), 0o644); err != nil { + return "", fmt.Errorf("write %q: %w", path, err) + } + return path, nil +} + +// RunLive gathers the full live benchmark from a running proxy: it pulls the +// exact tool definitions (with schemas) for the token comparison, replays the +// golden set through the proxy's BM25 search for accuracy, and records the +// per-query search latency. +func RunLive(ctx context.Context, client *LiveClient, golden *GoldenSet) (*LiveReport, error) { + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + return nil, err + } + + // 1. Exact-token: fetch upstream tools with schemas (also times "load all"). + loadStart := time.Now() + upstream, err := client.FetchUpstreamTools(ctx) + loadAll := time.Since(loadStart) + if err != nil { + return nil, fmt.Errorf("fetch upstream tools: %w", err) + } + tokenRep := buildTokenReport(tk, upstream, + ProxyToolsForMode(ModeRetrieveTools), ProxyToolsForMode(ModeCodeExecution)) + + // 2. Accuracy + 3. Latency: replay the golden set, capturing search latency. 
+ var latencies []time.Duration + searchFn := func(query string, limit int) ([]string, error) { + ranked, lat, serr := client.Search(ctx, query, limit) + latencies = append(latencies, lat) + return ranked, serr + } + metrics, err := ScoreRetrieval(golden, searchFn, recallCutoffs) + if err != nil { + return nil, fmt.Errorf("score retrieval: %w", err) + } + + return &LiveReport{ + Proxy: client.BaseURL, + Encoding: tk.encoding, + Tokens: tokenRep, + Retrieval: metrics, + Latency: computeLatency(latencies, loadAll), + }, nil +} + +// buildTokenReport counts the baseline upstream tools WITH schemas against each +// proxy routing mode (rt = retrieve_tools, ce = code_execution), also counted +// with schemas. The headline savings is only emitted when EVERY proxy tool +// carries a schema; otherwise counting schemas on the baseline alone would +// overstate savings (MCP-3161), so the ratio is withheld and only raw token +// totals are reported. +func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport { + baseTokens := tk.countToolsWithSchema(upstream) + + proxySchemasCounted := allHaveSchema(rt) && allHaveSchema(ce) + + rep := &LiveTokenReport{ + Encoding: tk.encoding, + UpstreamTools: len(upstream), + BaselineTokens: baseTokens, + ProxySchemasCounted: proxySchemasCounted, + Modes: []LiveModeResult{ + {Mode: ModeBaseline, ContextTools: len(upstream), Tokens: baseTokens}, + {Mode: ModeRetrieveTools, ContextTools: len(rt), Tokens: tk.countToolsWithSchema(rt)}, + {Mode: ModeCodeExecution, ContextTools: len(ce), Tokens: tk.countToolsWithSchema(ce)}, + }, + } + rep.AuthoritativeHeadline = proxySchemasCounted + if proxySchemasCounted { + for i := range rep.Modes { + m := &rep.Modes[i] + if m.Mode != ModeBaseline && baseTokens > 0 { + m.SavingsRatio = 1.0 - float64(m.Tokens)/float64(baseTokens) + } + } + rep.Notes = []string{ + "Baseline counts upstream tools with full JSON input schemas from GET /api/v1/tools; proxy modes count the management tools with 
their schemas. Headline savings is authoritative.", + } + } else { + rep.Notes = []string{ + "HEADLINE SAVINGS WITHHELD: the baseline upstream tools are counted with full schemas, but the proxy management tools (proxy_tools_v1.json) are description-only. Reporting savings now would count schemas on one side only and OVERSTATE the reduction — the exact error corrected in MCP-3161. Token totals are shown for transparency; the authoritative headline lands once proxy-tool schemas are captured live via MCP tools/list.", + } + } + return rep +} + +func allHaveSchema(tools []Tool) bool { + if len(tools) == 0 { + return false + } + for _, t := range tools { + if len(t.Schema) == 0 { + return false + } + } + return true +} + +// computeLatency summarizes search-call latencies with nearest-rank +// percentiles, plus the fixed one-shot cost of loading all tools. +func computeLatency(samples []time.Duration, loadAll time.Duration) *LatencyReport { + rep := &LatencyReport{ + Samples: len(samples), + LoadAllToolsMs: ms(loadAll), + } + if len(samples) == 0 { + return rep + } + sorted := make([]time.Duration, len(samples)) + copy(sorted, samples) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + rep.P50ms = ms(percentile(sorted, 50)) + rep.P95ms = ms(percentile(sorted, 95)) + rep.P99ms = ms(percentile(sorted, 99)) + rep.MaxMs = ms(sorted[len(sorted)-1]) + return rep +} + +// percentile returns the nearest-rank percentile p (0-100) of a sorted slice. +func percentile(sorted []time.Duration, p float64) time.Duration { + if len(sorted) == 0 { + return 0 + } + rank := int(math.Ceil(p / 100.0 * float64(len(sorted)))) + if rank < 1 { + rank = 1 + } + if rank > len(sorted) { + rank = len(sorted) + } + return sorted[rank-1] +} + +// ms converts a duration to milliseconds as a float. 
+func ms(d time.Duration) float64 { + return float64(d.Microseconds()) / 1000.0 +} diff --git a/bench/live_report_test.go b/bench/live_report_test.go new file mode 100644 index 00000000..92886737 --- /dev/null +++ b/bench/live_report_test.go @@ -0,0 +1,134 @@ +package bench + +import ( + "context" + "encoding/json" + "path/filepath" + "testing" + "time" +) + +// goldenPath locates the committed Spec 065 golden set relative to the repo +// root (tests run from the bench/ package dir). +func goldenPath() string { + return filepath.Join("..", "specs", "065-evaluation-foundation", "datasets", "retrieval_golden_v1.json") +} + +func TestLoadGoldenSetReal(t *testing.T) { + g, err := LoadGoldenSet(goldenPath()) + if err != nil { + t.Fatalf("LoadGoldenSet: %v", err) + } + if g.CorpusVersion == "" { + t.Error("corpus_version empty") + } + if len(g.Queries) < 10 { + t.Errorf("expected a substantial golden set, got %d queries", len(g.Queries)) + } + for _, q := range g.Queries { + if q.ID == "" || q.Query == "" { + t.Errorf("query missing id/text: %+v", q) + } + if relevantCount(q.Labels) == 0 { + t.Errorf("query %q has no relevant labels", q.ID) + } + } +} + +func TestPercentiles(t *testing.T) { + ds := []time.Duration{ + 10 * time.Millisecond, 20 * time.Millisecond, 30 * time.Millisecond, + 40 * time.Millisecond, 50 * time.Millisecond, 60 * time.Millisecond, + 70 * time.Millisecond, 80 * time.Millisecond, 90 * time.Millisecond, + 100 * time.Millisecond, + } + lat := computeLatency(ds, 5*time.Millisecond) + if lat.Samples != 10 { + t.Errorf("Samples = %d, want 10", lat.Samples) + } + // nearest-rank: p50 -> ceil(0.5*10)=5th value (50ms); p95 -> 10th (100ms) + if lat.P50ms != 50 { + t.Errorf("P50ms = %v, want 50", lat.P50ms) + } + if lat.P95ms != 100 { + t.Errorf("P95ms = %v, want 100", lat.P95ms) + } + if lat.MaxMs != 100 { + t.Errorf("MaxMs = %v, want 100", lat.MaxMs) + } + if lat.LoadAllToolsMs != 5 { + t.Errorf("LoadAllToolsMs = %v, want 5", lat.LoadAllToolsMs) + } +} + 
+func TestRunLiveAuthoritativeHeadline(t *testing.T) { + srv := stubProxy(t) + defer srv.Close() + + c := NewLiveClient(srv.URL, "test-key") + golden := &GoldenSet{ + CorpusVersion: "corpus_v1", + Queries: []GoldenQuery{ + {ID: "q1", Query: "read a file", Labels: []Label{{ToolID: "filesystem:read_text_file", Relevance: 2}}}, + }, + } + rep, err := RunLive(context.Background(), c, golden) + if err != nil { + t.Fatalf("RunLive: %v", err) + } + // Token report: baseline counted with schemas AND proxy tools carry their + // live schemas (from server.ProxyModeToolDefs), so the headline is + // authoritative — schemas on BOTH sides, no MCP-3161 overstatement. + if rep.Tokens == nil || rep.Tokens.UpstreamTools != 2 { + t.Fatalf("expected 2 upstream tools, got %+v", rep.Tokens) + } + if !rep.Tokens.ProxySchemasCounted { + t.Error("proxy tools should carry schemas from the live builders") + } + if !rep.Tokens.AuthoritativeHeadline { + t.Error("headline should be authoritative when both sides count schemas") + } + if rep.Tokens.BaselineTokens <= 0 { + t.Error("baseline tokens should be counted with schemas") + } + // A savings ratio must be present for the proxy modes. + for _, m := range rep.Tokens.Modes { + if m.Mode != ModeBaseline && m.SavingsRatio == 0 { + t.Errorf("expected a savings ratio for mode %q", m.Mode) + } + } + // Accuracy: perfect ranking for the one query. + if rep.Retrieval == nil || rep.Retrieval.RecallAt[1] != 1.0 { + t.Errorf("expected Recall@1=1.0, got %+v", rep.Retrieval) + } + // Latency populated. + if rep.Latency == nil || rep.Latency.Samples != 1 { + t.Errorf("expected 1 latency sample, got %+v", rep.Latency) + } +} + +// TestBuildTokenReportWithholdsWhenProxySchemasMissing guards the MCP-3161 +// safety valve: if any proxy tool lacks a schema, counting the baseline's +// schemas alone would overstate savings, so the headline is withheld. 
+func TestBuildTokenReportWithholdsWhenProxySchemasMissing(t *testing.T) { + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + t.Fatalf("tokenizer: %v", err) + } + upstream := []Tool{{Name: "big", Description: "d", Schema: json.RawMessage(`{"type":"object","properties":{"x":{"type":"string"}}}`)}} + rtSchemaless := []Tool{{Name: "retrieve_tools", Description: "d"}} // no schema + ce := []Tool{{Name: "code_execution", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}} + + rep := buildTokenReport(tk, upstream, rtSchemaless, ce) + if rep.AuthoritativeHeadline { + t.Error("headline must be withheld when a proxy tool lacks a schema") + } + for _, m := range rep.Modes { + if m.SavingsRatio != 0 { + t.Errorf("savings ratio must be withheld (0), got %v for %q", m.SavingsRatio, m.Mode) + } + } + if rep.BaselineTokens <= 0 { + t.Error("baseline tokens should still be reported for transparency") + } +} diff --git a/bench/live_test.go b/bench/live_test.go new file mode 100644 index 00000000..437370bc --- /dev/null +++ b/bench/live_test.go @@ -0,0 +1,128 @@ +package bench + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" +) + +// stubProxy returns an httptest server that mimics the two mcpproxy REST +// endpoints the live benchmark uses, wrapping payloads in the standard +// {success, data} envelope. 
+func stubProxy(t *testing.T) *httptest.Server { + t.Helper() + mux := http.NewServeMux() + mux.HandleFunc("/api/v1/tools", func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "success": true, + "data": map[string]any{ + "tools": []map[string]any{ + { + "name": "read_text_file", + "server_name": "filesystem", + "description": "Read a file as text", + "schema": map[string]any{ + "type": "object", + "properties": map[string]any{"path": map[string]any{"type": "string"}}, + "required": []string{"path"}, + }, + }, + { + "name": "echo", + "server_name": "memory", + "description": "Echo input", + }, + }, + }, + }) + }) + mux.HandleFunc("/api/v1/index/search", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Query().Get("q") == "" { + http.Error(w, "missing q", http.StatusBadRequest) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "success": true, + "data": map[string]any{ + "query": r.URL.Query().Get("q"), + "results": []map[string]any{ + {"tool": map[string]any{"name": "read_text_file", "server_name": "filesystem"}, "score": 0.9}, + {"tool": map[string]any{"name": "echo", "server_name": "memory"}, "score": 0.1}, + }, + "total": 2, + "took": "0ms", + }, + }) + }) + return httptest.NewServer(mux) +} + +func TestLiveClientFetchUpstreamTools(t *testing.T) { + srv := stubProxy(t) + defer srv.Close() + + c := NewLiveClient(srv.URL, "test-key") + tools, err := c.FetchUpstreamTools(context.Background()) + if err != nil { + t.Fatalf("FetchUpstreamTools: %v", err) + } + if len(tools) != 2 { + t.Fatalf("got %d tools, want 2", len(tools)) + } + if tools[0].ToolID != "filesystem:read_text_file" { + t.Errorf("ToolID = %q, want filesystem:read_text_file", tools[0].ToolID) + } + if len(tools[0].Schema) == 0 { + t.Errorf("expected schema captured for tool with input schema, got none") + } + if len(tools[1].Schema) != 0 { + t.Errorf("expected no schema for schemaless tool, got %s", tools[1].Schema) + } +} + +func 
TestLiveClientSearch(t *testing.T) { + srv := stubProxy(t) + defer srv.Close() + + c := NewLiveClient(srv.URL, "test-key") + ranked, latency, err := c.Search(context.Background(), "read a file", 10) + if err != nil { + t.Fatalf("Search: %v", err) + } + want := []string{"filesystem:read_text_file", "memory:echo"} + if len(ranked) != len(want) { + t.Fatalf("ranked = %v, want %v", ranked, want) + } + for i := range want { + if ranked[i] != want[i] { + t.Errorf("ranked[%d] = %q, want %q", i, ranked[i], want[i]) + } + } + if latency < 0 { + t.Errorf("latency should be non-negative, got %v", latency) + } +} + +func TestSchemaAwareTokenCountExceedsDescOnly(t *testing.T) { + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + t.Fatalf("tokenizer: %v", err) + } + withSchema := Tool{ + Name: "read_text_file", + Description: "Read a file as text", + Schema: json.RawMessage(`{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}`), + } + descOnly := Tool{Name: withSchema.Name, Description: withSchema.Description} + if tk.CountToolWithSchema(withSchema) <= tk.CountTool(descOnly) { + t.Errorf("schema-aware count (%d) must exceed desc-only count (%d)", + tk.CountToolWithSchema(withSchema), tk.CountTool(descOnly)) + } + // A schemaless tool must count identically under both methods. + if tk.CountToolWithSchema(descOnly) != tk.CountTool(descOnly) { + t.Errorf("schemaless tool should count identically: %d vs %d", + tk.CountToolWithSchema(descOnly), tk.CountTool(descOnly)) + } +} diff --git a/bench/metrics.go b/bench/metrics.go new file mode 100644 index 00000000..ddfc1f3b --- /dev/null +++ b/bench/metrics.go @@ -0,0 +1,212 @@ +package bench + +import ( + "fmt" + "math" +) + +// Label is a graded relevance judgement for one tool against one query, taken +// from the Spec 065 retrieval golden set (relevance 2 = primary, 1 = related, +// 0 / absent = irrelevant). 
+type Label struct { + ToolID string `json:"tool_id"` + Relevance int `json:"relevance"` +} + +// GoldenQuery is one labelled query -> relevant-tool(s) judgement. +type GoldenQuery struct { + ID string `json:"id"` + Query string `json:"query"` + Labels []Label `json:"labels"` +} + +// GoldenSet is the frozen Spec 065 retrieval golden set +// (retrieval_golden_v1.json). +type GoldenSet struct { + CorpusVersion string `json:"corpus_version"` + Queries []GoldenQuery `json:"queries"` +} + +// relevanceOf returns the graded relevance of toolID for the given labels (0 if +// the tool is not a labelled relevant result). +func relevanceOf(toolID string, labels []Label) int { + for _, l := range labels { + if l.ToolID == toolID { + return l.Relevance + } + } + return 0 +} + +// relevantCount is the number of tools with relevance >= 1 for a query. +func relevantCount(labels []Label) int { + n := 0 + for _, l := range labels { + if l.Relevance >= 1 { + n++ + } + } + return n +} + +// RecallAtK is the fraction of the query's relevant tools (relevance >= 1) that +// appear in the top-k of the ranking. Returns 0 when there are no relevant +// tools (a degenerate query that should not be scored). +func RecallAtK(ranked []string, labels []Label, k int) float64 { + total := relevantCount(labels) + if total == 0 { + return 0 + } + hits := 0 + for i, id := range ranked { + if i >= k { + break + } + if relevanceOf(id, labels) >= 1 { + hits++ + } + } + return float64(hits) / float64(total) +} + +// ReciprocalRank is 1/rank of the first relevant tool in the ranking, or 0 if +// none of the ranked tools are relevant. +func ReciprocalRank(ranked []string, labels []Label) float64 { + for i, id := range ranked { + if relevanceOf(id, labels) >= 1 { + return 1.0 / float64(i+1) + } + } + return 0 +} + +// NDCGAtK is the normalized discounted cumulative gain at k using the graded +// relevance as the gain (linear gain, log2 position discount). 
1.0 means the +// ranking is in ideal (relevance-descending) order; 0 means no gain in top-k. +func NDCGAtK(ranked []string, labels []Label, k int) float64 { + dcg := 0.0 + for i, id := range ranked { + if i >= k { + break + } + rel := relevanceOf(id, labels) + if rel == 0 { + continue + } + dcg += float64(rel) / math.Log2(float64(i+2)) // position i (0-based) -> log2(i+2) + } + idcg := idealDCG(labels, k) + if idcg == 0 { + return 0 + } + return dcg / idcg +} + +// idealDCG is the DCG of the best possible ordering (relevances sorted +// descending) capped at k. +func idealDCG(labels []Label, k int) float64 { + rels := make([]int, 0, len(labels)) + for _, l := range labels { + if l.Relevance >= 1 { + rels = append(rels, l.Relevance) + } + } + // descending sort (small slice; insertion sort keeps deps minimal) + for i := 1; i < len(rels); i++ { + for j := i; j > 0 && rels[j] > rels[j-1]; j-- { + rels[j], rels[j-1] = rels[j-1], rels[j] + } + } + idcg := 0.0 + for i, rel := range rels { + if i >= k { + break + } + idcg += float64(rel) / math.Log2(float64(i+2)) + } + return idcg +} + +// AveragePrecision is the sum of the precision values computed at each rank +// where a relevant tool is retrieved, divided by the total number of relevant +// tools (so unretrieved relevant tools lower the score). Binary relevance +// (relevance >= 1) is used, matching the standard MAP definition. +func AveragePrecision(ranked []string, labels []Label) float64 { + total := relevantCount(labels) + if total == 0 { + return 0 + } + hits := 0 + sumPrec := 0.0 + for i, id := range ranked { + if relevanceOf(id, labels) >= 1 { + hits++ + sumPrec += float64(hits) / float64(i+1) + } + } + return sumPrec / float64(total) +} + +// SearchFunc replays one query through the retrieval system under test and +// returns the ranked tool IDs (most relevant first), limited to `limit`. 
+type SearchFunc func(query string, limit int) (ranked []string, err error) + +// RetrievalMetrics is the aggregated retrieval-quality report over a golden +// set. Field names mirror the Spec 065 score-report.schema.json `retrieval` +// block so the report can be emitted to that contract. +type RetrievalMetrics struct { + CorpusVersion string `json:"corpus_version"` + GoldenVersion string `json:"golden_version,omitempty"` + RunsAveraged int `json:"runs_averaged"` + QueryCount int `json:"query_count"` + RecallAt map[int]float64 `json:"recall_at"` + MRR float64 `json:"mrr"` + NDCGAt10 float64 `json:"ndcg_at_10"` + MAP float64 `json:"map"` +} + +// ScoreRetrieval replays every golden query through search and aggregates +// Recall@k (for each k in ks), MRR, nDCG@10 and MAP as the mean over all +// queries. The search is deterministic (BM25), so a single run is scored and RunsAveraged is 1. +func ScoreRetrieval(golden *GoldenSet, search SearchFunc, ks []int) (*RetrievalMetrics, error) { + if golden == nil || len(golden.Queries) == 0 { + return nil, fmt.Errorf("golden set is empty") + } + // The largest k we must retrieve to score every requested cutoff and nDCG@10. 
+ maxK := 10 + for _, k := range ks { + if k > maxK { + maxK = k + } + } + + recallSum := make(map[int]float64, len(ks)) + var mrrSum, ndcgSum, mapSum float64 + for _, q := range golden.Queries { + ranked, err := search(q.Query, maxK) + if err != nil { + return nil, fmt.Errorf("search %q: %w", q.ID, err) + } + for _, k := range ks { + recallSum[k] += RecallAtK(ranked, q.Labels, k) + } + mrrSum += ReciprocalRank(ranked, q.Labels) + ndcgSum += NDCGAtK(ranked, q.Labels, 10) + mapSum += AveragePrecision(ranked, q.Labels) + } + + n := float64(len(golden.Queries)) + recallAt := make(map[int]float64, len(ks)) + for _, k := range ks { + recallAt[k] = recallSum[k] / n + } + return &RetrievalMetrics{ + CorpusVersion: golden.CorpusVersion, + RunsAveraged: 1, + QueryCount: len(golden.Queries), + RecallAt: recallAt, + MRR: mrrSum / n, + NDCGAt10: ndcgSum / n, + MAP: mapSum / n, + }, nil +} diff --git a/bench/metrics_test.go b/bench/metrics_test.go new file mode 100644 index 00000000..48ef34e9 --- /dev/null +++ b/bench/metrics_test.go @@ -0,0 +1,116 @@ +package bench + +import ( + "math" + "testing" +) + +// almostEqual compares floats within a small tolerance (metric math involves +// log2 divisions, so exact equality is brittle). 
+func almostEqual(a, b float64) bool { + return math.Abs(a-b) < 1e-6 +} + +// worked example reused across the metric tests: +// +// relevant labels: A(rel 2), B(rel 1), C(rel 1) -> 3 relevant tools +// ranking returned: [A, X, B, Y] -> X, Y are irrelevant +var ( + exLabels = []Label{ + {ToolID: "A", Relevance: 2}, + {ToolID: "B", Relevance: 1}, + {ToolID: "C", Relevance: 1}, + } + exRanked = []string{"A", "X", "B", "Y"} +) + +func TestRecallAtK(t *testing.T) { + cases := []struct { + k int + want float64 + }{ + {1, 1.0 / 3.0}, // top-1 {A}: 1 of 3 relevant + {3, 2.0 / 3.0}, // top-3 {A,X,B}: 2 of 3 relevant + {5, 2.0 / 3.0}, // only 4 results; {A,B} retrieved: 2 of 3 + } + for _, c := range cases { + got := RecallAtK(exRanked, exLabels, c.k) + if !almostEqual(got, c.want) { + t.Errorf("RecallAtK(k=%d) = %v, want %v", c.k, got, c.want) + } + } +} + +func TestReciprocalRank(t *testing.T) { + // First relevant (A) is at rank 1 -> RR = 1.0 + if got := ReciprocalRank(exRanked, exLabels); !almostEqual(got, 1.0) { + t.Errorf("ReciprocalRank = %v, want 1.0", got) + } + // First relevant (B) at rank 2 -> RR = 0.5 + if got := ReciprocalRank([]string{"Z", "B", "A"}, exLabels); !almostEqual(got, 0.5) { + t.Errorf("ReciprocalRank(B@2) = %v, want 0.5", got) + } + // No relevant retrieved -> RR = 0 + if got := ReciprocalRank([]string{"Z", "Y"}, exLabels); !almostEqual(got, 0.0) { + t.Errorf("ReciprocalRank(none) = %v, want 0", got) + } +} + +func TestNDCGAtK(t *testing.T) { + // DCG = 2/log2(2) + 0 + 1/log2(4) = 2 + 0.5 = 2.5 + // IDCG = 2/log2(2) + 1/log2(3) + 1/log2(4) = 2 + 0.63093 + 0.5 = 3.13093 + // nDCG = 2.5 / 3.13093 = 0.798486 + want := 2.5 / (2.0 + 1.0/math.Log2(3) + 0.5) + if got := NDCGAtK(exRanked, exLabels, 10); !almostEqual(got, want) { + t.Errorf("NDCGAtK(10) = %v, want %v", got, want) + } + // Perfect ranking -> nDCG = 1.0 + if got := NDCGAtK([]string{"A", "B", "C"}, exLabels, 10); !almostEqual(got, 1.0) { + t.Errorf("NDCGAtK(perfect) = %v, want 1.0", got) + } 
+} + +func TestAveragePrecision(t *testing.T) { + // A@1 -> precision 1/1 = 1.0 ; B@3 -> precision 2/3 ; C not retrieved -> 0 + // AP = (1.0 + 0.6667 + 0) / 3 = 0.555556 + want := (1.0 + 2.0/3.0) / 3.0 + if got := AveragePrecision(exRanked, exLabels); !almostEqual(got, want) { + t.Errorf("AveragePrecision = %v, want %v", got, want) + } +} + +func TestScoreRetrieval(t *testing.T) { + golden := &GoldenSet{ + CorpusVersion: "corpus_v1", + Queries: []GoldenQuery{ + {ID: "q1", Query: "find A", Labels: exLabels}, + {ID: "q2", Query: "find D", Labels: []Label{{ToolID: "D", Relevance: 2}}}, + }, + } + // Deterministic fake search: q1 -> exRanked, q2 -> perfect [D] + search := func(query string, _ int) ([]string, error) { + if query == "find A" { + return exRanked, nil + } + return []string{"D"}, nil + } + m, err := ScoreRetrieval(golden, search, []int{1, 3, 5, 10}) + if err != nil { + t.Fatalf("ScoreRetrieval error: %v", err) + } + if m.RunsAveraged != 1 { + t.Errorf("RunsAveraged = %d, want 1", m.RunsAveraged) + } + // Recall@1: q1=1/3, q2=1 -> mean = (0.3333+1)/2 = 0.66667 + wantR1 := (1.0/3.0 + 1.0) / 2.0 + if !almostEqual(m.RecallAt[1], wantR1) { + t.Errorf("mean Recall@1 = %v, want %v", m.RecallAt[1], wantR1) + } + // MRR: q1=1.0, q2=1.0 -> 1.0 + if !almostEqual(m.MRR, 1.0) { + t.Errorf("MRR = %v, want 1.0", m.MRR) + } + if m.QueryCount != 2 { + t.Errorf("QueryCount = %d, want 2", m.QueryCount) + } +} diff --git a/bench/proxytools.go b/bench/proxytools.go index dda5edd4..555c9f3e 100644 --- a/bench/proxytools.go +++ b/bench/proxytools.go @@ -34,6 +34,7 @@ func ProxyToolsForMode(mode string) []Tool { ToolID: "mcpproxy:" + d.Name, Name: d.Name, Description: d.Description, + Schema: d.Schema, }) } return out diff --git a/bench/tokens.go b/bench/tokens.go index e61b3ed4..1fa74bad 100644 --- a/bench/tokens.go +++ b/bench/tokens.go @@ -46,12 +46,15 @@ const ( // Tool is a single tool definition the benchmark scores token cost over. 
It // matches the shape of both the Spec 065 corpus snapshot and the embedded -// proxy-tool fixture. +// proxy-tool fixture. Schema is optional: the committed corpus snapshot is +// description-only (nil schema), while the live run (live.go) populates it with +// each tool's full JSON input schema for the exact-token headline. type Tool struct { - ToolID string `json:"tool_id"` - Server string `json:"server"` - Name string `json:"tool"` - Description string `json:"description"` + ToolID string `json:"tool_id"` + Server string `json:"server"` + Name string `json:"tool"` + Description string `json:"description"` + Schema json.RawMessage `json:"schema,omitempty"` } // Corpus is a frozen, versioned set of tool definitions. @@ -111,6 +114,22 @@ func (t *Tokenizer) CountTool(tl Tool) int { return t.Count(tl.Name + "\n" + tl.Description) } +// CountToolWithSchema returns the context-token cost of a tool definition +// INCLUDING its JSON input schema (name + description + schema). This is the +// authoritative per-tool context cost an agent actually pays. A tool with no +// schema counts identically to CountTool, so mixing schema-bearing (live) and +// schemaless tools in one report is well-defined. Used by the live run, where +// both the baseline upstream tools AND the proxy management tools carry their +// real schemas — counting schemas on BOTH sides is what keeps the headline +// savings honest rather than overstated. +func (t *Tokenizer) CountToolWithSchema(tl Tool) int { + s := tl.Name + "\n" + tl.Description + if len(tl.Schema) > 0 { + s += "\n" + string(tl.Schema) + } + return t.Count(s) +} + func (t *Tokenizer) countTools(tools []Tool) int { total := 0 for _, tl := range tools { @@ -119,6 +138,15 @@ func (t *Tokenizer) countTools(tools []Tool) int { return total } +// countToolsWithSchema sums CountToolWithSchema over tools. 
+func (t *Tokenizer) countToolsWithSchema(tools []Tool) int { + total := 0 + for _, tl := range tools { + total += t.CountToolWithSchema(tl) + } + return total +} + // ModeResult is the per-mode context-cost outcome. type ModeResult struct { Mode string `json:"mode"` diff --git a/internal/server/bench_export.go b/internal/server/bench_export.go index 95987195..7020bb4d 100644 --- a/internal/server/bench_export.go +++ b/internal/server/bench_export.go @@ -1,6 +1,8 @@ package server import ( + "encoding/json" + mcpserver "github.com/mark3labs/mcp-go/server" "go.uber.org/zap" @@ -8,7 +10,8 @@ import ( ) // BenchProxyToolDef is a static built-in proxy/management tool definition -// (name + description) exposed for the in-repo benchmark harness (bench/). +// (name + description + JSON input schema) exposed for the in-repo benchmark +// harness (bench/). // // The benchmark scores the per-mode context cost an agent pays for mcpproxy's // own tools. That cost MUST reflect every tool the live routing-mode servers @@ -16,9 +19,16 @@ import ( // quarantine_security, search_servers, list_registries) that both modes append // via buildManagementTools — or the benchmark overstates the token savings // (MCP-3161 / Codex finding on PR #747). +// +// Schema is the exact JSON input schema the proxy advertises via tools/list, +// captured from the live builder. The benchmark's exact-token headline counts +// schemas on BOTH the baseline upstream tools and these proxy tools; omitting +// the proxy schemas while counting the baseline's would overstate savings (the +// MCP-3161 error), so Schema makes the headline honest without drift. 
type BenchProxyToolDef struct { Name string Description string + Schema json.RawMessage } // ProxyModeToolDefs returns the static built-in proxy + management tool @@ -48,10 +58,17 @@ func ProxyModeToolDefs(routingMode string) []BenchProxyToolDef { defs := make([]BenchProxyToolDef, 0, len(serverTools)) for _, st := range serverTools { - defs = append(defs, BenchProxyToolDef{ + def := BenchProxyToolDef{ Name: st.Tool.Name, Description: st.Tool.Description, - }) + } + // InputSchema marshals to the exact {"type":"object","properties":...} + // an agent receives via tools/list. A marshal failure leaves Schema nil + // (the benchmark then withholds the headline rather than undercount). + if raw, err := json.Marshal(st.Tool.InputSchema); err == nil { + def.Schema = raw + } + defs = append(defs, def) } return defs } From d48c2b7bc2866d1a5724aee7abefe51a8f024fa4 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 21:18:37 +0300 Subject: [PATCH 2/3] fix(bench): preserve upstream schemas through /api/v1/tools baseline ConvertGenericToolsToTyped read generic["schema"], but every producer of the generic tool map (runtime/server GetServerTools, mcp.go) emits the upstream input schema under "inputSchema". The /api/v1/tools response therefore dropped every schema, so the MCP-42a live benchmark baseline was silently a description-only token count instead of the required full-schema count, while still able to emit authoritative_headline=true. - Read "inputSchema" first in the converter, keep "schema" as a legacy fallback. - Gate the live headline on baseline schemas too (BaselineSchemasCounted via anyHaveSchema): a systematically schema-less baseline now withholds the headline instead of claiming a full-schema baseline it never had. - Tests: converter preserves inputSchema (+legacy schema fallback); headline withheld when the baseline carries no schemas. 
Related #748 --- bench/live_report.go | 74 +++++++++++++++++++-------- bench/live_report_test.go | 31 +++++++++++ internal/contracts/converters.go | 10 +++- internal/contracts/converters_test.go | 44 ++++++++++++++++ 4 files changed, 135 insertions(+), 24 deletions(-) diff --git a/bench/live_report.go b/bench/live_report.go index 3dc65df7..be8a4131 100644 --- a/bench/live_report.go +++ b/bench/live_report.go @@ -22,19 +22,23 @@ type LiveModeResult struct { // LiveTokenReport is the exact-token comparison from a live proxy, with the // baseline upstream tools counted WITH their full JSON input schemas. // -// AuthoritativeHeadline gates the savings percentage: it is only true when the -// proxy management tools were ALSO counted with their schemas. Counting schemas -// on the baseline but not the proxy side overstates savings — the exact error -// corrected in MCP-3161 — so when proxy schemas are absent the savings ratio is -// withheld and only raw token totals are reported. +// AuthoritativeHeadline gates the savings percentage: it is only true when +// schemas were counted on BOTH sides — the proxy management tools carry schemas +// (ProxySchemasCounted) AND the baseline upstream tools carry schemas +// (BaselineSchemasCounted). Counting schemas on one side only overstates or +// distorts savings — the exact error corrected in MCP-3161 — so when either side +// is schema-less the savings ratio is withheld and only raw token totals are +// reported. BaselineSchemasCounted also guards against a /api/v1/tools response +// that silently dropped upstream schemas (MCP-3167). 
type LiveTokenReport struct { - Encoding string `json:"encoding"` - UpstreamTools int `json:"upstream_tools"` - BaselineTokens int `json:"baseline_tokens"` - Modes []LiveModeResult `json:"modes"` - ProxySchemasCounted bool `json:"proxy_schemas_counted"` - AuthoritativeHeadline bool `json:"authoritative_headline"` - Notes []string `json:"notes"` + Encoding string `json:"encoding"` + UpstreamTools int `json:"upstream_tools"` + BaselineTokens int `json:"baseline_tokens"` + Modes []LiveModeResult `json:"modes"` + ProxySchemasCounted bool `json:"proxy_schemas_counted"` + BaselineSchemasCounted bool `json:"baseline_schemas_counted"` + AuthoritativeHeadline bool `json:"authoritative_headline"` + Notes []string `json:"notes"` } // LatencyReport summarizes proxy-side retrieve_tools search latency versus the @@ -123,28 +127,38 @@ func RunLive(ctx context.Context, client *LiveClient, golden *GoldenSet) (*LiveR // buildTokenReport counts the baseline upstream tools WITH schemas against each // proxy routing mode (rt = retrieve_tools, ce = code_execution), also counted -// with schemas. The headline savings is only emitted when EVERY proxy tool -// carries a schema; otherwise counting schemas on the baseline alone would -// overstate savings (MCP-3161), so the ratio is withheld and only raw token -// totals are reported. +// with schemas. The headline savings is only emitted when schemas were counted +// on BOTH sides: every proxy tool carries a schema AND the baseline upstream +// tools actually carry schemas. Counting schemas on only one side overstates (or +// distorts) savings — the exact error corrected in MCP-3161 — so otherwise the +// ratio is withheld and only raw token totals are reported. The baseline guard +// also catches a silently schema-less /api/v1/tools response (MCP-3167): if the +// management endpoint drops upstream schemas, no upstream tool has one and the +// headline is withheld rather than claiming a full-schema baseline it never had. 
func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport { baseTokens := tk.countToolsWithSchema(upstream) proxySchemasCounted := allHaveSchema(rt) && allHaveSchema(ce) + // A correct full-schema baseline has schemas on at least some upstream tools. + // Requiring ALL would wrongly fail on legitimately parameter-less tools, so + // "any" is the signal that schemas were not systematically dropped. + baselineSchemasCounted := anyHaveSchema(upstream) + authoritative := proxySchemasCounted && baselineSchemasCounted rep := &LiveTokenReport{ - Encoding: tk.encoding, - UpstreamTools: len(upstream), - BaselineTokens: baseTokens, - ProxySchemasCounted: proxySchemasCounted, + Encoding: tk.encoding, + UpstreamTools: len(upstream), + BaselineTokens: baseTokens, + ProxySchemasCounted: proxySchemasCounted, + BaselineSchemasCounted: baselineSchemasCounted, Modes: []LiveModeResult{ {Mode: ModeBaseline, ContextTools: len(upstream), Tokens: baseTokens}, {Mode: ModeRetrieveTools, ContextTools: len(rt), Tokens: tk.countToolsWithSchema(rt)}, {Mode: ModeCodeExecution, ContextTools: len(ce), Tokens: tk.countToolsWithSchema(ce)}, }, } - rep.AuthoritativeHeadline = proxySchemasCounted - if proxySchemasCounted { + rep.AuthoritativeHeadline = authoritative + if authoritative { for i := range rep.Modes { m := &rep.Modes[i] if m.Mode != ModeBaseline && baseTokens > 0 { @@ -154,6 +168,10 @@ func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport { rep.Notes = []string{ "Baseline counts upstream tools with full JSON input schemas from GET /api/v1/tools; proxy modes count the management tools with their schemas. Headline savings is authoritative.", } + } else if !baselineSchemasCounted { + rep.Notes = []string{ + "HEADLINE SAVINGS WITHHELD: no upstream baseline tool carried a JSON input schema, so the baseline is NOT the required full-schema token count — typically the /api/v1/tools response dropped upstream schemas (MCP-3167). 
Reporting savings now would compare a schema-less baseline against schema-counted proxy tools and DISTORT the reduction. Token totals are shown for transparency; the authoritative headline lands once the management endpoint emits upstream schemas.", + } } else { rep.Notes = []string{ "HEADLINE SAVINGS WITHHELD: the baseline upstream tools are counted with full schemas, but the proxy management tools (proxy_tools_v1.json) are description-only. Reporting savings now would count schemas on one side only and OVERSTATE the reduction — the exact error corrected in MCP-3161. Token totals are shown for transparency; the authoritative headline lands once proxy-tool schemas are captured live via MCP tools/list.", @@ -162,6 +180,18 @@ func buildTokenReport(tk *Tokenizer, upstream, rt, ce []Tool) *LiveTokenReport { return rep } +// anyHaveSchema reports whether at least one tool carries a non-empty schema. +// Used to detect a systematically schema-less baseline (every schema dropped) +// versus a corpus that merely contains some parameter-less tools. +func anyHaveSchema(tools []Tool) bool { + for _, t := range tools { + if len(t.Schema) > 0 { + return true + } + } + return false +} + func allHaveSchema(tools []Tool) bool { if len(tools) == 0 { return false diff --git a/bench/live_report_test.go b/bench/live_report_test.go index 92886737..9ad1bb4e 100644 --- a/bench/live_report_test.go +++ b/bench/live_report_test.go @@ -132,3 +132,34 @@ func TestBuildTokenReportWithholdsWhenProxySchemasMissing(t *testing.T) { t.Error("baseline tokens should still be reported for transparency") } } + +// TestBuildTokenReportWithholdsWhenBaselineSchemasMissing guards the other half +// of the MCP-3161 valve: if the baseline upstream tools were counted WITHOUT +// schemas (e.g. the /api/v1/tools converter dropped them), the headline must be +// withheld even when the proxy management tools do carry schemas — otherwise the +// report falsely claims a full-schema baseline (MCP-3132/MCP-3167). 
+func TestBuildTokenReportWithholdsWhenBaselineSchemasMissing(t *testing.T) { + tk, err := NewTokenizer(DefaultEncoding) + if err != nil { + t.Fatalf("tokenizer: %v", err) + } + upstreamSchemaless := []Tool{{Name: "big", Description: "d"}} // schema dropped + rt := []Tool{{Name: "retrieve_tools", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}} + ce := []Tool{{Name: "code_execution", Description: "d", Schema: json.RawMessage(`{"type":"object"}`)}} + + rep := buildTokenReport(tk, upstreamSchemaless, rt, ce) + if rep.AuthoritativeHeadline { + t.Error("headline must be withheld when the baseline upstream tools carry no schemas") + } + if rep.BaselineSchemasCounted { + t.Error("BaselineSchemasCounted must be false when no upstream tool has a schema") + } + for _, m := range rep.Modes { + if m.SavingsRatio != 0 { + t.Errorf("savings ratio must be withheld (0), got %v for %q", m.SavingsRatio, m.Mode) + } + } + if rep.BaselineTokens <= 0 { + t.Error("baseline tokens should still be reported for transparency") + } +} diff --git a/internal/contracts/converters.go b/internal/contracts/converters.go index 9ca45805..a99ab6e3 100644 --- a/internal/contracts/converters.go +++ b/internal/contracts/converters.go @@ -374,8 +374,14 @@ func ConvertGenericToolsToTyped(genericTools []map[string]interface{}) []Tool { tool.Usage = usage } - // Extract schema - if schema, ok := generic["schema"].(map[string]interface{}); ok { + // Extract schema. Every generic-map producer (runtime.GetServerTools, + // server.GetServerTools, mcp.go) emits the upstream input schema under the + // "inputSchema" key, so that is the authoritative source; "schema" is kept + // as a legacy fallback. Reading only "schema" silently dropped every schema + // from the /api/v1/tools response (MCP-3132/MCP-3167). 
+ if schema, ok := generic["inputSchema"].(map[string]interface{}); ok { + tool.Schema = schema + } else if schema, ok := generic["schema"].(map[string]interface{}); ok { tool.Schema = schema } diff --git a/internal/contracts/converters_test.go b/internal/contracts/converters_test.go index 58afc995..41db0c7e 100644 --- a/internal/contracts/converters_test.go +++ b/internal/contracts/converters_test.go @@ -256,3 +256,47 @@ func TestConvertGenericServersToTyped_NoOAuth(t *testing.T) { require.Len(t, servers, 1) assert.Nil(t, servers[0].OAuth, "Servers without OAuth config should have nil OAuth field") } + +// TestConvertGenericToolsToTyped_PreservesInputSchema asserts the management +// tool-list conversion keeps the upstream input schema. Every producer of the +// generic tool map emits the schema under the "inputSchema" key (e.g. +// internal/runtime/runtime.go:2141 GetServerTools, internal/server/server.go:2367), +// so a converter that only reads "schema" silently drops every schema on the +// /api/v1/tools response. Regression guard for MCP-3132/MCP-3167: without real +// schemas the live benchmark baseline is no longer a full-schema token count. 
+func TestConvertGenericToolsToTyped_PreservesInputSchema(t *testing.T) { + inputSchema := map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "path": map[string]interface{}{"type": "string"}, + }, + } + generic := []map[string]interface{}{ + { + "name": "read_file", + "server_name": "fs", + "description": "Read a file", + "inputSchema": inputSchema, // key the runtime/server map builders actually emit + }, + } + + typed := ConvertGenericToolsToTyped(generic) + + require.Len(t, typed, 1) + assert.Equal(t, inputSchema, typed[0].Schema, "input schema must survive conversion to the /api/v1/tools response") +} + +// TestConvertGenericToolsToTyped_SchemaLegacyFallback keeps the historical +// "schema" key working so any in-process caller that still emits it is not +// regressed by the inputSchema fix. +func TestConvertGenericToolsToTyped_SchemaLegacyFallback(t *testing.T) { + schema := map[string]interface{}{"type": "object"} + generic := []map[string]interface{}{ + {"name": "t", "server_name": "s", "description": "d", "schema": schema}, + } + + typed := ConvertGenericToolsToTyped(generic) + + require.Len(t, typed, 1) + assert.Equal(t, schema, typed[0].Schema, "legacy schema key must still be honored") +} From 06027863ed5fd5e074714729497b80076a9abe9e Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 22 Jun 2026 21:46:17 +0300 Subject: [PATCH 3/3] fix(bench): conform live retrieval report to Spec 065 score-report schema Addresses CodexReviewer finding on PR #748 / MCP-3167: the live `retrieval` payload emitted flat metric fields, but score-report.schema.json requires nested `retrieval.metrics` + `retrieval.gate`. Restructure RetrievalMetrics into {metrics, gate} so live_report.json validates against the contract, proven by a new jsonschema-validation test (TestRetrievalMetricsConformsToScoreReportSchema). 
A standalone live run has no stored baseline, so gate.passed is true by construction (CI regression-gating against a committed baseline is MCP-3133). Co-Authored-By: Paperclip --- bench/README.md | 7 +++-- bench/cmd/bench/main.go | 2 +- bench/live_report_test.go | 2 +- bench/metrics.go | 55 ++++++++++++++++++++++++--------- bench/metrics_schema_test.go | 60 ++++++++++++++++++++++++++++++++++++ bench/metrics_test.go | 11 ++++--- 6 files changed, 114 insertions(+), 23 deletions(-) create mode 100644 bench/metrics_schema_test.go diff --git a/bench/README.md b/bench/README.md index 24188f42..73323f51 100644 --- a/bench/README.md +++ b/bench/README.md @@ -122,8 +122,11 @@ What it adds over the offline token run: - **Accuracy.** Replays `retrieval_golden_v1.json` through the proxy's BM25 search (`GET /api/v1/index/search`) and scores **Recall@{1,3,5,10}, MRR, nDCG@10, MAP** against the graded labels. Deterministic (BM25), so a single - run is reported (`runs_averaged: 1`). Metric field names mirror the Spec 065 - `score-report.schema.json` `retrieval` block. + run is reported (`runs_averaged: 1`). The emitted `retrieval` block **conforms + to** the Spec 065 `score-report.schema.json` shape — nested `metrics` + `gate` + (verified by a schema-validation test). A standalone live run has no stored + baseline to regress against, so `gate.passed` is `true` by construction; + CI regression-gating against a committed baseline is the MCP-3133 lane. - **Latency.** Client-measured per-query search latency (p50/p95/p99/max) vs. the one-shot cost of loading all tools. Measured client-side on purpose: the server's `SearchToolsResponse.took` field is currently a `"0ms"` stub. 
diff --git a/bench/cmd/bench/main.go b/bench/cmd/bench/main.go index 7ffb0122..4bfeebbb 100644 --- a/bench/cmd/bench/main.go +++ b/bench/cmd/bench/main.go @@ -104,7 +104,7 @@ func runLive(proxy, apiKey, goldenPath, outDir string) { } r := report.Retrieval fmt.Fprintf(os.Stdout, " accuracy (%d queries): Recall@1=%.3f Recall@5=%.3f MRR=%.3f nDCG@10=%.3f MAP=%.3f\n", - r.QueryCount, r.RecallAt[1], r.RecallAt[5], r.MRR, r.NDCGAt10, r.MAP) + r.QueryCount, r.Metrics.RecallAt[1], r.Metrics.RecallAt[5], r.Metrics.MRR, r.Metrics.NDCGAt10, r.Metrics.MAP) l := report.Latency fmt.Fprintf(os.Stdout, " latency (%d searches): p50=%.1fms p95=%.1fms p99=%.1fms max=%.1fms; load-all-tools=%.1fms\n", l.Samples, l.P50ms, l.P95ms, l.P99ms, l.MaxMs, l.LoadAllToolsMs) diff --git a/bench/live_report_test.go b/bench/live_report_test.go index 9ad1bb4e..618d5fa4 100644 --- a/bench/live_report_test.go +++ b/bench/live_report_test.go @@ -98,7 +98,7 @@ func TestRunLiveAuthoritativeHeadline(t *testing.T) { } } // Accuracy: perfect ranking for the one query. - if rep.Retrieval == nil || rep.Retrieval.RecallAt[1] != 1.0 { + if rep.Retrieval == nil || rep.Retrieval.Metrics.RecallAt[1] != 1.0 { t.Errorf("expected Recall@1=1.0, got %+v", rep.Retrieval) } // Latency populated. diff --git a/bench/metrics.go b/bench/metrics.go index ddfc1f3b..ba70e74b 100644 --- a/bench/metrics.go +++ b/bench/metrics.go @@ -151,18 +151,38 @@ func AveragePrecision(ranked []string, labels []Label) float64 { // returns the ranked tool IDs (most relevant first), limited to `limit`. type SearchFunc func(query string, limit int) (ranked []string, err error) -// RetrievalMetrics is the aggregated retrieval-quality report over a golden -// set. Field names mirror the Spec 065 score-report.schema.json `retrieval` -// block so the report can be emitted to that contract. +// RetrievalMetricValues holds the aggregated metric numbers. It is the +// `retrieval.metrics` object of the Spec 065 score-report.schema.json contract. 
+type RetrievalMetricValues struct { + RecallAt map[int]float64 `json:"recall_at"` + MRR float64 `json:"mrr"` + NDCGAt10 float64 `json:"ndcg_at_10"` + MAP float64 `json:"map"` +} + +// RetrievalGate is the `retrieval.gate` object of the score-report contract. +// +// A standalone live run has no stored baseline to regress against, so the gate +// cannot fail by construction: Passed is true and Metric/Tolerance are empty. +// Regression gating against a committed baseline is the CI lane's job (MCP-3133) +// — that run fills Metric/Tolerance and can set Passed=false. +type RetrievalGate struct { + Passed bool `json:"passed"` + Metric string `json:"metric,omitempty"` + Tolerance float64 `json:"tolerance,omitempty"` +} + +// RetrievalMetrics is the aggregated retrieval-quality report over a golden set. +// Its JSON shape IS the Spec 065 score-report.schema.json `retrieval` block +// (nested `metrics` + `gate`), so a live report's retrieval payload validates +// against that contract directly. 
type RetrievalMetrics struct { - CorpusVersion string `json:"corpus_version"` - GoldenVersion string `json:"golden_version,omitempty"` - RunsAveraged int `json:"runs_averaged"` - QueryCount int `json:"query_count"` - RecallAt map[int]float64 `json:"recall_at"` - MRR float64 `json:"mrr"` - NDCGAt10 float64 `json:"ndcg_at_10"` - MAP float64 `json:"map"` + CorpusVersion string `json:"corpus_version"` + GoldenVersion string `json:"golden_version,omitempty"` + RunsAveraged int `json:"runs_averaged"` + QueryCount int `json:"query_count,omitempty"` + Metrics RetrievalMetricValues `json:"metrics"` + Gate RetrievalGate `json:"gate"` } // ScoreRetrieval replays every golden query through search and aggregates @@ -204,9 +224,14 @@ func ScoreRetrieval(golden *GoldenSet, search SearchFunc, ks []int) (*RetrievalM CorpusVersion: golden.CorpusVersion, RunsAveraged: 1, QueryCount: len(golden.Queries), - RecallAt: recallAt, - MRR: mrrSum / n, - NDCGAt10: ndcgSum / n, - MAP: mapSum / n, + Metrics: RetrievalMetricValues{ + RecallAt: recallAt, + MRR: mrrSum / n, + NDCGAt10: ndcgSum / n, + MAP: mapSum / n, + }, + // No baseline compared in a standalone live run, so the regression gate + // cannot fail (see RetrievalGate). CI fills this in against a baseline. + Gate: RetrievalGate{Passed: true}, }, nil } diff --git a/bench/metrics_schema_test.go b/bench/metrics_schema_test.go new file mode 100644 index 00000000..a8a5eb60 --- /dev/null +++ b/bench/metrics_schema_test.go @@ -0,0 +1,60 @@ +package bench + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/santhosh-tekuri/jsonschema/v6" +) + +// TestRetrievalMetricsConformsToScoreReportSchema proves the live retrieval +// payload validates against the Spec 065 score-report contract — i.e. the +// `retrieval` object carries the required nested `metrics` and `gate` +// sub-objects, not flat fields (CodexReviewer finding on PR #748 / MCP-3167). 
+func TestRetrievalMetricsConformsToScoreReportSchema(t *testing.T) { + golden := &GoldenSet{ + CorpusVersion: "corpus_v1", + Queries: []GoldenQuery{ + {ID: "q1", Query: "x", Labels: []Label{{ToolID: "A", Relevance: 2}}}, + }, + } + search := func(_ string, _ int) ([]string, error) { return []string{"A"}, nil } + m, err := ScoreRetrieval(golden, search, []int{1, 3, 5, 10}) + if err != nil { + t.Fatalf("ScoreRetrieval: %v", err) + } + + // A score report may hold the retrieval block alone (security is optional). + raw, err := json.Marshal(map[string]any{"retrieval": m}) + if err != nil { + t.Fatalf("marshal: %v", err) + } + inst, err := jsonschema.UnmarshalJSON(strings.NewReader(string(raw))) + if err != nil { + t.Fatalf("parse instance: %v", err) + } + + schemaFile := filepath.Join("..", "specs", "065-evaluation-foundation", "contracts", "score-report.schema.json") + schemaRaw, err := os.ReadFile(schemaFile) + if err != nil { + t.Fatalf("read schema: %v", err) + } + schemaDoc, err := jsonschema.UnmarshalJSON(strings.NewReader(string(schemaRaw))) + if err != nil { + t.Fatalf("parse schema: %v", err) + } + c := jsonschema.NewCompiler() + if err := c.AddResource("score-report.schema.json", schemaDoc); err != nil { + t.Fatalf("add schema: %v", err) + } + sch, err := c.Compile("score-report.schema.json") + if err != nil { + t.Fatalf("compile schema: %v", err) + } + if err := sch.Validate(inst); err != nil { + t.Fatalf("live retrieval payload fails score-report.schema.json: %v", err) + } +} diff --git a/bench/metrics_test.go b/bench/metrics_test.go index 48ef34e9..bee64439 100644 --- a/bench/metrics_test.go +++ b/bench/metrics_test.go @@ -103,12 +103,15 @@ func TestScoreRetrieval(t *testing.T) { } // Recall@1: q1=1/3, q2=1 -> mean = (0.3333+1)/2 = 0.66667 wantR1 := (1.0/3.0 + 1.0) / 2.0 - if !almostEqual(m.RecallAt[1], wantR1) { - t.Errorf("mean Recall@1 = %v, want %v", m.RecallAt[1], wantR1) + if !almostEqual(m.Metrics.RecallAt[1], wantR1) { + t.Errorf("mean Recall@1 = 
%v, want %v", m.Metrics.RecallAt[1], wantR1) } // MRR: q1=1.0, q2=1.0 -> 1.0 - if !almostEqual(m.MRR, 1.0) { - t.Errorf("MRR = %v, want 1.0", m.MRR) + if !almostEqual(m.Metrics.MRR, 1.0) { + t.Errorf("MRR = %v, want 1.0", m.Metrics.MRR) + } + if !m.Gate.Passed { + t.Error("Gate.Passed should be true for a baseline-free run") } if m.QueryCount != 2 { t.Errorf("QueryCount = %d, want 2", m.QueryCount)