# bench.yml
name: Benchmark Dashboard
# Triggered on stable release tags — runs the live benchmark and publishes the
# dashboard to Cloudflare Pages (mcpproxy-bench project).
#
# Non-blocking: a bench failure never gates the release pipeline.
#
# Why a host binary instead of bench/docker-compose.yml:
#   The Dockerfile uses a distroless runtime image that lacks npx/uvx. The 7
#   snapshot-server configs spawn stdio servers via npx/uvx, which need to run
#   in the same environment as mcpproxy. The eval.yml retrieval-d1 job solves
#   this by building the binary and running it on the host runner (where
#   Node.js and uv are installed). We follow the same pattern here.
#   The docker-compose.yml is kept for local development; a future PR can add
#   a bench-specific image that includes the runtime tools.
#
# Reports are never committed (Spec 065 CN-003) — they are published only as
# CI artifacts and Cloudflare Pages deployments.
# Triggers: any pushed tag matching v*, plus manual dispatch. Narrowing to
# stable (non-prerelease) tags is done by the job-level `if:` condition.
on:
  push:
    tags: ["v*"]
  workflow_dispatch:
# Workflow token is limited to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Single job: build mcpproxy on the host runner, run the offline and live
  # benchmarks, then publish bench/results as a CI artifact and as a
  # Cloudflare Pages deployment.
  bench-dashboard:
    name: Run benchmark and publish dashboard
    runs-on: ubuntu-latest
    environment: production
    # Stable releases only — RC/prerelease tags (v*-rc.*, v*-next.*) are handled
    # by prerelease.yml. workflow_dispatch allows manual runs from any ref.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (startsWith(github.ref, 'refs/tags/v') &&
      !contains(github.ref_name, '-') &&
      github.repository == 'smart-mcp-proxy/mcpproxy-go')
    # Non-blocking: bench failure never blocks the release.
    continue-on-error: true
    steps:
      - name: Checkout
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Set up Node.js (npx-launched MCP reference servers)
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
        with:
          node-version: "22"

      - name: Set up uv (uvx-launched MCP reference servers)
        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39  # v8.2.0

      - name: Build mcpproxy (personal edition)
        run: go build -o mcpproxy ./cmd/mcpproxy

      # Offline benchmark: deterministic token-reduction scores, no live servers.
      # Writes bench/results/report.json and bench/results/dashboard.html.
      - name: Run offline benchmark
        run: go run ./bench/cmd/bench -out bench/results

      # Live benchmark: boot mcpproxy with the 7 no-auth reference servers, wait
      # for the full tool catalog, then score accuracy + latency + full-schema
      # tokens. Writes bench/results/live_report.json.
      - name: Boot mcpproxy with reference servers and run live benchmark
        env:
          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
          BASE: 'http://127.0.0.1:8092'
          KEY: eval-corpus-snapshot
        run: |
          # errexit (-e) is already enabled by the runner's default bash
          # invocation; add nounset and pipefail on top of it.
          set -uo pipefail
          mkdir -p "$RUNNER_TEMP/bench"

          # Start the proxy in the background; its output goes to a log file
          # that is tailed on failure.
          ./mcpproxy serve \
            --config "$DS/snapshot-servers.config.json" \
            --data-dir "$RUNNER_TEMP/bench" \
            --listen 127.0.0.1:8092 \
            --log-level info > "$RUNNER_TEMP/mcpproxy-bench.log" 2>&1 &
          server_pid=$!
          # Make sure the background server is stopped on every exit path.
          trap 'kill "$server_pid" 2>/dev/null || true' EXIT

          # Wait for the full tool catalog before scoring: the retrieval index
          # is built after all servers connect. The 7 reference servers expose
          # ~45 tools; `expected` is the minimum count accepted as "full".
          ready=0
          expected=44
          for i in $(seq 1 60); do
            # Bail out early if the server process has died.
            if ! kill -0 "$server_pid" 2>/dev/null; then
              echo "::error::mcpproxy exited during startup"
              tail -40 "$RUNNER_TEMP/mcpproxy-bench.log" || true
              exit 1
            fi
            # --max-time bounds each probe so a hung connection cannot stall
            # the polling loop.
            t="$(curl -fsS --max-time 10 -H "X-API-Key: $KEY" "$BASE/api/v1/tools" \
              | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)"
            # With pipefail, a failed curl after python already printed would
            # yield multi-line output; treat anything non-numeric as 0.
            case "$t" in
              '' | *[!0-9]*) t=0 ;;
            esac
            echo "attempt $i: catalog has $t tool(s)"
            if [ "$t" -ge "$expected" ]; then
              echo "Catalog full ($t tools); settling 8s for index build."
              sleep 8
              ready=1
              break
            fi
            sleep 5
          done

          if [ "$ready" != "1" ]; then
            echo "::error::mcpproxy catalog did not reach ${expected} tools in 5 minutes"
            tail -80 "$RUNNER_TEMP/mcpproxy-bench.log" || true
            exit 1
          fi

          go run ./bench/cmd/bench \
            -live \
            -proxy "$BASE" \
            -api-key "$KEY" \
            -out bench/results
          kill "$server_pid" 2>/dev/null || true

      # Serve dashboard.html at the root URL for Cloudflare Pages.
      - name: Prepare dashboard index
        run: cp bench/results/dashboard.html bench/results/index.html

      # Artifact names may not contain "/" and similar characters, but a manual
      # run can start from a ref such as "feature/x". Replace every character
      # outside [A-Za-z0-9._-] with "-"; tag names like v1.2.3 are unchanged.
      - name: Compute artifact name
        id: artifact
        if: always()
        env:
          REF_NAME: ${{ github.ref_name }}
        run: |
          safe_ref="$(printf '%s' "$REF_NAME" | tr -c 'A-Za-z0-9._-' '-')"
          echo "name=bench-dashboard-${safe_ref}" >> "$GITHUB_OUTPUT"

      # Always upload results as a CI artifact — available even if Pages deploy
      # fails.
      - name: Upload dashboard artifact
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: bench/results/
          retention-days: 90
          if-no-files-found: warn

      # Publish to Cloudflare Pages (mcpproxy-bench project).
      # On first deploy, wrangler auto-creates the project; subsequent deploys
      # update it. The Pages URL will be mcpproxy-bench.pages.dev (or a custom
      # domain such as bench.mcpproxy.app once configured in Cloudflare).
      # NOTE(review): tag builds check out a detached HEAD and no --branch is
      # passed — confirm wrangler targets the project's production branch
      # rather than creating a preview deployment.
      - name: Deploy benchmark dashboard to Cloudflare Pages
        uses: cloudflare/wrangler-action@ebbaa1584979971c8614a24965b4405ff95890e0  # v4.0.0
        with:
          apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          command: pages deploy bench/results --project-name=mcpproxy-bench