diff --git a/examples/eval/eval_delegate.py b/examples/eval/eval_delegate.py index c52c1c9c2..cdea6c158 100644 --- a/examples/eval/eval_delegate.py +++ b/examples/eval/eval_delegate.py @@ -91,6 +91,12 @@ def _rebuild_delegate_config( env_cfg = build_skills_eval_env_config(args, env, defaults) if env_cfg is not None: envs.append(env_cfg) + elif env_name == "terminal_bench": + from examples.eval.terminal_bench.tb_config import build_terminal_bench_config + + env_cfg = build_terminal_bench_config(args, env, defaults) + if env_cfg is not None: + envs.append(env_cfg) else: raise ValueError(f"Unknown delegate environment: {env_name}") return envs @@ -151,6 +157,10 @@ def _create_delegate(env_cfg: EvalEnvConfig, router_addr: str): from examples.eval.nemo_skills.skills_client import SkillsEvalClient return SkillsEvalClient.from_config(env_cfg, router_addr) + elif env_name == "terminal_bench": + from examples.eval.terminal_bench.tb_client import TerminalBenchClient + + return TerminalBenchClient.from_config(env_cfg, router_addr) logger.warning("No delegate client registered for environment: %s", env_name) return None diff --git a/examples/eval/README.md b/examples/eval/nemo_skills/README.md similarity index 100% rename from examples/eval/README.md rename to examples/eval/nemo_skills/README.md diff --git a/examples/eval/scripts/eval_tb_example.yaml b/examples/eval/scripts/eval_tb_example.yaml new file mode 100644 index 000000000..5104ae6e1 --- /dev/null +++ b/examples/eval/scripts/eval_tb_example.yaml @@ -0,0 +1,29 @@ +eval: + defaults: + n_samples_per_eval_prompt: 1 + temperature: 0.6 + top_p: 0.95 + top_k: -1 + max_response_len: 24576 + datasets: # these eval tasks go through slime dataset config and default rollout function (slime.rollout.sglang_rollout.generate_rollout) + - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa + path: /root/gpqa/gpqa_eval.jsonl + rm_type: gpqa + n_samples_per_eval_prompt: 2 + - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench + path: /root/ifbench/IFBench_eval.jsonl + rm_type: ifbench + n_samples_per_eval_prompt: 1 + delegate: + - name: terminal_bench + url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine + timeout_secs: 86400 # 24 hours + max_retries: 1 # HTTP request retries from Slime to the TB server + model_name: qwen3-8b + api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh + dataset_path: /mnt/data/xinyu/program/slime-tb/terminal-bench/tasks # Dataset path on the host machine + # task_ids: + # - hello-world + # n_tasks: 10 + n_attempts: 1 # TB task-level retries (per task within tb run) + n_concurrent: 8 \ No newline at end of file diff --git a/examples/eval/scripts/run-eval-tb-qwen.sh b/examples/eval/scripts/run-eval-tb-qwen.sh new file mode 100644 index 000000000..67434f8ec --- /dev/null +++ b/examples/eval/scripts/run-eval-tb-qwen.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +# Example launcher that reuses the Qwen3-8B recipe but delegates evaluation to an +# external Terminal Bench server via the eval_delegate_rollout wrapper. + +# Clean up any stale processes from a previous run. 
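+# NOTE: `pkill -9` is indiscriminate and will also kill any other sglang/ray/python processes on this host.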
+pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python + +set -ex + +export PYTHONBUFFERED=16 +export SLIME_HOST_IP=${SLIME_HOST_IP:-"127.0.0.1"} + +MODEL_DIR="${MODEL_DIR:-/root/.cache}" +export MODEL_DIR + +NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) +if [ "$NVLINK_COUNT" -gt 0 ]; then + HAS_NVLINK=1 +else + HAS_NVLINK=0 +fi +echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)" +source "${REPO_ROOT}/scripts/models/qwen3-8B.sh" + +# Store eval/delegate settings in a YAML config similar to examples/eval_multi_task. +EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/eval_tb_example.yaml"} + +CKPT_ARGS=( + --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1 + --ref-load ${MODEL_DIR}/OpenThinker-Agent-v1_torch_dist + # --load ${MODEL_DIR}/OpenThinker-Agent-v1_slime/ + --save ${MODEL_DIR}/OpenThinker-Agent-v1_slime/ + --save-interval 20 +) + +ROLLOUT_ARGS=( + --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --rm-type deepscaler + --num-rollout 3000 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-temperature 0.8 + --global-batch-size 256 + --balance-data +) + +EVAL_ARGS=( + --eval-interval 5 + --eval-config "${EVAL_CONFIG_PATH}" + --eval-function-path examples.eval.eval_delegate_rollout.generate_rollout +) + +PERF_ARGS=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 9216 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.00 + --kl-loss-type low_var_kl + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 +) + +WANDB_ARGS=( + --use-wandb + --wandb-project slime-eval + --wandb-group qwen3-8b-eval + --wandb-key ${WANDB_KEY} # export WANDB_KEY="your_key" +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 1 + --sglang-mem-fraction-static 0.7 + --sglang-router-port 30005 +) + +MISC_ARGS=( + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash +) + +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +export CUDA_VISIBLE_DEVICES=0,1 + +ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \ + --disable-usage-stats \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8266 \ + --dashboard-agent-listen-port 52366 \ + --dashboard-agent-grpc-port 52367 \ + --runtime-env-agent-port 52368 + + +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/root/Megatron-LM/\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\" + } +}" + +ray job submit --address="http://${MASTER_ADDR}:8266" \ + --working-dir "${REPO_ROOT}" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 train.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node 2 \ + --colocate \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + 
${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${WANDB_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${EVAL_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} diff --git a/examples/eval/terminal_bench/README-cn.md b/examples/eval/terminal_bench/README-cn.md new file mode 100644 index 000000000..057a945b2 --- /dev/null +++ b/examples/eval/terminal_bench/README-cn.md @@ -0,0 +1,122 @@ +# Terminal Bench 评估集成 + +本目录将 Terminal Bench (TB) 封装为 Slime 的评估委托(Eval Delegate)。评估过程在宿主机(Host)上通过 `tb` CLI 执行,Slime 负责读取并汇总各项指标,包括 `accuracy`、`n_resolved`、`n_unresolved`、`pass_at_k/*` 以及 Token 统计数据(如 `total_input_tokens_mean/median` 和 `total_output_tokens_mean/median`)。 + +## 运行架构 + +* **Slime 内部**:运行训练/评估主循环;调用 TB delegate client。 +* **宿主机(Host)**:运行 TB delegate server (`tb_server.py`),由其执行 `tb run ...`。 +* **Server逻辑**:读取最新的 TB JSON 结果并将各项指标返回给 Slime。 + +## 1) 获取代码 (宿主机) + +```bash +mkdir slime-tb +cd slime-tb +git clone https://github.com/THUDM/slime.git +git clone https://github.com/laude-institute/terminal-bench +``` + +## 2) 启动 Slime 容器 + +```bash +docker run \ + -itd \ + --gpus all \ + --shm-size 32g \ + --network host \ + --ipc=host \ + --privileged \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + -v /mnt/data/.cache:/root/.cache \ + -v $(pwd):/shared/slime-tb \ + --name \ + slimerl/slime:latest \ + /bin/bash +``` + +## 3) 进入 Slime 容器 + +```bash +docker exec -it /bin/bash +``` + +## 4) 配置 Terminal Bench 环境 (宿主机) + +在运行 `tb_server.py` 的宿主机上执行: + +```bash +# 在宿主机终端执行(非 Docker 内部) +uv venv --python 3.13 .venv +source .venv/bin/activate +uv pip install terminal-bench/. +uv pip install -r slime/examples/eval/terminal_bench/requirements.txt +``` + +*如果仓库路径不是 `./slime` 和 `./terminal-bench`,请根据实际路径调整。* + +## 5) 启动 Terminal Bench server + +在宿主机上启动(即 `tb` 命令可用的环境): + +```bash +python slime/examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9051 \ + --output-root tb_eval_output +``` + +**该脚本的功能:** + +* 默认设置 `OPENAI_API_KEY=EMPTY`。 +* 执行 `tb run -a terminus-2 -m openai/ ... 
--n-concurrent 8`。 +* 等待运行完成后,返回 `accuracy`、`pass_at_k` 以及 Token 消耗等统计数据。 + +## 6) 运行评估脚本 (示例) + +如果使用提供的 Qwen 评估启动脚本 (`run-eval-tb-qwen.sh`),请按以下步骤操作: + +**更新路径**:将 `eval_tb_example.yaml` 中的 `dataset_path` 修改为宿主机上 `terminal-bench/tasks` 的**绝对路径**(注意不是 Docker 内部路径)。 + +**下载模型**:在 Slime 容器内下载 HuggingFace 权重: +```bash +huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ +--local-dir /root/.cache/OpenThinker-Agent-v1 +``` + +**格式转换**:将 HuggingFace 权重转换为 Slime 的 torch distributed 格式。在 Slime 根目录下执行: +```bash +cd /shared/slime-tb/slime +source scripts/models/qwen3-8B.sh + +export PYTHONPATH=/root/Megatron-LM:/shared/slime-tb/slime + +python tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ + --save /root/.cache/OpenThinker-Agent-v1_torch_dist +``` + +**开始评估**:在 Slime 容器内运行: +```bash +bash slime/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log +``` + +*为了快速测试,可以在 `eval_tb_example.yaml` 中通过 `task_ids` 指定特定任务,或通过 `n_tasks` 限制评估任务的数量。* + +## 7) 常见问题 + +当在 Docker 容器中使用 `--network host` 运行 Slime 时,Ray 可能由于与宿主机共享网络而出现端口冲突。 + +这会导致 Ray 启动失败,或报 Redis/会话相关错误。通常可以在启动 Ray head 时显式指定未占用端口来解决,比如设置非默认的 `--port` 和 `--dashboard-port`。 + +有时甚至会导致 Ray job 提交失败,提示没有可用 agent 接受任务。这通常是 dashboard agent 或 runtime env agent 的端口也发生冲突。此时可在启动 Ray 时指定这些端口(如 `--dashboard-agent-listen-port`、`--dashboard-agent-grpc-port`、`--runtime-env-agent-port`)来解决。 + +如果 TB server无法通过 sglang router 连接到 Slime(`InternalServerError`),请检查 router 端口(例如 30005)实际监听的地址,并更新 `eval_tb_example.yaml` 中的 `api_base`: + +```bash +ss -lntp | grep 30005 +``` + +TB server开始接受请求后,可能会在输出中看到 `Parser warnings`、`Context length exceeded`、`Command 1 should end with newline`、`Harness execution failed`等。这些是Terminal Bench 的警告,如果正常运行可以忽略。 \ No newline at end of file diff --git a/examples/eval/terminal_bench/README.md b/examples/eval/terminal_bench/README.md new file mode 100644 index 000000000..125bb1756 --- /dev/null +++ b/examples/eval/terminal_bench/README.md @@ -0,0 +1,129 @@ +# Terminal Bench Eval + +This folder wires Terminal Bench (TB) into Slime as an eval delegate. The TB run happens on the host via the `tb` CLI, and Slime reads back aggregated metrics such as `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats like `total_input_tokens_mean/median` and `total_output_tokens_mean/median`. + +## What runs where + +- Slime runs your training/eval loop inside the Docker container. +- Slime calls the TB delegate client. +- The TB delegate server (`tb_server.py`) runs `tb run ...` on the host. +- The server reads the latest TB JSON results and returns metrics to Slime. + +## 1) Get the code (host) + +```bash +mkdir slime-tb +cd slime-tb +git clone https://github.com/THUDM/slime.git +git clone https://github.com/laude-institute/terminal-bench +``` + +## 2) Launch the Slime container + +```bash +docker run \ + -itd \ + --gpus all \ + --shm-size 32g \ + --network host \ + --ipc=host \ + --privileged \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + -v /mnt/data/.cache:/root/.cache \ + -v $(pwd):/shared/slime-tb \ + --name \ + slimerl/slime:latest \ + /bin/bash +``` + +## 3) Inside the Slime container + +```bash +docker exec -it /bin/bash +``` + +## 4) Terminal Bench environment (host) + +Run on the machine that will host `tb_server.py` (where you cloned both repos): + +```bash +# Host machine terminal (outside Docker) +uv venv --python 3.13 .venv +source .venv/bin/activate + +uv pip install terminal-bench/. 
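+# The local terminal-bench checkout provides the `tb` CLI that tb_server.py invokes.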
+uv pip install -r slime/examples/eval/terminal_bench/requirements.txt +``` + +Notes: +- Use your local repo paths if they are not `./slime` and `./terminal-bench`. + +## 5) Start the Terminal Bench server + +Run on the host (same machine where `tb` works): + +```bash +python slime/examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9051 \ + --output-root tb_eval_output +``` + +What it does: +- Uses `OPENAI_API_KEY=EMPTY` +- Runs `tb run -a terminus-2 -m openai/ ... --n-concurrent 8` +- Waits for completion, then returns `accuracy`, `n_resolved`, + `n_unresolved`, `pass_at_k/*`, and token stats such as + `total_input_tokens_mean/median` and `total_output_tokens_mean/median` + +## 6) Run the eval script (example) + +If you use the provided Qwen eval launcher (`run-eval-tb-qwen.sh`), follow the steps below to run Terminal-Bench evaluation. + +First, update the `dataset_path` in `eval_tb_example.yaml` to the local path of `terminal-bench/tasks` on your host (not an internal Docker-only path). + +Then download the HuggingFace model checkpoint inside the Slime container: + +```bash +huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ +--local-dir /root/.cache/OpenThinker-Agent-v1 +``` + +After downloading, convert the HuggingFace checkpoint to Slime's torch distributed format. From the Slime root directory, run: + +```bash +cd /shared/slime-tb/slime +source scripts/models/qwen3-8B.sh + +export PYTHONPATH=/root/Megatron-LM:/shared/slime-tb/slime + +python tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ + --save /root/.cache/OpenThinker-Agent-v1_torch_dist +``` + +Finally, run the following command inside the Slime container: + +```bash +bash slime/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log +``` + +For convenience, you can restrict the evaluation scope in `eval_tb_example.yaml`, either by specifying a single task or multiple tasks (`task_ids`), or by limiting the number of tasks via `n_tasks`. + +## 7) Common Issues + +When running Slime inside a Docker container with `--network host`, Ray may encounter port conflicts due to shared networking with the host. + +In some cases, this manifests as Ray failing to start or reporting Redis- or session-related errors. This can usually be resolved by explicitly assigning unused ports when starting the Ray head node, for example by setting a non-default `--port` and `--dashboard-port`. + +In more severe cases, Ray job submission may fail with errors indicating that no available agent can accept jobs. This typically happens when the dashboard agent or runtime environment agent ports are also in conflict. In such situations, explicitly specifying the agent-related ports (e.g. `--dashboard-agent-listen-port`, `--dashboard-agent-grpc-port`, and `--runtime-env-agent-port`) when starting Ray can resolve the issue. + +If the TB server cannot connect to the Slime server through the sglang router (`InternalServerError`), check which address is actually listening on the router port (e.g. 30005 in this example) and update the `api_base` in `eval_tb_example.yaml` accordingly: + +```bash +ss -lntp | grep 30005 +``` + +You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. 
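
If you need to debug the delegate server without going through Slime, you can drive the same `/evaluate` and `/status` endpoints directly. The snippet below is only a minimal sketch; the host, port, dataset path, and task id are placeholders that must match your own setup.

```python
# Minimal sketch: submit one TB eval job to the delegate server and poll until it settles.
import time

import requests

BASE = "http://172.17.0.1:9051"  # must match the host/port tb_server.py was started with

assert requests.get(f"{BASE}/health", timeout=10).json()["status"] == "ok"

job = requests.post(
    f"{BASE}/evaluate",
    json={
        "model_name": "qwen3-8b",                 # the server prefixes this to openai/qwen3-8b
        "api_base": "http://127.0.0.1:30005/v1",  # sglang router address
        "dataset_path": "/path/to/terminal-bench/tasks",
        "task_ids": ["hello-world"],
        "n_concurrent": 1,
    },
    timeout=60,
).json()

# /evaluate returns immediately with a job_id; poll /status/<job_id> until it finishes.
while True:
    status = requests.get(f"{BASE}{job['status_url']}", timeout=30).json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(10)

print(status.get("raw_metrics") or status.get("error"))
```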
\ No newline at end of file diff --git a/examples/eval/terminal_bench/__init__.py b/examples/eval/terminal_bench/__init__.py new file mode 100644 index 000000000..6d2704250 --- /dev/null +++ b/examples/eval/terminal_bench/__init__.py @@ -0,0 +1 @@ +"""Terminal Bench evaluation helpers.""" diff --git a/examples/eval/terminal_bench/requirements.txt b/examples/eval/terminal_bench/requirements.txt new file mode 100644 index 000000000..1a0006c93 --- /dev/null +++ b/examples/eval/terminal_bench/requirements.txt @@ -0,0 +1,3 @@ +flask +omegaconf +requests diff --git a/examples/eval/terminal_bench/tb_client.py b/examples/eval/terminal_bench/tb_client.py new file mode 100644 index 000000000..2a93b7161 --- /dev/null +++ b/examples/eval/terminal_bench/tb_client.py @@ -0,0 +1,104 @@ +import logging +import time +from typing import Any + +import requests +from examples.eval.eval_delegate import EvalClient, EvalDelegateError +from examples.eval.terminal_bench.tb_config import TerminalBenchConfig + +logger = logging.getLogger(__name__) + + +class TerminalBenchClient(EvalClient): + """HTTP client that proxies evaluation requests to the Terminal Bench server.""" + + def __init__(self, config: TerminalBenchConfig, router_url: str): + super().__init__(config.name or "terminal_bench") + self._config = config + endpoint = (config.url or "").rstrip("/") + if endpoint.endswith("/evaluate"): + base_endpoint = endpoint[: -len("/evaluate")] + else: + base_endpoint = endpoint + self._endpoint = f"{base_endpoint}/evaluate" if base_endpoint else "" + self._status_endpoint = f"{base_endpoint}/status" if base_endpoint else "" + self._timeout_secs = float(config.timeout_secs) + self._max_retries = max(1, int(config.max_retries)) + self._headers = dict(config.headers or {}) + self._session = requests.Session() + + @classmethod + def from_config(cls, config: TerminalBenchConfig, router_url: str): + if not config.url: + return None + return cls(config, router_url) + + def evaluate(self, args, rollout_id: int) -> tuple[dict[str, Any], dict[str, Any]]: + payload = self._build_payload(args, rollout_id) + response = self._request(payload) + metrics = response.get("raw_metrics", {}) + return metrics, response + + def _build_payload(self, args, rollout_id: int) -> dict[str, Any]: + payload = { + "model_name": self._config.model_name, + "api_base": self._config.api_base, + "n_tasks": self._config.n_tasks, + "n_concurrent": self._config.n_concurrent, + "metric_prefix": self._config.name, + } + if self._config.dataset_path: + payload["dataset_path"] = self._config.dataset_path + if self._config.task_ids: + payload["task_ids"] = list(self._config.task_ids) + if self._config.n_attempts is not None: + payload["n_attempts"] = self._config.n_attempts + return payload + + def _request(self, payload: dict[str, Any]) -> dict[str, Any]: + last_error: Exception | None = None + for attempt in range(1, self._max_retries + 1): + try: + response = self._session.post( + self._endpoint, + json=payload, + timeout=self._timeout_secs, + headers=self._headers, + ) + response.raise_for_status() + if not response.content: + return {} + body = response.json() + if body.get("status") == "completed": + return body + job_id = body.get("job_id") + if not job_id: + return body + return self._poll_status(job_id) + except requests.RequestException as exc: + last_error = exc + logger.warning( + "Terminal Bench delegate request failed (attempt %s/%s): %s", attempt, self._max_retries, exc + ) + if attempt < self._max_retries: + time.sleep(min(2**attempt, 30)) + 
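+        # Every retry attempt failed; surface the last HTTP error to the caller.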
raise EvalDelegateError("Terminal Bench evaluation request failed") from last_error + + def _poll_status(self, job_id: str) -> dict[str, Any]: + status_url = f"{self._status_endpoint}/{job_id}" + deadline = time.time() + self._timeout_secs + while time.time() < deadline: + response = self._session.get(status_url, timeout=min(self._timeout_secs, 30), headers=self._headers) + response.raise_for_status() + if not response.content: + time.sleep(2) + continue + body = response.json() + status = body.get("status") + if status == "completed": + return body + if status == "failed": + error = body.get("error") or "Terminal Bench job failed" + raise EvalDelegateError(error) + time.sleep(2) + raise EvalDelegateError("Terminal Bench evaluation timed out") diff --git a/examples/eval/terminal_bench/tb_config.py b/examples/eval/terminal_bench/tb_config.py new file mode 100644 index 000000000..adb4f2c30 --- /dev/null +++ b/examples/eval/terminal_bench/tb_config.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Any + +from examples.eval.eval_delegate import EvalEnvConfig + + +@dataclass +class TerminalBenchConfig(EvalEnvConfig): + """Environment configuration shared by the Terminal Bench client/server.""" + + model_name: str = "qwen3-8b" + api_base: str = "http://127.0.1.1:30001/v1" + dataset_path: str | None = None + n_tasks: int | None = None + task_ids: list[str] = field(default_factory=list) + n_attempts: int | None = None + n_concurrent: int = 8 + + @classmethod + def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]) -> TerminalBenchConfig: + clean_raw = dict(raw_env_config or {}) + clean_raw.pop("type", None) + base_cfg: TerminalBenchConfig = super().parse(clean_raw, defaults) + + field_casts = { + "model_name": str, + "api_base": str, + "n_attempts": int, + "n_tasks": int, + "n_concurrent": int, + "dataset_path": str, + } + + for key, caster in field_casts.items(): + value = clean_raw.get(key) + if value is not None: + setattr(base_cfg, key, caster(value)) + + task_ids = clean_raw.get("task_ids") + if isinstance(task_ids, (list, tuple)): + base_cfg.task_ids = [str(item) for item in task_ids if item] + elif task_ids is not None: + raise ValueError("task_ids must be a list") + + return base_cfg + + + +def build_terminal_bench_config(args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]): + return TerminalBenchConfig.parse(args, raw_env_config, defaults) diff --git a/examples/eval/terminal_bench/tb_server.py b/examples/eval/terminal_bench/tb_server.py new file mode 100644 index 000000000..a43537faa --- /dev/null +++ b/examples/eval/terminal_bench/tb_server.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python3 +""" +Simple HTTP server that proxies Slime evaluation requests to the `tb run` +command shipped with Terminal Bench. + +Usage: + python examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9050 \ + --output-root /opt/tb-eval + +Slime (or Slime-compatible runners) should POST the payload described in +`EvalRequestPayload` to http://:/evaluate. The server blocks until +`tb run` finishes, then returns aggregated metrics along with paths to the +generated artifacts (logs + raw metrics). 
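
An illustrative request body (field names follow `EvalRequestPayload`; all values
below are placeholders rather than required defaults):

    {
        "model_name": "qwen3-8b",
        "api_base": "http://127.0.0.1:30005/v1",
        "dataset_path": "/path/to/terminal-bench/tasks",
        "task_ids": ["hello-world"],
        "n_attempts": 1,
        "n_concurrent": 8,
        "metric_prefix": "terminal_bench"
    }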
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shlex +import subprocess +import sys +import threading +import time +import uuid +import statistics +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[3] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from flask import Flask, jsonify, request +from omegaconf import OmegaConf +from omegaconf.errors import OmegaConfBaseException + +logger = logging.getLogger("terminal_bench_server") +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") + + +# --------------------------------------------------------------------------- +# Request payload helpers +# --------------------------------------------------------------------------- + + +@dataclass +class EvalRequestPayload: + model_name: str = "" + api_base: str = "" + n_tasks: int | None = None + n_concurrent: int | None = None + dataset_path: str | None = None + task_ids: list[str] | None = None + n_attempts: int | None = None + metric_prefix: str | None = None + + +@dataclass +class JobRecord: + job_id: str + status: str + run_id: str + command: str + output_dir: str + log_path: str + raw_metrics: dict[str, Any] | None = None + error: str | None = None + created_at: float = field(default_factory=time.time) + started_at: float | None = None + finished_at: float | None = None + + def to_dict(self) -> dict[str, Any]: + payload: dict[str, Any] = { + "job_id": self.job_id, + "status": self.status, + "run_id": self.run_id, + "command": self.command, + "output_dir": self.output_dir, + "log_path": self.log_path, + "created_at": self.created_at, + "started_at": self.started_at, + "finished_at": self.finished_at, + } + if self.raw_metrics is not None: + payload["raw_metrics"] = self.raw_metrics + if self.error: + payload["error"] = self.error + return payload + + +# --------------------------------------------------------------------------- +# Configuration + command helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model_name: str) -> str: + name = (model_name or "").strip() + if not name: + return "" + if "/" in name: + return name + return f"openai/{name}" + + +@dataclass +class ServerConfig: + output_root: Path + + @classmethod + def from_args(cls, args: argparse.Namespace) -> "ServerConfig": + return cls(output_root=Path(args.output_root).expanduser().resolve()) + + +class TerminalBenchEvaluator: + def __init__(self, config: ServerConfig): + self._config = config + self._lock = threading.Lock() + self._jobs_lock = threading.Lock() + self._jobs: dict[str, JobRecord] = {} + self._config.output_root.mkdir(parents=True, exist_ok=True) + self._log_root = REPO_ROOT.parent / "tb_eval_logs" + self._log_root.mkdir(parents=True, exist_ok=True) + + def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: + if not payload.model_name: + raise ValueError("Missing `model_name` in request payload.") + if not payload.api_base: + raise ValueError("Missing `api_base` in request payload.") + + job_id = uuid.uuid4().hex + run_id = f"{int(time.time())}-{job_id[:8]}" + run_dir = self._config.output_root / run_id + + command = self._build_command(payload, run_id) + command_str = " ".join(shlex.quote(part) for part in command) + log_path = self._log_root / f"{run_id}.log" + + record = JobRecord( + job_id=job_id, + status="queued", + 
run_id=run_id, + command=command_str, + output_dir=str(run_dir), + log_path=str(log_path), + ) + with self._jobs_lock: + self._jobs[job_id] = record + + thread = threading.Thread( + target=self._run_job, + args=(job_id, payload, run_dir, command, log_path), + daemon=True, + ) + thread.start() + + return { + "job_id": job_id, + "status": "queued", + "status_url": f"/status/{job_id}", + "run_id": run_id, + "command": command_str, + "output_dir": str(run_dir), + "log_path": str(log_path), + } + + def _run_job( + self, + job_id: str, + payload: EvalRequestPayload, + run_dir: Path, + command: list[str], + log_path: Path, + ) -> None: + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "running" + record.started_at = time.time() + + env = self._build_env() + logger.info("Starting Terminal Bench run: %s", " ".join(shlex.quote(part) for part in command)) + try: + with self._lock: + self._run_command(command, env=env, log_path=log_path) + metrics = self._collect_metrics(run_dir) + if payload.metric_prefix: + metrics = {payload.metric_prefix: metrics} + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "completed" + record.raw_metrics = metrics + record.finished_at = time.time() + except Exception as exc: # noqa: BLE001 + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "failed" + record.error = str(exc) + record.finished_at = time.time() + + def get_job_status(self, job_id: str) -> dict[str, Any] | None: + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return None + return record.to_dict() + + def _build_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: + # 1. Normalize model name (add openai/ prefix) + model_name = _normalize_model_name(payload.model_name) + + cmd = [ + "tb", + "run", + "-a", + "terminus-2", # Added Agent flag + "--output-path", + str(self._config.output_root), + "--run-id", + run_id, + ] + + # 2. Add model + if model_name: + cmd.extend(["--model", model_name]) + + # 3. Add Agent kwargs (Use api_base exactly like the CLI command) + if payload.api_base: + cmd.extend(["--agent-kwarg", f"api_base={payload.api_base}"]) + + if payload.dataset_path: + cmd.extend(["--dataset-path", payload.dataset_path]) + + if payload.n_attempts is not None: + cmd.extend(["--n-attempts", str(payload.n_attempts)]) + + # 4. Add n_tasks if present + task_ids = [] + if payload.task_ids: + task_ids.extend([str(item) for item in payload.task_ids if item]) + if task_ids: + for task_id in task_ids: + cmd.extend(["--task-id", task_id]) + elif payload.n_tasks is not None: + cmd.extend(["--n-tasks", str(payload.n_tasks)]) + + # 5. 
Add concurrency + n_concurrent = payload.n_concurrent + if n_concurrent is None: + n_concurrent = 1 + cmd.extend(["--n-concurrent", str(n_concurrent)]) + + return cmd + + def _build_env(self) -> dict[str, str]: + env = os.environ.copy() + # Inject env var to simulate "OPENAI_API_KEY=EMPTY" + env["OPENAI_API_KEY"] = "EMPTY" + return env + + @staticmethod + def _run_command(cmd: list[str], *, env: dict[str, str], log_path: Path): + with open(log_path, "w", encoding="utf-8") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + text=True, + bufsize=1, + ) + assert process.stdout is not None + for line in process.stdout: + log_file.write(line) + log_file.flush() + sys.stdout.write(line) + sys.stdout.flush() + retcode = process.wait() + if retcode != 0: + with open(log_path, encoding="utf-8", errors="ignore") as log_file: + tail = "".join(log_file.readlines()[-200:]) + raise RuntimeError(f"`tb run` failed with exit code {retcode}. See {log_path}\n{tail}") + + @staticmethod + def _collect_metrics(run_dir: Path) -> dict[str, Any]: + metrics_path = run_dir / "results.json" + if not metrics_path.exists(): + logger.warning("Results file missing at %s", metrics_path) + return {} + + metrics = TerminalBenchEvaluator._extract_metrics(metrics_path) + if not metrics: + logger.warning("No accuracy/n_resolved metrics found in %s", metrics_path) + return metrics + + @staticmethod + def _extract_metrics(metrics_path: Path) -> dict[str, Any]: + try: + with open(metrics_path, encoding="utf-8") as fp: + metrics_data = json.load(fp) + except json.JSONDecodeError as exc: + logger.warning("Failed to parse %s: %s", metrics_path, exc) + return {} + + metrics: dict[str, Any] = {} + + # core metrics + accuracy = metrics_data.get("accuracy") + if isinstance(accuracy, (int, float)): + metrics["accuracy"] = float(accuracy) + + n_resolved = metrics_data.get("n_resolved") + if isinstance(n_resolved, (int, float)): + metrics["n_resolved"] = int(n_resolved) + + n_unresolved = metrics_data.get("n_unresolved") + if isinstance(n_unresolved, (int, float)): + metrics["n_unresolved"] = int(n_unresolved) + + # pass@k flatten + pass_at_k = metrics_data.get("pass_at_k") + if isinstance(pass_at_k, dict): + for k, v in pass_at_k.items(): + if isinstance(v, (int, float)): + metrics[f"pass_at_k/{k}"] = float(v) + + # token stats from per-task results + results = metrics_data.get("results") + if isinstance(results, list): + input_tokens = [ + r.get("total_input_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_input_tokens"), (int, float)) + ] + output_tokens = [ + r.get("total_output_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_output_tokens"), (int, float)) + ] + + if input_tokens: + metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) + metrics["total_input_tokens_median"] = float(statistics.median(input_tokens)) + if output_tokens: + metrics["total_output_tokens_mean"] = float(statistics.mean(output_tokens)) + metrics["total_output_tokens_median"] = float(statistics.median(output_tokens)) + + return metrics + + +# --------------------------------------------------------------------------- +# HTTP server +# --------------------------------------------------------------------------- + + +def build_app(evaluator: TerminalBenchEvaluator) -> Flask: + app = Flask(__name__) + + @app.get("/health") + def health_check(): + return jsonify({"status": "ok"}) + + @app.post("/evaluate") + def 
evaluate_endpoint(): + try: + raw_payload = request.get_json(force=True, silent=False) + cfg = OmegaConf.merge( + OmegaConf.structured(EvalRequestPayload), + OmegaConf.create(raw_payload or {}), + ) + payload = OmegaConf.to_object(cfg) + result = evaluator.evaluate(payload) + return jsonify(result) + except OmegaConfBaseException as exc: + logger.exception("Invalid request payload") + return jsonify({"error": str(exc)}), 400 + except Exception as exc: # noqa: BLE001 + logger.exception("Evaluation failed") + return jsonify({"error": str(exc)}), 500 + + @app.get("/status/") + def status_endpoint(job_id: str): + status = evaluator.get_job_status(job_id) + if status is None: + return jsonify({"error": "job not found"}), 404 + return jsonify(status) + + return app + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run the Terminal Bench evaluation HTTP server.") + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=9050) + parser.add_argument( + "--output-root", + type=str, + default="./terminal-bench-output", + help="Directory to store `tb run` outputs.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + config = ServerConfig.from_args(args) + evaluator = TerminalBenchEvaluator(config) + app = build_app(evaluator) + logger.info( + "Starting Terminal Bench evaluation server on %s:%s (output root=%s)", + args.host, + args.port, + config.output_root, + ) + app.run(host=args.host, port=args.port) + + +if __name__ == "__main__": + main()
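
For reference, the metrics extraction above only reads a handful of fields from `results.json`. The snippet below is a small smoke test of `_extract_metrics` against a hypothetical, truncated results file (the real `tb run` output contains more fields, which are ignored); it assumes the repo root is on `PYTHONPATH` and the requirements in this folder are installed.

```python
# Sketch of a local smoke test for the metrics extraction in tb_server.py.
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from examples.eval.terminal_bench.tb_server import TerminalBenchEvaluator

# Hypothetical, truncated results.json content covering only the fields the extractor reads.
sample = {
    "accuracy": 0.5,
    "n_resolved": 1,
    "n_unresolved": 1,
    "pass_at_k": {"1": 0.5},
    "results": [
        {"total_input_tokens": 1200, "total_output_tokens": 300},
        {"total_input_tokens": 1800, "total_output_tokens": 500},
    ],
}

with TemporaryDirectory() as tmp:
    path = Path(tmp) / "results.json"
    path.write_text(json.dumps(sample))
    print(TerminalBenchEvaluator._extract_metrics(path))
    # -> {'accuracy': 0.5, 'n_resolved': 1, 'n_unresolved': 1, 'pass_at_k/1': 0.5,
    #     'total_input_tokens_mean': 1500.0, 'total_input_tokens_median': 1500.0,
    #     'total_output_tokens_mean': 400.0, 'total_output_tokens_median': 400.0}
```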