Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions examples/eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,27 @@ docker run \
```

## 4) Inside the Skills container

Set openai api key:
```bash
export OPENAI_API_KEY=none
```

Clone repos and install the Skills package:
```bash
git clone -b slime_skills https://github.com/guapisolo/slime.git /opt/slime
git clone -b main https://github.com/THUDM/slime /opt/slime
git clone -b slime https://github.com/guapisolo/Skills.git /opt/Skills

cd /opt/Skills
pip install -e .
```

Download/prepare datasets:
```bash
# Download/prepare datasets:
cd /opt/Skills/nemo_skills/dataset
python3 aime25/prepare.py
python3 hle/prepare.py
python3 arena-hard/prepare.py
```

Start the skills server:
```bash
# Start the skills server:
cd /opt/slime
python examples/eval/nemo_skills/skills_server.py \
--host 0.0.0.0 \
Expand Down
18 changes: 14 additions & 4 deletions examples/eval/scripts/multi_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ eval:
path: /root/ifbench/IFBench_eval.jsonl
rm_type: ifbench
n_samples_per_eval_prompt: 1
- name: tau2-airline
path: /root/tau2-bench/data/tau2/airline_test_tasks.jsonl
custom_generate_function_path: examples.tau2-bench.generate_with_tau2.generate
input_key: task_id
label_key: null
apply_chat_template: False
top_k: 1
max_response_len: 1024
max_context_len: 40000
n_samples_per_eval_prompt: 1
delegate: # these tasks go through delegate eval function (examples.eval.eval_delegate_rollout.generate_rollout)
- name: skills
# this url should align with env docker network alias
Expand All @@ -24,9 +34,9 @@ eval:
datasets:
- name: aime25
max_response_len: 8192
n_samples_per_eval_prompt: 8
- name: arena-hard
n_samples_per_eval_prompt: 2
- name: arena-hard
n_samples_per_eval_prompt: 1
max_response_len: 24576
- name: hle
max_response_len: 32768

max_response_len: 24576
6 changes: 3 additions & 3 deletions examples/eval/scripts/run-qwen3-4B.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ MISC_ARGS=(
)

export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export CUDA_VISIBLE_DEVICES=6,7
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 2 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
# export CUDA_VISIBLE_DEVICES=4,5,6,7
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

RUNTIME_ENV_JSON="{
\"env_vars\": {
Expand All @@ -136,7 +136,7 @@ ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json="${RUNTIME_ENV_JSON}" \
-- python3 train.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node 2 \
--actor-num-gpus-per-node 8 \
--colocate \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
Expand Down
72 changes: 72 additions & 0 deletions examples/tau2-bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Tau2 bench with slime

This example mirrors `examples/tau-bench`, but plugs the newer tau2 gym environment into slime rollouts.

## Setup

Use the `zhuzilin/slime:latest` image and initialize the environment required for Tau2-Bench:
```bash
cd /root/
git clone https://github.com/slimerl/slime.git
cd slime
pip install -e .
# for tau2 bench
cd /root/
git clone https://github.com/sierra-research/tau2-bench.git
cd tau2-bench
pip install -e .
```

Use the following script to generate mock data for slime training.

```bash
cd /root/slime
python examples/tau2-bench/tau2_mock.py \
--output-dir /root/tau2-bench/data/tau2
```
Initialize the Qwen3-4B-Instruct-2507 model needed for tool use:

```bash
# hf checkpoint
huggingface-cli download Qwen/Qwen3-4B-Instruct-2507 --local-dir /root/Qwen3-4B-Instruct-2507

# mcore checkpoint
cd /root/slime
source scripts/models/qwen3-4B-Instruct-2507.sh
PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
${MODEL_ARGS[@]} \
--hf-checkpoint /root/Qwen3-4B-Instruct-2507 \
--save /root/Qwen3-4B-Instruct-2507_torch_dist
```

## Running the Script

The custom rollout entrypoint is `examples.tau2-bench.generate_with_tau2.generate`. A sample launcher is provided in `examples/tau2-bench/run_tau2_qwen3_4B.sh`; the important CLI flags are:

```bash
--prompt-data /root/tau2-bench/data/tau2/airline_train_tasks.jsonl
--input-key task_id
--custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
```

You need to configure your litellm API in `generate_with_tau2.py` for user simulation:

```python
TAU2_CONFIGS = {
"domain": "airline", # tau2 domain: airline | retail | telecom | mock
"task_split": "train", # task split within the domain
"max_steps": 100, # safety cap on interaction steps
"user_llm": "gpt-4.1-mini", # LiteLLM model name for user simulator
"solo_mode": False, # set True to disable user simulator
}
# Replace with your actual API key for the user simulator; leaving the
# placeholder "NONE" makes the script fall back to the OPENAI_API_KEY
# environment variable.
API_KEY = "NONE"
TAU2_CONFIGS["user_llm_args"] = {"api_key": API_KEY}
```

And run:

```bash
cd /root/slime
bash examples/tau2-bench/run_tau2_qwen3_4B.sh
```
1 change: 1 addition & 0 deletions examples/tau2-bench/configs/strip_think.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
rollout_strip_think: true
59 changes: 59 additions & 0 deletions examples/tau2-bench/generate_with_tau2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Tau2-Bench integration for slime Training.

Configure the domain/task split below, point slime at this file via
--custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
"""

import logging
import os
from typing import Any

from slime.utils.types import Sample

from .trainable_agent import Tau2TrainableAgent, res_to_sample

logger = logging.getLogger(__name__)

# Base configuration (edit here as needed).
# Base configuration (edit here as needed).
TAU2_CONFIGS: dict[str, Any] = {
    "domain": "airline",  # tau2 domain: airline | retail | telecom | mock
    "task_split": "train",  # task split within the domain
    "max_steps": 100,  # safety cap on interaction steps
    # Explicit gemini provider prefix to avoid Vertex ADC path.
    # "user_llm": "gemini/gemini-2.5-flash-lite",
    "user_llm": "gpt-4.1",
    "user_llm_args": {},  # will inject api_key below
    "solo_mode": False,  # set True to disable user simulator
}

# Replace with your actual API key for user simulator (LiteLLM).
# The "NONE" placeholder defers to the OPENAI_API_KEY environment variable.
# Fall back to the placeholder (rather than None) when the variable is
# unset, so LiteLLM always receives a string for api_key.
API_KEY = "NONE"
if API_KEY == "NONE":
    API_KEY = os.getenv("OPENAI_API_KEY", API_KEY)
# Also pass through args to force gemini path
TAU2_CONFIGS["user_llm_args"] = {"api_key": API_KEY}


async def generate(args: Any, sample: Sample, sampling_params: dict) -> Sample:
    """Run one tau2-bench episode for ``sample`` and return the result.

    Entry point wired in via ``--custom-generate-function-path
    examples.tau2-bench.generate_with_tau2.generate``.

    Args:
        args: slime rollout arguments. This is an attribute-style object
            (``args.partial_rollout`` is read below), not a plain dict —
            annotation corrected from ``dict[str, Any]`` accordingly.
        sample: the slime sample; ``sample.prompt`` carries the task id.
        sampling_params: sampling parameters forwarded to the agent.

    Returns:
        A ``Sample`` built from the finished tau2 interaction.
    """
    # Resuming a half-finished multi-turn tau2 episode is not supported,
    # so reject partial rollout outright.
    assert not args.partial_rollout, "Partial rollout is not supported for tau2."

    # One fresh agent per rollout, configured from the module-level
    # TAU2_CONFIGS (domain, task split, user simulator LLM, ...).
    agent = Tau2TrainableAgent(
        args=args,
        sampling_params=sampling_params,
        domain=TAU2_CONFIGS["domain"],
        task_split=TAU2_CONFIGS["task_split"],
        max_steps=TAU2_CONFIGS["max_steps"],
        user_llm=TAU2_CONFIGS["user_llm"],
        user_llm_args=TAU2_CONFIGS.get("user_llm_args") or {},
        solo_mode=TAU2_CONFIGS["solo_mode"],
    )

    # Map the sample's prompt (a task id) onto the agent's task list.
    task_id, task_index = agent._resolve_task_id(sample.prompt)  # noqa: SLF001 - simple helper
    logger.info("Starting tau2 rollout for task_id=%s (index=%s)", task_id, task_index)

    # Drive the full multi-turn interaction, then convert the episode
    # result into a slime Sample for training.
    interaction_result = await agent.run_episode(task_id)
    result_sample = res_to_sample(interaction_result, task_index)

    logger.info("Finished tau2 rollout for task_id=%s", task_id)
    return result_sample
152 changes: 152 additions & 0 deletions examples/tau2-bench/run_tau2_qwen3_4B.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/bin/bash

# Best-effort cleanup of any leftover processes from a previous run so
# the new job starts from a clean slate (pkill may match nothing).
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# Prevent Python (and thus ray workers) from buffering stdout/stderr.
# FIX: the variable is PYTHONUNBUFFERED — "PYTHONBUFFERED" is a typo and
# has no effect. Any non-empty value enables unbuffered mode.
export PYTHONUNBUFFERED=1

# Detect NVLink by counting NV<n> link entries in the GPU topology matrix
# (empty / zero when nvidia-smi is missing or no NVLink is present).
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
# Arithmetic comparison yields 1 when any NVLink reference was found.
HAS_NVLINK=$((NVLINK_COUNT > 0))
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Resolve this script's directory and pull in the shared model arguments.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/../../scripts/models/qwen3-4B-Instruct-2507.sh"

# Checkpoint paths: HF checkpoint for rollout, torch-dist for training;
# save every 20 rollouts.
CKPT_ARGS=(
    --hf-checkpoint /root/Qwen3-4B-Instruct-2507/
    --ref-load /root/Qwen3-4B-Instruct-2507_torch_dist/
    --load /root/Qwen3-4B-Instruct-2507_slime/
    --save /root/Qwen3-4B-Instruct-2507_slime/
    --save-interval 20
)

# Rollout configuration: training prompts come from the mock airline
# train split; prompts are looked up by task_id.
ROLLOUT_ARGS=(
    --prompt-data /root/tau2-bench/data/tau2/airline_train_tasks.jsonl
    --input-key task_id
    --rollout-shuffle
    --num-rollout 500
    --rollout-batch-size 16
    --n-samples-per-prompt 4
    --rollout-max-response-len 1024
    --rollout-temperature 0.8
    --global-batch-size 64
    --balance-data
)

# Periodic evaluation on the held-out airline test split.
EVAL_ARGS=(
    --eval-interval 5
    --eval-prompt-data airline-test /root/tau2-bench/data/tau2/airline_test_tasks.jsonl
    --n-samples-per-eval-prompt 1
    --eval-max-response-len 1024
    --eval-top-k 1
    --eval-input-key task_id
)

# Megatron parallelism and activation-recompute settings.
PERF_ARGS=(
    --tensor-model-parallel-size 2
    --sequence-parallel
    --pipeline-model-parallel-size 1
    --context-parallel-size 1
    --expert-model-parallel-size 1
    --expert-tensor-parallel-size 1
    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1
    --use-dynamic-batch-size
    --max-tokens-per-gpu 9216
)

# GRPO advantage estimation; KL and entropy terms are declared but
# zero-weighted here.
GRPO_ARGS=(
    --advantage-estimator grpo
    --use-kl-loss
    --kl-loss-coef 0.00
    --kl-loss-type low_var_kl
    --entropy-coef 0.00
    --eps-clip 0.2
    --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
    --optimizer adam
    --lr 1e-6
    --lr-decay-style constant
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.98
)

# Weights & Biases logging.
WANDB_ARGS=(
    --use-wandb
    --wandb-project slime-tau2
    --wandb-group qwen3-4B
    # NOTE(review): requires WANDB_KEY to be exported before running;
    # unquoted expansion word-splits if the key ever contains whitespace.
    --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
    --rollout-num-gpus-per-engine 1
    --sglang-mem-fraction-static 0.7
    # If gemini API reports concurrency limit error, set this parameter to reduce the concurrency
    # --sglang-server-concurrency 32
)

MISC_ARGS=(
    # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
    # need to comment this when using model with MLA
    --attention-backend flash
)

# Route sample generation through the tau2 episode runner defined in
# examples/tau2-bench/generate_with_tau2.py.
CUSTOM_ARGS=(
    --custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
)
# Launch the master node of ray inside the container.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# If you want more or fewer GPUs, change this parameter.
NUM_GPUS=2
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus "${NUM_GPUS}" --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 --temp-dir /root/shared/ray_temp

# Runtime env for the ray job: make Megatron, this example directory and
# the tau2-bench checkout importable inside workers.
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/:${SCRIPT_DIR}:/root/tau2-bench\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\"
  }
}"

# FIX: quote every array expansion ("${arr[@]}") so arguments containing
# spaces survive word-splitting intact (ShellCheck SC2068).
# NOTE(review): DISTRIBUTED_ARGS is not defined in this script — it is
# presumably provided by the sourced model script; confirm.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 train.py \
  --actor-num-nodes 1 \
  --actor-num-gpus-per-node "${NUM_GPUS}" \
  --rollout-num-gpus "${NUM_GPUS}" \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${DISTRIBUTED_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${SGLANG_ARGS[@]}" \
  "${MISC_ARGS[@]}" \
  "${CUSTOM_ARGS[@]}"
Loading