Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions examples/eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,27 @@ docker run \
```

## 4) Inside the Skills container

Set openai api key:
```bash
export OPENAI_API_KEY=none
```

Clone repos and install the Skills package:
```bash
git clone -b slime_skills https://github.com/guapisolo/slime.git /opt/slime
git clone -b main https://github.com/THUDM/slime /opt/slime
git clone -b slime https://github.com/guapisolo/Skills.git /opt/Skills

cd /opt/Skills
pip install -e .
```

Download/prepare datasets:
```bash
# Download/prepare datasets:
cd /opt/Skills/nemo_skills/dataset
python3 aime25/prepare.py
python3 hle/prepare.py
python3 arena-hard/prepare.py
```

Start the skills server:
```bash
# Start the skills server:
cd /opt/slime
python examples/eval/nemo_skills/skills_server.py \
--host 0.0.0.0 \
Expand Down
18 changes: 14 additions & 4 deletions examples/eval/scripts/multi_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ eval:
path: /root/ifbench/IFBench_eval.jsonl
rm_type: ifbench
n_samples_per_eval_prompt: 1
- name: tau2-airline
path: /root/tau2-bench/data/tau2/airline_test_tasks.jsonl
custom_generate_function_path: examples.tau2-bench.generate_with_tau2.generate
input_key: task_id
label_key: null
apply_chat_template: False
top_k: 1
max_response_len: 1024
max_context_len: 40000
n_samples_per_eval_prompt: 1
delegate: # these tasks go through delegate eval function (examples.eval.eval_delegate_rollout.generate_rollout)
- name: skills
# this url should align with env docker network alias
Expand All @@ -24,9 +34,9 @@ eval:
datasets:
- name: aime25
max_response_len: 8192
n_samples_per_eval_prompt: 8
- name: arena-hard
n_samples_per_eval_prompt: 2
- name: arena-hard
n_samples_per_eval_prompt: 1
max_response_len: 24576
- name: hle
max_response_len: 32768

max_response_len: 24576
6 changes: 3 additions & 3 deletions examples/eval/scripts/run-qwen3-4B.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ MISC_ARGS=(
)

export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export CUDA_VISIBLE_DEVICES=6,7
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 2 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
# export CUDA_VISIBLE_DEVICES=4,5,6,7
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

RUNTIME_ENV_JSON="{
\"env_vars\": {
Expand All @@ -136,7 +136,7 @@ ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json="${RUNTIME_ENV_JSON}" \
-- python3 train.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node 2 \
--actor-num-gpus-per-node 8 \
--colocate \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
Expand Down
72 changes: 72 additions & 0 deletions examples/tau2-bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Tau2 bench with slime

This example mirrors `examples/tau-bench`, but plugs the newer tau2 gym environment into slime rollouts.

## Setup

Use the `zhuzilin/slime:latest` image and initialize the environment required for Tau2-Bench:
```bash
cd /root/
git clone https://github.com/slimerl/slime.git
cd slime
pip install -e .
# for tau2 bench
cd /root/
git clone https://github.com/sierra-research/tau2-bench.git
cd tau2-bench
pip install -e .
```

Use the following script to generate mock data for slime training.

```bash
cd /root/slime
python examples/tau2-bench/tau2_mock.py \
--output-dir /root/tau2-bench/data/tau2
```
Initialize the Qwen3-4B-Instruct-2507 model needed for tool use:

```bash
# hf checkpoint
huggingface-cli download Qwen/Qwen3-4B-Instruct-2507 --local-dir /root/Qwen3-4B-Instruct-2507

# mcore checkpoint
cd /root/slime
source scripts/models/qwen3-4B-Instruct-2507.sh
PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
${MODEL_ARGS[@]} \
--hf-checkpoint /root/Qwen3-4B-Instruct-2507 \
--save /root/Qwen3-4B-Instruct-2507_torch_dist
```

## Running the Script

The custom rollout entrypoint is `examples.tau2-bench.generate_with_tau2.generate`. A sample launcher is provided in `examples/tau2-bench/run_tau2_qwen3_4B.sh`; the important CLI flags are:

```bash
--prompt-data /root/tau2-bench/data/tau2/airline_train_tasks.jsonl
--input-key task_id
--custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
```

You need to configure your litellm API in `generate_with_tau2.py` for user simulation:

```python
TAU2_CONFIGS = {
"domain": "airline", # tau2 domain: airline | retail | telecom | mock
"task_split": "train", # task split within the domain
"max_steps": 100, # safety cap on interaction steps
"user_llm": "gpt-4.1-mini", # LiteLLM model name for user simulator
"solo_mode": False, # set True to disable user simulator
}
# Replace with your actual API key for the user simulator; leaving the
# placeholder "NONE" makes the script fall back to the OPENAI_API_KEY
# environment variable.
API_KEY = "NONE"
TAU2_CONFIGS["user_llm_args"] = {"api_key": API_KEY}
```

And run:

```bash
cd /root/slime
bash examples/tau2-bench/run_tau2_qwen3_4B.sh
```
1 change: 1 addition & 0 deletions examples/tau2-bench/configs/strip_think.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
rollout_strip_think: true
59 changes: 59 additions & 0 deletions examples/tau2-bench/generate_with_tau2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Tau2-Bench integration for slime Training.

Configure the domain/task split below, point slime at this file via
--custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
"""

import logging
import os
from typing import Any

from slime.utils.types import Sample

from .trainable_agent import Tau2TrainableAgent, res_to_sample

logger = logging.getLogger(__name__)

# Base configuration (edit here as needed).
# Base configuration (edit here as needed).
TAU2_CONFIGS: dict[str, Any] = {
    "domain": "airline",  # tau2 domain: airline | retail | telecom | mock
    "task_split": "train",  # task split within the domain
    "max_steps": 100,  # safety cap on interaction steps
    # Explicit gemini provider prefix to avoid Vertex ADC path.
    # "user_llm": "gemini/gemini-2.5-flash-lite",
    "user_llm": "gpt-4.1",
    "user_llm_args": {},  # will inject api_key below
    "solo_mode": False,  # set True to disable user simulator
}

# Replace with your actual API key for user simulator (LiteLLM).
# The "NONE" placeholder defers to the OPENAI_API_KEY environment variable.
# Fall back to the placeholder (rather than None) when the variable is
# unset, so LiteLLM always receives a string for api_key.
API_KEY = "NONE"
if API_KEY == "NONE":
    API_KEY = os.getenv("OPENAI_API_KEY", API_KEY)
# Also pass through args to force gemini path
TAU2_CONFIGS["user_llm_args"] = {"api_key": API_KEY}


async def generate(args: Any, sample: Sample, sampling_params: dict) -> Sample:
    """Run one tau2-bench episode for ``sample`` and return the result.

    Entry point wired in via ``--custom-generate-function-path
    examples.tau2-bench.generate_with_tau2.generate``.

    Args:
        args: slime rollout arguments. This is an attribute-style object
            (``args.partial_rollout`` is read below), not a plain dict —
            annotation corrected from ``dict[str, Any]`` accordingly.
        sample: the slime sample; ``sample.prompt`` carries the task id.
        sampling_params: sampling parameters forwarded to the agent.

    Returns:
        A ``Sample`` built from the finished tau2 interaction.
    """
    # Resuming a half-finished multi-turn tau2 episode is not supported,
    # so reject partial rollout outright.
    assert not args.partial_rollout, "Partial rollout is not supported for tau2."

    # One fresh agent per rollout, configured from the module-level
    # TAU2_CONFIGS (domain, task split, user simulator LLM, ...).
    agent = Tau2TrainableAgent(
        args=args,
        sampling_params=sampling_params,
        domain=TAU2_CONFIGS["domain"],
        task_split=TAU2_CONFIGS["task_split"],
        max_steps=TAU2_CONFIGS["max_steps"],
        user_llm=TAU2_CONFIGS["user_llm"],
        user_llm_args=TAU2_CONFIGS.get("user_llm_args") or {},
        solo_mode=TAU2_CONFIGS["solo_mode"],
    )

    # Map the sample's prompt (a task id) onto the agent's task list.
    task_id, task_index = agent._resolve_task_id(sample.prompt)  # noqa: SLF001 - simple helper
    logger.info("Starting tau2 rollout for task_id=%s (index=%s)", task_id, task_index)

    # Drive the full multi-turn interaction, then convert the episode
    # result into a slime Sample for training.
    interaction_result = await agent.run_episode(task_id)
    result_sample = res_to_sample(interaction_result, task_index)

    logger.info("Finished tau2 rollout for task_id=%s", task_id)
    return result_sample
152 changes: 152 additions & 0 deletions examples/tau2-bench/run_tau2_qwen3_4B.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/bin/bash

# Best-effort cleanup of any leftover processes from a previous run so
# the new job starts from a clean slate (pkill may match nothing).
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# Prevent Python (and thus ray workers) from buffering stdout/stderr.
# FIX: the variable is PYTHONUNBUFFERED — "PYTHONBUFFERED" is a typo and
# has no effect. Any non-empty value enables unbuffered mode.
export PYTHONUNBUFFERED=1

# Detect NVLink by counting NV<n> link entries in the GPU topology matrix
# (empty / zero when nvidia-smi is missing or no NVLink is present).
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
# Arithmetic comparison yields 1 when any NVLink reference was found.
HAS_NVLINK=$((NVLINK_COUNT > 0))
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Resolve this script's directory and pull in the shared model arguments.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/../../scripts/models/qwen3-4B-Instruct-2507.sh"

# Checkpoint paths: HF checkpoint for rollout, torch-dist for training;
# save every 20 rollouts.
CKPT_ARGS=(
    --hf-checkpoint /root/Qwen3-4B-Instruct-2507/
    --ref-load /root/Qwen3-4B-Instruct-2507_torch_dist/
    --load /root/Qwen3-4B-Instruct-2507_slime/
    --save /root/Qwen3-4B-Instruct-2507_slime/
    --save-interval 20
)

# Rollout configuration: training prompts come from the mock airline
# train split; prompts are looked up by task_id.
ROLLOUT_ARGS=(
    --prompt-data /root/tau2-bench/data/tau2/airline_train_tasks.jsonl
    --input-key task_id
    --rollout-shuffle
    --num-rollout 500
    --rollout-batch-size 16
    --n-samples-per-prompt 4
    --rollout-max-response-len 1024
    --rollout-temperature 0.8
    --global-batch-size 64
    --balance-data
)

# Periodic evaluation on the held-out airline test split.
EVAL_ARGS=(
    --eval-interval 5
    --eval-prompt-data airline-test /root/tau2-bench/data/tau2/airline_test_tasks.jsonl
    --n-samples-per-eval-prompt 1
    --eval-max-response-len 1024
    --eval-top-k 1
    --eval-input-key task_id
)

# Megatron parallelism and activation-recompute settings.
PERF_ARGS=(
    --tensor-model-parallel-size 2
    --sequence-parallel
    --pipeline-model-parallel-size 1
    --context-parallel-size 1
    --expert-model-parallel-size 1
    --expert-tensor-parallel-size 1
    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1
    --use-dynamic-batch-size
    --max-tokens-per-gpu 9216
)

# GRPO advantage estimation; KL and entropy terms are declared but
# zero-weighted here.
GRPO_ARGS=(
    --advantage-estimator grpo
    --use-kl-loss
    --kl-loss-coef 0.00
    --kl-loss-type low_var_kl
    --entropy-coef 0.00
    --eps-clip 0.2
    --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
    --optimizer adam
    --lr 1e-6
    --lr-decay-style constant
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.98
)

# Weights & Biases logging.
WANDB_ARGS=(
    --use-wandb
    --wandb-project slime-tau2
    --wandb-group qwen3-4B
    # NOTE(review): requires WANDB_KEY to be exported before running;
    # unquoted expansion word-splits if the key ever contains whitespace.
    --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
    --rollout-num-gpus-per-engine 1
    --sglang-mem-fraction-static 0.7
    # If gemini API reports concurrency limit error, set this parameter to reduce the concurrency
    # --sglang-server-concurrency 32
)

MISC_ARGS=(
    # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
    # need to comment this when using model with MLA
    --attention-backend flash
)

# Route sample generation through the tau2 episode runner defined in
# examples/tau2-bench/generate_with_tau2.py.
CUSTOM_ARGS=(
    --custom-generate-function-path examples.tau2-bench.generate_with_tau2.generate
)
# Launch the master node of ray inside the container.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# If you want more or fewer GPUs, change this parameter.
NUM_GPUS=2
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus "${NUM_GPUS}" --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 --temp-dir /root/shared/ray_temp

# Runtime env for the ray job: make Megatron, this example directory and
# the tau2-bench checkout importable inside workers.
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/:${SCRIPT_DIR}:/root/tau2-bench\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\"
  }
}"

# FIX: quote every array expansion ("${arr[@]}") so arguments containing
# spaces survive word-splitting intact (ShellCheck SC2068).
# NOTE(review): DISTRIBUTED_ARGS is not defined in this script — it is
# presumably provided by the sourced model script; confirm.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 train.py \
  --actor-num-nodes 1 \
  --actor-num-gpus-per-node "${NUM_GPUS}" \
  --rollout-num-gpus "${NUM_GPUS}" \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${DISTRIBUTED_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${SGLANG_ARGS[@]}" \
  "${MISC_ARGS[@]}" \
  "${CUSTOM_ARGS[@]}"
Loading