Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 185 additions & 0 deletions scripts/run-qwen3-14B-b300.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#!/bin/bash

# Qwen3-14B on B300 (sm_103a)
# ref: https://github.com/radixark/miles/issues/530
# ref: https://github.com/radixark/miles/issues/533

# --- Host-side launcher ---
# When /.dockerenv is absent (i.e. we are not inside a Docker container),
# pull the B300 image and re-execute this same script inside it, then exit
# with the container's status.
#
# Environment:
#   DATA_DIR  host directory bind-mounted at /data in the container
#             (default: /root)
if [ ! -f /.dockerenv ]; then
  set -ex

  IMAGE=fy121415/miles:b300
  CONTAINER_NAME=qwen3-14b-b300
  DATA_DIR=${DATA_DIR:-/root}
  SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
  # Derive the file name instead of hard-coding it, so the re-exec inside the
  # container keeps working if this script is renamed or copied.
  SCRIPT_NAME="$(basename -- "${BASH_SOURCE[0]}")"

  docker pull "$IMAGE"
  # Best-effort removal of a leftover container with the same name; a missing
  # container is not an error.
  docker rm -f "$CONTAINER_NAME" 2>/dev/null || true

  # The directory holding this script is bind-mounted at /root/miles/scripts,
  # which is where the container-side invocation below looks for it.
  docker run --rm \
    -i \
    --gpus all \
    --ipc=host \
    --network=host \
    --name "$CONTAINER_NAME" \
    -v "$DATA_DIR":/data \
    -v "$SCRIPT_DIR":/root/miles/scripts \
    "$IMAGE" \
    bash "/root/miles/scripts/${SCRIPT_NAME}"

  exit $?
fi

# --- Container-side training ---

# Clean up processes left over from a previous run so the task can be rerun.
# errexit is not enabled yet at this point, so a command that finds nothing
# to kill does not abort the script.
pkill -9 sglang
sleep 3
ray stop --force
# Kill ray and python twice, with a 3-second pause between the two passes.
for cleanup_pass in first second; do
  pkill -9 ray
  pkill -9 python
  if [ "$cleanup_pass" = first ]; then
    sleep 3
  fi
done

# From here on: abort on the first failing command and trace every command.
set -ex

export PYTHONUNBUFFERED=1

# Count the NV<n> tokens in the GPU topology matrix. If nvidia-smi is missing
# or fails, its stderr is discarded and the count is simply 0.
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)

# HAS_NVLINK is 1 when at least one NVLink reference was found, else 0.
HAS_NVLINK=0
if [ "$NVLINK_COUNT" -gt 0 ]; then
  HAS_NVLINK=1
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Resolve the absolute directory that contains this script, then load the
# model definition that lives next to it under models/.
this_script="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd -- "$(dirname -- "${this_script}")" >/dev/null 2>&1 && pwd)"
source "${SCRIPT_DIR}/models/qwen3-14B.sh"

# Checkpoint flags: all paths live under the /data mount.
CKPT_ARGS=()
CKPT_ARGS+=(--hf-checkpoint /data/Qwen3-14B)
CKPT_ARGS+=(--ref-load /data/Qwen3-14B_torch_dist)
# Load from and save to the same directory.
CKPT_ARGS+=(--load /data/Qwen3-14B_miles/)
CKPT_ARGS+=(--save /data/Qwen3-14B_miles/)
CKPT_ARGS+=(--save-interval 20)

# Rollout flags: prompt dataset, sampling settings and batch sizes.
ROLLOUT_ARGS=()
# Dataset and the JSON keys to read from it.
ROLLOUT_ARGS+=(--prompt-data /data/dapo-math-17k/dapo-math-17k.jsonl)
ROLLOUT_ARGS+=(--input-key prompt --label-key label)
ROLLOUT_ARGS+=(--apply-chat-template --rollout-shuffle)
ROLLOUT_ARGS+=(--rm-type deepscaler)
# Rollout count, prompts per rollout, samples per prompt, sampling limits.
ROLLOUT_ARGS+=(--num-rollout 3000)
ROLLOUT_ARGS+=(--rollout-batch-size 16)
ROLLOUT_ARGS+=(--n-samples-per-prompt 8)
ROLLOUT_ARGS+=(--rollout-max-response-len 8192)
ROLLOUT_ARGS+=(--rollout-temperature 1)
# Training batch settings (16 prompts x 8 samples = 128).
ROLLOUT_ARGS+=(--global-batch-size 128)
ROLLOUT_ARGS+=(--balance-data)

# Evaluation flags: interval, the AIME 2024 prompt file and sampling limits.
EVAL_ARGS=()
EVAL_ARGS+=(--eval-interval 20)
# --eval-prompt-data takes two values here: a name and a file path.
EVAL_ARGS+=(--eval-prompt-data aime /data/aime-2024/aime-2024.jsonl)
EVAL_ARGS+=(--n-samples-per-eval-prompt 16)
EVAL_ARGS+=(--eval-max-response-len 16384)
EVAL_ARGS+=(--eval-top-p 1)

# Parallelism, activation recomputation and batching flags.
PERF_ARGS=()
# Parallel layout.
PERF_ARGS+=(--tensor-model-parallel-size 4)
PERF_ARGS+=(--sequence-parallel)
PERF_ARGS+=(--pipeline-model-parallel-size 1)
PERF_ARGS+=(--context-parallel-size 1)
PERF_ARGS+=(--expert-model-parallel-size 1)
PERF_ARGS+=(--expert-tensor-parallel-size 1)
# Recomputation settings.
PERF_ARGS+=(--recompute-granularity full)
PERF_ARGS+=(--recompute-method uniform)
PERF_ARGS+=(--recompute-num-layers 1)
# Dynamic batching with a per-GPU token cap.
PERF_ARGS+=(--use-dynamic-batch-size)
PERF_ARGS+=(--max-tokens-per-gpu 4608)

# GRPO algorithm flags: advantage estimator, KL/entropy terms and clipping.
GRPO_ARGS=()
GRPO_ARGS+=(--advantage-estimator grpo)
# KL loss is switched on, with its coefficient set to 0.00.
GRPO_ARGS+=(--use-kl-loss)
GRPO_ARGS+=(--kl-loss-coef 0.00)
GRPO_ARGS+=(--kl-loss-type low_var_kl)
GRPO_ARGS+=(--entropy-coef 0.00)
# Clip range: 0.2 on the low side, 0.28 on the high side.
GRPO_ARGS+=(--eps-clip 0.2)
GRPO_ARGS+=(--eps-clip-high 0.28)

# Optimizer flags: Adam with a constant learning rate of 1e-6.
OPTIMIZER_ARGS=()
OPTIMIZER_ARGS+=(--optimizer adam)
OPTIMIZER_ARGS+=(--lr 1e-6)
OPTIMIZER_ARGS+=(--lr-decay-style constant)
OPTIMIZER_ARGS+=(--weight-decay 0.1)
OPTIMIZER_ARGS+=(--adam-beta1 0.9)
OPTIMIZER_ARGS+=(--adam-beta2 0.98)

# Weights & Biases flags. Left empty on purpose; uncomment the block below
# to enable logging (the key is read from WANDB_API_KEY).
WANDB_ARGS=()
# WANDB_ARGS+=(
#   --use-wandb
#   --wandb-project miles-test
#   --wandb-group qwen3-14B-b300
#   --wandb-key ${WANDB_API_KEY}
# )

# SGLang rollout-engine flags.
SGLANG_ARGS=()
SGLANG_ARGS+=(--rollout-num-gpus-per-engine 4)
SGLANG_ARGS+=(--sglang-mem-fraction-static 0.7)
# B300 workaround: triton cuda graph compile fails on sm_103a (#533)
SGLANG_ARGS+=(--sglang-disable-cuda-graph)

# Miscellaneous model/runtime flags.
MISC_ARGS=()
# Megatron's default dropout is 0.1, so turn it off explicitly.
MISC_ARGS+=(--attention-dropout 0.0 --hidden-dropout 0.0)
# fp32 accumulation/softmax: should be good for model performance.
MISC_ARGS+=(--accumulate-allreduce-grads-in-fp32)
MISC_ARGS+=(--attention-softmax-in-fp32)
# Comment this out when using a model with MLA.
MISC_ARGS+=(--attention-backend flash)

# Launch the Ray head node inside the container.
# MASTER_ADDR may be supplied by the environment; it defaults to 127.0.0.1
# and is exported so child processes see the same value.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
# The address is quoted so an unusual value cannot be word-split or globbed
# into extra arguments.
ray start --head \
  --node-ip-address "${MASTER_ADDR}" \
  --num-gpus 8 \
  --disable-usage-stats \
  --dashboard-host=0.0.0.0 \
  --dashboard-port=8265

# Assemble the Ray runtime-env JSON. The here-doc delimiter is unquoted so
# ${HAS_NVLINK} is substituted; command substitution drops the final newline.
RUNTIME_ENV_JSON=$(cat <<EOF
{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM/",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_NVLS_ENABLE": "${HAS_NVLINK}",
"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK": "false"
}
}
EOF
)

# Submit the training job to the local Ray dashboard, running train.py from
# /root/miles with the runtime environment built above.
# Every argument array is expanded as "${ARR[@]}" so each element reaches
# train.py as exactly one word (no re-splitting or glob expansion); an empty
# array such as WANDB_ARGS contributes no arguments.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  --working-dir /root/miles \
  -- python3 train.py \
  --actor-num-nodes 1 \
  --actor-num-gpus-per-node 8 \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${SGLANG_ARGS[@]}" \
  "${MISC_ARGS[@]}"