#!/bin/bash

# Qwen3-14B on B300 (sm_103a)
# ref: https://github.com/radixark/miles/issues/530
# ref: https://github.com/radixark/miles/issues/533
#
# Usage:
#   bash scripts/run-qwen3-14B-b300.sh
#
# Behaviour:
#   * Outside a container (no /.dockerenv): pulls the B300 image and re-runs
#     this same script inside it.
#   * Inside the container: stops leftover sglang/ray/python processes, starts
#     a ray head node and submits train.py as a ray job.
#
# Environment:
#   DATA_DIR     Host directory mounted at /data in the container
#                (default: /root). Read on the host side only.
#   MASTER_ADDR  IP address for the ray head node (default: 127.0.0.1).
#                Read on the container side only; the host-side launcher does
#                not forward it into the container.
#
# Paths the training arguments below expect under /data:
#   Qwen3-14B/  Qwen3-14B_torch_dist/  Qwen3-14B_miles/
#   dapo-math-17k/dapo-math-17k.jsonl  aime-2024/aime-2024.jsonl

# --- Host-side launcher ---
# If not running inside a container, pull the B300 image and re-execute
# this script inside it.
if [ ! -f /.dockerenv ]; then
  set -ex

  IMAGE=fy121415/miles:b300
  CONTAINER_NAME=qwen3-14b-b300
  DATA_DIR=${DATA_DIR:-/root}
  SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

  docker pull "$IMAGE"
  # Best effort: remove a stale container with the same name, if there is one.
  docker rm -f "$CONTAINER_NAME" 2>/dev/null || true

  # The directory holding this script is mounted over /root/miles/scripts so
  # the container runs exactly this copy of the script.
  docker run --rm \
    -i \
    --gpus all \
    --ipc=host \
    --network=host \
    --name "$CONTAINER_NAME" \
    -v "$DATA_DIR":/data \
    -v "$SCRIPT_DIR":/root/miles/scripts \
    "$IMAGE" \
    bash /root/miles/scripts/run-qwen3-14B-b300.sh

  # Under `set -e` a failing `docker run` has already aborted the script with
  # its status, so this is only reached on success.
  exit $?
fi

# --- Container-side training ---

# Clean up processes left over from a previous run so the task can be rerun.
# These commands run before `set -e` on purpose: pkill exits non-zero when no
# process matches, and that must not abort the script.
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

export PYTHONUNBUFFERED=1

# Count NVLink entries ("NV<n>") in the GPU topology matrix; any match means
# NVLink is present. If nvidia-smi is missing or fails, the count is 0.
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
  HAS_NVLINK=1
else
  HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
# Expected to define MODEL_ARGS, which is passed to train.py below.
# NOTE(review): assumed to be a bash array (it is expanded as "${MODEL_ARGS[@]}")
# - confirm against models/qwen3-14B.sh.
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/models/qwen3-14B.sh"

CKPT_ARGS=(
  --hf-checkpoint /data/Qwen3-14B
  --ref-load /data/Qwen3-14B_torch_dist
  --load /data/Qwen3-14B_miles/
  --save /data/Qwen3-14B_miles/
  --save-interval 20
)

ROLLOUT_ARGS=(
  --prompt-data /data/dapo-math-17k/dapo-math-17k.jsonl
  --input-key prompt
  --label-key label
  --apply-chat-template
  --rollout-shuffle
  --rm-type deepscaler
  --num-rollout 3000
  --rollout-batch-size 16
  --n-samples-per-prompt 8
  --rollout-max-response-len 8192
  --rollout-temperature 1

  --global-batch-size 128
  --balance-data
)

EVAL_ARGS=(
  --eval-interval 20
  --eval-prompt-data aime /data/aime-2024/aime-2024.jsonl
  --n-samples-per-eval-prompt 16
  --eval-max-response-len 16384
  --eval-top-p 1
)

PERF_ARGS=(
  --tensor-model-parallel-size 4
  --sequence-parallel
  --pipeline-model-parallel-size 1
  --context-parallel-size 1
  --expert-model-parallel-size 1
  --expert-tensor-parallel-size 1

  --recompute-granularity full
  --recompute-method uniform
  --recompute-num-layers 1

  --use-dynamic-batch-size
  --max-tokens-per-gpu 4608
)

GRPO_ARGS=(
  --advantage-estimator grpo
  --use-kl-loss
  --kl-loss-coef 0.00
  --kl-loss-type low_var_kl
  --entropy-coef 0.00
  --eps-clip 0.2
  --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
  --optimizer adam
  --lr 1e-6
  --lr-decay-style constant
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.98
)

# Weights & Biases logging is disabled by default; uncomment to enable.
WANDB_ARGS=(
  # --use-wandb
  # --wandb-project miles-test
  # --wandb-group qwen3-14B-b300
  # --wandb-key ${WANDB_API_KEY}
)

SGLANG_ARGS=(
  --rollout-num-gpus-per-engine 4
  --sglang-mem-fraction-static 0.7
  # B300 workaround: triton cuda graph compile fails on sm_103a (#533)
  --sglang-disable-cuda-graph
)

MISC_ARGS=(
  # default dropout in megatron is 0.1
  --attention-dropout 0.0
  --hidden-dropout 0.0
  # should be good for model performance
  --accumulate-allreduce-grads-in-fp32
  --attention-softmax-in-fp32
  # comment this out when using a model with MLA
  --attention-backend flash
)

# Launch the ray head node inside the container.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# Build the runtime environment JSON with proper variable substitution.
# HAS_NVLINK is always 0 or 1 here, so plain string interpolation is safe.
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
    \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\"
  }
}"

# Every argument array is expanded quoted so each element reaches train.py as
# exactly one word (no word-splitting or globbing); an empty array such as
# WANDB_ARGS contributes no arguments at all.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  --working-dir /root/miles \
  -- python3 train.py \
  --actor-num-nodes 1 \
  --actor-num-gpus-per-node 8 \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${SGLANG_ARGS[@]}" \
  "${MISC_ARGS[@]}"