From 8a827d063f86e3dd4eb54f2878e04c87b8a70166 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <1277646412@qq.com> Date: Sat, 31 Jan 2026 07:59:43 +0000 Subject: [PATCH 1/3] Add Qwen3-14B B300 training script --- scripts/run-qwen3-14B-b300.sh | 157 ++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 scripts/run-qwen3-14B-b300.sh diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh new file mode 100755 index 000000000..27675d7ff --- /dev/null +++ b/scripts/run-qwen3-14B-b300.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +# Qwen3-14B on B300 (sm_103a) +# Tested on 8x B300 SXM6 275GB with fy121415/miles:b300 +# ref: https://github.com/radixark/miles/issues/530 +# ref: https://github.com/radixark/miles/issues/533 + +# for rerun the task +pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python + +set -ex + +# will prevent ray from buffering stdout/stderr +export PYTHONBUFFERED=16 + +NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) +if [ "$NVLINK_COUNT" -gt 0 ]; then + HAS_NVLINK=1 +else + HAS_NVLINK=0 +fi +echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +source "${SCRIPT_DIR}/models/qwen3-14B.sh" + +CKPT_ARGS=( + --hf-checkpoint /root/Qwen3-14B + --ref-load /root/Qwen3-14B_torch_dist + --load /root/Qwen3-14B_miles/ + --save /root/Qwen3-14B_miles/ + --save-interval 20 +) + +ROLLOUT_ARGS=( + --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --rm-type deepscaler + --num-rollout 3000 + --rollout-batch-size 16 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-temperature 1 + + --global-batch-size 128 + --balance-data +) + +EVAL_ARGS=( + --eval-interval 20 + --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl + 
--n-samples-per-eval-prompt 16 + --eval-max-response-len 16384 + --eval-top-p 1 +) + +PERF_ARGS=( + --tensor-model-parallel-size 4 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 4608 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.00 + --kl-loss-type low_var_kl + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 +) + +WANDB_ARGS=( + # --use-wandb + # --wandb-project miles-test + # --wandb-group qwen3-14B-b300 + # --wandb-key ${WANDB_API_KEY} +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 4 + --sglang-mem-fraction-static 0.7 + # B300 workaround: triton cuda graph compile fails on sm_103a (#533) + --sglang-disable-cuda-graph +) + +MISC_ARGS=( + # default dropout in megatron is 0.1 + --attention-dropout 0.0 + --hidden-dropout 0.0 + # should be good for model performance + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + # need to comment this when using model with MLA + --attention-backend flash +) + +# launch the master node of ray in container +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 + +# Build the runtime environment JSON with proper variable substitution +# B300 workaround: disable TP memory imbalance check (#533) +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/root/Megatron-LM/\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", + \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\" + } +}" + +ray job submit 
--address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 train.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node 8 \ + --colocate \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + ${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${WANDB_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${EVAL_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} From ae0427748eb94c9c7a5e12d68d70fe352d40d705 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <1277646412@qq.com> Date: Sat, 31 Jan 2026 17:27:07 +0000 Subject: [PATCH 2/3] Rewrite B300 script as host-side docker workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Start from docker pull instead of assuming running inside container - Remove pkill cleanup and NVLink detection (hardcode NCCL_NVLS_ENABLE=1) - Fix PYTHONBUFFERED typo → PYTHONUNBUFFERED - Quote MASTER_ADDR properly - Inline MODEL_ARGS (container heredoc can't source host files) - Data paths use /data mount point (DATA_DIR configurable) --- scripts/run-qwen3-14B-b300.sh | 98 +++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh index 27675d7ff..a9df55e29 100755 --- a/scripts/run-qwen3-14B-b300.sh +++ b/scripts/run-qwen3-14B-b300.sh @@ -1,46 +1,59 @@ #!/bin/bash # Qwen3-14B on B300 (sm_103a) -# Tested on 8x B300 SXM6 275GB with fy121415/miles:b300 # ref: https://github.com/radixark/miles/issues/530 # ref: https://github.com/radixark/miles/issues/533 -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - set -ex -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) -if [ "$NVLINK_COUNT" -gt 0 ]; then - HAS_NVLINK=1 -else - HAS_NVLINK=0 -fi -echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT 
NVLink references)" +IMAGE=fy121415/miles:b300 +CONTAINER_NAME=qwen3-14b-b300 +DATA_DIR=${DATA_DIR:-/root} + +docker pull "$IMAGE" +docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + +docker run --rm \ + --gpus all \ + --ipc=host \ + --network=host \ + --name "$CONTAINER_NAME" \ + -v "$DATA_DIR":/data \ + "$IMAGE" \ + bash -s <<'EOF' +set -ex -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/models/qwen3-14B.sh" +export PYTHONUNBUFFERED=1 + +MODEL_ARGS=( + --swiglu + --num-layers 40 + --hidden-size 5120 + --ffn-hidden-size 17408 + --num-attention-heads 40 + --group-query-attention + --num-query-groups 8 + --use-rotary-position-embeddings + --disable-bias-linear + --normalization "RMSNorm" + --norm-epsilon 1e-6 + --rotary-base 1000000 + --vocab-size 151936 + --kv-channels 128 + --qk-layernorm + --untie-embeddings-and-output-weights +) CKPT_ARGS=( - --hf-checkpoint /root/Qwen3-14B - --ref-load /root/Qwen3-14B_torch_dist - --load /root/Qwen3-14B_miles/ - --save /root/Qwen3-14B_miles/ + --hf-checkpoint /data/Qwen3-14B + --ref-load /data/Qwen3-14B_torch_dist + --load /data/Qwen3-14B_miles/ + --save /data/Qwen3-14B_miles/ --save-interval 20 ) ROLLOUT_ARGS=( - --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl + --prompt-data /data/dapo-math-17k/dapo-math-17k.jsonl --input-key prompt --label-key label --apply-chat-template @@ -58,7 +71,7 @@ ROLLOUT_ARGS=( EVAL_ARGS=( --eval-interval 20 - --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl + --eval-prompt-data aime /data/aime-2024/aime-2024.jsonl --n-samples-per-eval-prompt 16 --eval-max-response-len 16384 --eval-top-p 1 @@ -114,30 +127,24 @@ SGLANG_ARGS=( ) MISC_ARGS=( - # default dropout in megatron is 0.1 --attention-dropout 0.0 --hidden-dropout 0.0 - # should be good for model performance --accumulate-allreduce-grads-in-fp32 --attention-softmax-in-fp32 - # need to comment this when using model with MLA --attention-backend flash ) -# launch the master node of 
ray in container -export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 - -# Build the runtime environment JSON with proper variable substitution -# B300 workaround: disable TP memory imbalance check (#533) -RUNTIME_ENV_JSON="{ - \"env_vars\": { - \"PYTHONPATH\": \"/root/Megatron-LM/\", - \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", - \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\" +export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" +ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 + +RUNTIME_ENV_JSON='{ + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "NCCL_NVLS_ENABLE": "1", + "SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK": "false" } -}" +}' ray job submit --address="http://127.0.0.1:8265" \ --runtime-env-json="${RUNTIME_ENV_JSON}" \ @@ -155,3 +162,4 @@ ray job submit --address="http://127.0.0.1:8265" \ ${EVAL_ARGS[@]} \ ${SGLANG_ARGS[@]} \ ${MISC_ARGS[@]} +EOF From 435e1d4ae2cdf382e3cc193ee39b80e45dd95dd3 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <1277646412@qq.com> Date: Sun, 1 Feb 2026 07:43:36 +0000 Subject: [PATCH 3/3] Rewrite B300 script to align with other training scripts - Use /.dockerenv detection for single-file host/container dual mode - Add process cleanup (pkill sglang/ray/python) matching other scripts - Source model args from scripts/models/qwen3-14B.sh instead of inlining - Detect NVLink dynamically instead of hardcoding NCCL_NVLS_ENABLE - Fix heredoc stdin issue by adding docker run -i flag - Add --working-dir /root/miles for correct train.py resolution --- scripts/run-qwen3-14B-b300.sh | 110 ++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh index 
a9df55e29..41503cfab 100755 --- a/scripts/run-qwen3-14B-b300.sh +++ b/scripts/run-qwen3-14B-b300.sh @@ -4,45 +4,60 @@ # ref: https://github.com/radixark/miles/issues/530 # ref: https://github.com/radixark/miles/issues/533 -set -ex +# --- Host-side launcher --- +# If not running inside a container, pull the B300 image and re-execute +# this script inside it. +if [ ! -f /.dockerenv ]; then + set -ex + + IMAGE=fy121415/miles:b300 + CONTAINER_NAME=qwen3-14b-b300 + DATA_DIR=${DATA_DIR:-/root} + SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + + docker pull "$IMAGE" + docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + + docker run --rm \ + -i \ + --gpus all \ + --ipc=host \ + --network=host \ + --name "$CONTAINER_NAME" \ + -v "$DATA_DIR":/data \ + -v "$SCRIPT_DIR":/root/miles/scripts \ + "$IMAGE" \ + bash /root/miles/scripts/run-qwen3-14B-b300.sh + + exit $? +fi + +# --- Container-side training --- + +# kill leftover sglang/ray/python processes so the task can be rerun +pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python -IMAGE=fy121415/miles:b300 -CONTAINER_NAME=qwen3-14b-b300 -DATA_DIR=${DATA_DIR:-/root} - -docker pull "$IMAGE" -docker rm -f "$CONTAINER_NAME" 2>/dev/null || true - -docker run --rm \ - --gpus all \ - --ipc=host \ - --network=host \ - --name "$CONTAINER_NAME" \ - -v "$DATA_DIR":/data \ - "$IMAGE" \ - bash -s <<'EOF' set -ex export PYTHONUNBUFFERED=1 -MODEL_ARGS=( - --swiglu - --num-layers 40 - --hidden-size 5120 - --ffn-hidden-size 17408 - --num-attention-heads 40 - --group-query-attention - --num-query-groups 8 - --use-rotary-position-embeddings - --disable-bias-linear - --normalization "RMSNorm" - --norm-epsilon 1e-6 - --rotary-base 1000000 - --vocab-size 151936 - --kv-channels 128 - --qk-layernorm - --untie-embeddings-and-output-weights -) +NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) +if [ "$NVLINK_COUNT" -gt 0 ]; then + HAS_NVLINK=1 +else + HAS_NVLINK=0 +fi +echo 
"HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +source "${SCRIPT_DIR}/models/qwen3-14B.sh" CKPT_ARGS=( --hf-checkpoint /data/Qwen3-14B @@ -127,27 +142,33 @@ SGLANG_ARGS=( ) MISC_ARGS=( + # default dropout in megatron is 0.1 --attention-dropout 0.0 --hidden-dropout 0.0 + # should be good for model performance --accumulate-allreduce-grads-in-fp32 --attention-softmax-in-fp32 + # comment this out when using a model with MLA --attention-backend flash ) -export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 - -RUNTIME_ENV_JSON='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM/", - "CUDA_DEVICE_MAX_CONNECTIONS": "1", - "NCCL_NVLS_ENABLE": "1", - "SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK": "false" +# launch the Ray head node inside the container +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 + +# Build the runtime environment JSON with proper variable substitution +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/root/Megatron-LM/\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", + \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\" } -}' +}" ray job submit --address="http://127.0.0.1:8265" \ --runtime-env-json="${RUNTIME_ENV_JSON}" \ + --working-dir /root/miles \ -- python3 train.py \ --actor-num-nodes 1 \ --actor-num-gpus-per-node 8 \ @@ -162,4 +183,3 @@ ray job submit --address="http://127.0.0.1:8265" \ ${EVAL_ARGS[@]} \ ${SGLANG_ARGS[@]} \ ${MISC_ARGS[@]} -EOF