From 8a827d063f86e3dd4eb54f2878e04c87b8a70166 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <1277646412@qq.com>
Date: Sat, 31 Jan 2026 07:59:43 +0000
Subject: [PATCH 1/3] Add Qwen3-14B B300 training script

---
 scripts/run-qwen3-14B-b300.sh | 157 ++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100755 scripts/run-qwen3-14B-b300.sh

diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh
new file mode 100755
index 000000000..27675d7ff
--- /dev/null
+++ b/scripts/run-qwen3-14B-b300.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+
+# Qwen3-14B on B300 (sm_103a)
+# Tested on 8x B300 SXM6 275GB with fy121415/miles:b300
+# ref: https://github.com/radixark/miles/issues/530
+# ref: https://github.com/radixark/miles/issues/533
+
+# for rerun the task
+pkill -9 sglang
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+
+set -ex
+
+# will prevent ray from buffering stdout/stderr
+export PYTHONBUFFERED=16
+
+NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
+if [ "$NVLINK_COUNT" -gt 0 ]; then
+    HAS_NVLINK=1
+else
+    HAS_NVLINK=0
+fi
+echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+source "${SCRIPT_DIR}/models/qwen3-14B.sh"
+
+CKPT_ARGS=(
+   --hf-checkpoint /root/Qwen3-14B
+   --ref-load /root/Qwen3-14B_torch_dist
+   --load /root/Qwen3-14B_miles/
+   --save /root/Qwen3-14B_miles/
+   --save-interval 20
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --rm-type deepscaler
+   --num-rollout 3000
+   --rollout-batch-size 16
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 8192
+   --rollout-temperature 1
+
+   --global-batch-size 128
+   --balance-data
+)
+
+EVAL_ARGS=(
+   --eval-interval 20
+   --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
+   --n-samples-per-eval-prompt 16
+   --eval-max-response-len 16384
+   --eval-top-p 1
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 4
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 4608
+)
+
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.00
+   --kl-loss-type low_var_kl
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+)
+
+WANDB_ARGS=(
+   # --use-wandb
+   # --wandb-project miles-test
+   # --wandb-group qwen3-14B-b300
+   # --wandb-key ${WANDB_API_KEY}
+)
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 4
+   --sglang-mem-fraction-static 0.7
+   # B300 workaround: triton cuda graph compile fails on sm_103a (#533)
+   --sglang-disable-cuda-graph
+)
+
+MISC_ARGS=(
+   # default dropout in megatron is 0.1
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   # should be good for model performance
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   # need to comment this when using model with MLA
+   --attention-backend flash
+)
+
+# launch the master node of ray in container
+export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+# Build the runtime environment JSON with proper variable substitution
+# B300 workaround: disable TP memory imbalance check (#533)
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"/root/Megatron-LM/\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
+    \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\"
+  }
+}"
+
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 train.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node 8 \
+   --colocate \
+   ${MODEL_ARGS[@]} \
+   ${CKPT_ARGS[@]} \
+   ${ROLLOUT_ARGS[@]} \
+   ${OPTIMIZER_ARGS[@]} \
+   ${GRPO_ARGS[@]} \
+   ${WANDB_ARGS[@]} \
+   ${PERF_ARGS[@]} \
+   ${EVAL_ARGS[@]} \
+   ${SGLANG_ARGS[@]} \
+   ${MISC_ARGS[@]}

From ae0427748eb94c9c7a5e12d68d70fe352d40d705 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <1277646412@qq.com>
Date: Sat, 31 Jan 2026 17:27:07 +0000
Subject: [PATCH 2/3] Rewrite B300 script as host-side docker workflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Start from docker pull instead of assuming running inside container
- Remove pkill cleanup and NVLink detection (hardcode NCCL_NVLS_ENABLE=1)
- Fix PYTHONBUFFERED typo → PYTHONUNBUFFERED
- Quote MASTER_ADDR properly
- Inline MODEL_ARGS (container heredoc can't source host files)
- Data paths use /data mount point (DATA_DIR configurable)
---
 scripts/run-qwen3-14B-b300.sh | 98 +++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh
index 27675d7ff..a9df55e29 100755
--- a/scripts/run-qwen3-14B-b300.sh
+++ b/scripts/run-qwen3-14B-b300.sh
@@ -1,46 +1,59 @@
 #!/bin/bash
 
 # Qwen3-14B on B300 (sm_103a)
-# Tested on 8x B300 SXM6 275GB with fy121415/miles:b300
 # ref: https://github.com/radixark/miles/issues/530
 # ref: https://github.com/radixark/miles/issues/533
 
-# for rerun the task
-pkill -9 sglang
-sleep 3
-ray stop --force
-pkill -9 ray
-pkill -9 python
-sleep 3
-pkill -9 ray
-pkill -9 python
-
 set -ex
 
-# will prevent ray from buffering stdout/stderr
-export PYTHONBUFFERED=16
-
-NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
-if [ "$NVLINK_COUNT" -gt 0 ]; then
-    HAS_NVLINK=1
-else
-    HAS_NVLINK=0
-fi
-echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+IMAGE=fy121415/miles:b300
+CONTAINER_NAME=qwen3-14b-b300
+DATA_DIR=${DATA_DIR:-/root}
+
+docker pull "$IMAGE"
+docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+
+docker run --rm \
+   --gpus all \
+   --ipc=host \
+   --network=host \
+   --name "$CONTAINER_NAME" \
+   -v "$DATA_DIR":/data \
+   "$IMAGE" \
+   bash -s <<'EOF'
+set -ex
 
-SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-source "${SCRIPT_DIR}/models/qwen3-14B.sh"
+export PYTHONUNBUFFERED=1
+
+MODEL_ARGS=(
+   --swiglu
+   --num-layers 40
+   --hidden-size 5120
+   --ffn-hidden-size 17408
+   --num-attention-heads 40
+   --group-query-attention
+   --num-query-groups 8
+   --use-rotary-position-embeddings
+   --disable-bias-linear
+   --normalization "RMSNorm"
+   --norm-epsilon 1e-6
+   --rotary-base 1000000
+   --vocab-size 151936
+   --kv-channels 128
+   --qk-layernorm
+   --untie-embeddings-and-output-weights
+)
 
 CKPT_ARGS=(
-   --hf-checkpoint /root/Qwen3-14B
-   --ref-load /root/Qwen3-14B_torch_dist
-   --load /root/Qwen3-14B_miles/
-   --save /root/Qwen3-14B_miles/
+   --hf-checkpoint /data/Qwen3-14B
+   --ref-load /data/Qwen3-14B_torch_dist
+   --load /data/Qwen3-14B_miles/
+   --save /data/Qwen3-14B_miles/
    --save-interval 20
 )
 
 ROLLOUT_ARGS=(
-   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
+   --prompt-data /data/dapo-math-17k/dapo-math-17k.jsonl
    --input-key prompt
    --label-key label
    --apply-chat-template
@@ -58,7 +71,7 @@ ROLLOUT_ARGS=(
 
 EVAL_ARGS=(
    --eval-interval 20
-   --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
+   --eval-prompt-data aime /data/aime-2024/aime-2024.jsonl
    --n-samples-per-eval-prompt 16
    --eval-max-response-len 16384
    --eval-top-p 1
@@ -114,30 +127,24 @@ SGLANG_ARGS=(
 )
 
 MISC_ARGS=(
-   # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
-   # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
-   # need to comment this when using model with MLA
    --attention-backend flash
 )
 
-# launch the master node of ray in container
-export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
-ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
-
-# Build the runtime environment JSON with proper variable substitution
-# B300 workaround: disable TP memory imbalance check (#533)
-RUNTIME_ENV_JSON="{
-  \"env_vars\": {
-    \"PYTHONPATH\": \"/root/Megatron-LM/\",
-    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
-    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
-    \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\"
+export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+RUNTIME_ENV_JSON='{
+  "env_vars": {
+    "PYTHONPATH": "/root/Megatron-LM/",
+    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+    "NCCL_NVLS_ENABLE": "1",
+    "SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK": "false"
   }
-}"
+}'
 
 ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json="${RUNTIME_ENV_JSON}" \
@@ -155,3 +162,4 @@ ray job submit --address="http://127.0.0.1:8265" \
    ${EVAL_ARGS[@]} \
    ${SGLANG_ARGS[@]} \
    ${MISC_ARGS[@]}
+EOF

From 435e1d4ae2cdf382e3cc193ee39b80e45dd95dd3 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <1277646412@qq.com>
Date: Sun, 1 Feb 2026 07:43:36 +0000
Subject: [PATCH 3/3] Rewrite B300 script to align with other training scripts

- Use /.dockerenv detection for single-file host/container dual mode
- Add process cleanup (pkill sglang/ray/python) matching other scripts
- Source model args from scripts/models/qwen3-14B.sh instead of inlining
- Detect NVLink dynamically instead of hardcoding NCCL_NVLS_ENABLE
- Fix heredoc stdin issue: drop the `bash -s` heredoc, re-execute the bind-mounted script instead, and add the docker run -i flag
- Add --working-dir /root/miles for correct train.py resolution
---
 scripts/run-qwen3-14B-b300.sh | 110 ++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 45 deletions(-)

diff --git a/scripts/run-qwen3-14B-b300.sh b/scripts/run-qwen3-14B-b300.sh
index a9df55e29..41503cfab 100755
--- a/scripts/run-qwen3-14B-b300.sh
+++ b/scripts/run-qwen3-14B-b300.sh
@@ -4,45 +4,60 @@
 # ref: https://github.com/radixark/miles/issues/530
 # ref: https://github.com/radixark/miles/issues/533
 
-set -ex
+# --- Host-side launcher ---
+# If not running inside a container, pull the B300 image and re-execute
+# this script inside it.
+if [ ! -f /.dockerenv ]; then
+   set -ex
+
+   IMAGE=fy121415/miles:b300
+   CONTAINER_NAME=qwen3-14b-b300
+   DATA_DIR=${DATA_DIR:-/root}  # host directory bind-mounted at /data; override via the DATA_DIR env var
+   SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"  # absolute directory of this script on the host
+
+   docker pull "$IMAGE"
+   docker rm -f "$CONTAINER_NAME" 2>/dev/null || true  # remove a leftover container with the same name; ignore failure if none exists
+
+   docker run --rm \
+      -i \
+      --gpus all \
+      --ipc=host \
+      --network=host \
+      --name "$CONTAINER_NAME" \
+      -v "$DATA_DIR":/data \
+      -v "$SCRIPT_DIR":/root/miles/scripts \
+      "$IMAGE" \
+      bash /root/miles/scripts/run-qwen3-14B-b300.sh
+
+   exit $?  # return docker run's status; the host never continues into the container-side section
+fi
+
+# --- Container-side training ---
+
+# kill leftover sglang/ray/python processes so the task can be rerun cleanly
+pkill -9 sglang
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
 
-IMAGE=fy121415/miles:b300
-CONTAINER_NAME=qwen3-14b-b300
-DATA_DIR=${DATA_DIR:-/root}
-
-docker pull "$IMAGE"
-docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
-
-docker run --rm \
-   --gpus all \
-   --ipc=host \
-   --network=host \
-   --name "$CONTAINER_NAME" \
-   -v "$DATA_DIR":/data \
-   "$IMAGE" \
-   bash -s <<'EOF'
 set -ex
 
 export PYTHONUNBUFFERED=1
 
-MODEL_ARGS=(
-   --swiglu
-   --num-layers 40
-   --hidden-size 5120
-   --ffn-hidden-size 17408
-   --num-attention-heads 40
-   --group-query-attention
-   --num-query-groups 8
-   --use-rotary-position-embeddings
-   --disable-bias-linear
-   --normalization "RMSNorm"
-   --norm-epsilon 1e-6
-   --rotary-base 1000000
-   --vocab-size 151936
-   --kv-channels 128
-   --qk-layernorm
-   --untie-embeddings-and-output-weights
-)
+NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)  # number of NV<n> tokens in the topology output; 0 when nothing matches
+if [ "$NVLINK_COUNT" -gt 0 ]; then
+    HAS_NVLINK=1
+else
+    HAS_NVLINK=0
+fi
+echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"  # HAS_NVLINK is substituted into NCCL_NVLS_ENABLE further down
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+source "${SCRIPT_DIR}/models/qwen3-14B.sh"
 
 CKPT_ARGS=(
    --hf-checkpoint /data/Qwen3-14B
@@ -127,27 +142,33 @@ SGLANG_ARGS=(
 )
 
 MISC_ARGS=(
+   # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
+   # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
+   # comment this out when using a model with MLA
    --attention-backend flash
 )
 
-export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
-ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
-
-RUNTIME_ENV_JSON='{
-  "env_vars": {
-    "PYTHONPATH": "/root/Megatron-LM/",
-    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
-    "NCCL_NVLS_ENABLE": "1",
-    "SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK": "false"
+# launch the ray head node in the container; MASTER_ADDR (default 127.0.0.1) stays quoted so an overridden value is passed as one word
+export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+# Build the runtime environment JSON with proper variable substitution
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"/root/Megatron-LM/\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
+    \"SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK\": \"false\"
   }
-}'
+}"
 
 ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   --working-dir /root/miles \
    -- python3 train.py \
    --actor-num-nodes 1 \
    --actor-num-gpus-per-node 8 \
@@ -162,4 +183,3 @@ ray job submit --address="http://127.0.0.1:8265" \
    ${EVAL_ARGS[@]} \
    ${SGLANG_ARGS[@]} \
    ${MISC_ARGS[@]}
-EOF