diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml new file mode 100644 index 000000000..f1c0dc7e6 --- /dev/null +++ b/.github/workflows/pr-test.yml @@ -0,0 +1,56 @@ +name: PR Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + types: [synchronize, labeled] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + # TODO may use run-ci label etc + if: github.event.pull_request.draft == false + runs-on: self-hosted + container: + image: slimerl/slime:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -v /data/miles_ci:/data/miles_ci + -v /data/miles_ci/models:/root/models + -v /data/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: + - {test_file: test_quick_start_glm4_9B.py} + - {test_file: test_qwen3_30B_A3B.py} + # TODO use deterministic kernel + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . + + - name: Execute + shell: bash + run: python tests/${{ matrix.info.test_file }} diff --git a/miles/backends/megatron_utils/model.py b/miles/backends/megatron_utils/model.py index e0b939121..0fa9f9145 100644 --- a/miles/backends/megatron_utils/model.py +++ b/miles/backends/megatron_utils/model.py @@ -488,9 +488,9 @@ def train(rollout_id, model, optimizer, opt_param_scheduler, data_iterator, num_ if args.ci_test: if step_id == 0 and "train/ppo_kl" in log_dict and "train/pg_clipfrac" in log_dict: - assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0 + assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0, f"{loss_dict=}" if accumulated_step_id == 0 and "train/kl_loss" in log_dict: - assert log_dict["train/kl_loss"] == 0.0 + assert log_dict["train/kl_loss"] == 0.0, f"{loss_dict=}" print(f"{role_tag}step {accumulated_step_id}: {log_dict}") # Close out pre-hooks if using distributed optimizer and overlapped param gather. diff --git a/tests/ci/README.md b/tests/ci/README.md new file mode 100644 index 000000000..d6941e7b5 --- /dev/null +++ b/tests/ci/README.md @@ -0,0 +1,47 @@ +# Doc about CI + +## Configure GitHub secrets + +https://github.com/radixark/miles/settings/secrets/actions + +* `WANDB_API_KEY`: get from https://wandb.ai/authorize + +## Setup new GitHub runners + +### Step 1: Env + +Write `.env` mimicking `.env.example`. +The token can be found at https://github.com/radixark/miles/settings/actions/runners/new?arch=x64&os=linux. + +WARN: The `GITHUB_RUNNER_TOKEN` changes after a while. + +### Step 2: Prepare `/home/runner/externals` + +```shell +docker run --rm -it --privileged --pid=host -v /:/host_root ubuntu /bin/bash -c 'rm -rf /host_root/home/runner/externals && mkdir -p /host_root/home/runner/externals && chmod -R 777 /host_root/home/runner/externals' +docker run -d --name temp-runner ghcr.io/actions/actions-runner:2.328.0 tail -f /dev/null +docker cp temp-runner:/home/runner/externals/. 
/home/runner/externals +docker rm -f temp-runner +ls -alh /home/runner/externals +``` + +### Step 3: Run + +```shell +cd /data/tom/primary_synced/miles/tests/ci/github_runner +docker compose up -d +``` + +### Debugging + +Logs + +```shell +docker compose logs -f +``` + +Exec + +```shell +docker exec -it github_runner-runner-1 /bin/bash +``` diff --git a/tests/ci/github_runner/.env.example b/tests/ci/github_runner/.env.example new file mode 100644 index 000000000..114a1a80c --- /dev/null +++ b/tests/ci/github_runner/.env.example @@ -0,0 +1,2 @@ +GITHUB_RUNNER_URL=https://github.com/radixark/miles +GITHUB_RUNNER_TOKEN=paste-your-token-here \ No newline at end of file diff --git a/tests/ci/github_runner/.gitignore b/tests/ci/github_runner/.gitignore new file mode 100644 index 000000000..2eea525d8 --- /dev/null +++ b/tests/ci/github_runner/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/tests/ci/github_runner/docker-compose.yml b/tests/ci/github_runner/docker-compose.yml new file mode 100644 index 000000000..abe2c64bc --- /dev/null +++ b/tests/ci/github_runner/docker-compose.yml @@ -0,0 +1,29 @@ +# Please refer to `README.md` for how to setup this GitHub action runner +version: "3.9" + +services: + runner: + image: ghcr.io/actions/actions-runner:2.328.0 + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - /data/miles_ci:/data/miles_ci + # it requires this folder + - /home/runner/externals:/home/runner/externals + deploy: + # TODO 4 runner + lock gpu when running + replicas: 1 + restart: always + environment: + RUNNER_ALLOW_RUNASROOT: "1" + privileged: true + user: root + # ref: https://github.com/actions/runner/issues/367#issuecomment-2007558723 + # ref: https://github.com/actions/runner + # args ref: https://github.com/actions/runner/blob/68ff57dbc4c836d50f46602a8a53301fb9513eb4/src/Runner.Listener/CommandSettings.cs#L53 + # TODO seems we should not run config.sh repeatedly + entrypoint: > + sh -c " + cd /data/miles_ci && + /home/runner/config.sh --url ${GITHUB_RUNNER_URL} --token ${GITHUB_RUNNER_TOKEN} --unattended --work /data/miles_ci/runner_$(hostname) && + /home/runner/run.sh + " diff --git a/tests/command_utils.py b/tests/command_utils.py new file mode 100644 index 000000000..f5772ae2c --- /dev/null +++ b/tests/command_utils.py @@ -0,0 +1,108 @@ +import datetime +import json +import os +import random +import subprocess +from pathlib import Path + +repo_base_dir = Path(os.path.abspath(__file__)).resolve().parents[1] + + +def convert_checkpoint(model_name, model_type): + # TODO shall we make it in host-mapped folder and thus can cache it to speedup CI + path_dst = f"/root/{model_name}_torch_dist" + if Path(path_dst).exists(): + print(f"convert_checkpoint skip {path_dst} since exists") + return + + exec_command( + f"source {repo_base_dir}/scripts/models/{model_type}.sh && " + "PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py " + "${MODEL_ARGS[@]} " + f"--hf-checkpoint /root/models/{model_name} " + f"--save {path_dst}" + ) + + +def execute_train( + train_args: str, + num_gpus: int, + model_type: str, + master_addr: str = "127.0.0.1", +): + exec_command( + "pkill -9 sglang; " + "sleep 3; " + "ray stop --force; " + "pkill -9 ray; " + # cannot be run in CI, o/w kill the parent script + # TODO: do we really need this kill? 
(or can we instead kill miles) + # "pkill -9 python; " + "pkill -9 miles; " + "sleep 3; " + "pkill -9 ray; " + # "pkill -9 python; " + "pkill -9 miles; " + "pkill -9 redis; " + "true; " + ) + + exec_command( + # will prevent ray from buffering stdout/stderr + f"export PYTHONBUFFERED=16 && " + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus} --disable-usage-stats" + ) + + runtime_env_json = json.dumps( + { + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "NCCL_NVLS_ENABLE": str(int(check_has_nvlink())), + "no_proxy": f"127.0.0.1,{master_addr}", + } + } + ) + + exec_command( + f"export PYTHONBUFFERED=16 && " + f'source "{repo_base_dir}/scripts/models/{model_type}.sh" && ' + # TODO should this 127.0.0.1 be `master_addr` instead + f'ray job submit --address="http://127.0.0.1:8265" ' + f"--runtime-env-json='{runtime_env_json}' " + "-- python3 train.py " + "${MODEL_ARGS[@]} " + f"{train_args}" + ) + + +def check_has_nvlink(): + output = exec_command("nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l", capture_output=True) + return int(output) > 0 + + +def get_default_wandb_args(test_file: str): + if not os.environ.get("WANDB_API_KEY"): + print("Skip wandb configuration since WANDB_API_KEY is not found") + return "" + + test_name = Path(test_file).stem + + run_name = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{random.randint(0, 1000000000)}" + if (x := os.environ.get("GITHUB_COMMIT_NAME")) is not None: + run_name += f"_{x}" + + # do not put wandb_api_key value here to avoid leaking to logs explicitly + return ( + "--use-wandb " + f"--wandb-project miles-ci-{test_name} " + f"--wandb-group {run_name} " + f"--wandb-key ${{WANDB_API_KEY}} " + ) + + +def exec_command(cmd: str, capture_output: bool = False): + print(f"EXEC: {cmd}", flush=True) + result = subprocess.run(["bash", "-c", cmd], shell=False, check=True, capture_output=capture_output) + if capture_output: + return result.stdout diff --git a/tests/test-qwen2.5-0.5B-gsm8k-async.sh b/tests/test-qwen2.5-0.5B-gsm8k-async.sh deleted file mode 100644 index 8b04ab2c4..000000000 --- a/tests/test-qwen2.5-0.5B-gsm8k-async.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - - -huggingface-cli download --repo-type dataset zhuzilin/gsm8k --local-dir gsm8k - - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen2.5-0.5B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen2.5-0.5B-Instruct/ - --ref-load /root/Qwen2.5-0.5B-Instruct_torch_dist/ -) - -ROLLOUT_ARGS=( - --prompt-data gsm8k/train.parquet - --input-key messages - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type math - --num-rollout 3000 - --rollout-batch-size 32 - --n-samples-per-prompt 8 - --rollout-max-response-len 1024 - --rollout-temperature 0.8 - --rollout-num-gpus 2 - - --over-sampling-batch-size 64 - --dynamic-sampling-filter-path miles.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std - - --global-batch-size 256 -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data gsm8k gsm8k/test.parquet - --n-samples-per-eval-prompt 1 - --eval-max-response-len 1024 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 1 - --sequence-parallel - 
--pipeline-model-parallel-size 1 - --context-parallel-size 1 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - # --micro-batch-size 1 - --use-dynamic-batch-size - --max-tokens-per-gpu 9216 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -WANDB_ARGS=( - --use-wandb - --wandb-project miles-test - --wandb-group test-qwen2.5-0.5B-gsm8k -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 - --sglang-mem-fraction-static 0.7 -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM", - "CUDA_DEVICE_MAX_CONNECTIONS": "1" - } - }' \ - -- python3 train_async.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 2 \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} diff --git a/tests/test-qwen2.5-0.5B-gsm8k.sh b/tests/test-qwen2.5-0.5B-gsm8k.sh deleted file mode 100644 index fd598c454..000000000 --- a/tests/test-qwen2.5-0.5B-gsm8k.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - - -huggingface-cli download --repo-type dataset zhuzilin/gsm8k --local-dir gsm8k - - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen2.5-0.5B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen2.5-0.5B-Instruct/ - --ref-load /root/Qwen2.5-0.5B-Instruct_torch_dist/ -) - -ROLLOUT_ARGS=( - --prompt-data gsm8k/train.parquet - --input-key messages - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type math - --num-rollout 3000 - --rollout-batch-size 32 - --n-samples-per-prompt 8 - --rollout-max-response-len 1024 - --rollout-temperature 0.8 - - --over-sampling-batch-size 64 - --dynamic-sampling-filter-path miles.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std - - --global-batch-size 256 -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data gsm8k gsm8k/test.parquet - --n-samples-per-eval-prompt 1 - --eval-max-response-len 1024 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 1 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 1 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - # --micro-batch-size 1 - --use-dynamic-batch-size - --max-tokens-per-gpu 9216 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - 
--optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -WANDB_ARGS=( - --use-wandb - --wandb-project miles-test - --wandb-group test-qwen2.5-0.5B-gsm8k -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 - --sglang-mem-fraction-static 0.7 -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM", - "CUDA_DEVICE_MAX_CONNECTIONS": "1" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --colocate \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} diff --git a/tests/test_fsdp_import.py b/tests/test_fsdp_import.py deleted file mode 100644 index 66b6861ed..000000000 --- a/tests/test_fsdp_import.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - - -def test_fsdp_import(): - try: - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - except ImportError: - pytest.skip("FSDP not available in this environment") - assert FSDP is not None diff --git a/tests/test_quick_start_glm4-9B.sh b/tests/test_quick_start_glm4-9B.sh deleted file mode 100644 index 2af9e24fc..000000000 --- a/tests/test_quick_start_glm4-9B.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/bin/bash - -set -e - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) -if [ "$NVLINK_COUNT" -gt 0 ]; then - HAS_NVLINK=1 -else - HAS_NVLINK=0 -fi -echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/glm4-9B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/models/GLM-Z1-9B-0414/ - --ref-load /root/GLM-Z1-9B-0414_torch_dist - - --fp8-format e4m3 - --fp8-recipe blockwise -) - -ROLLOUT_ARGS=( - --prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - - --rm-type deepscaler - - --num-rollout 3 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 32 - --balance-data -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl - --n-samples-per-eval-prompt 1 - --eval-max-response-len 16384 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 2 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 2 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - --recompute-granularity full - --recompute-method uniform - --recompute-num-layers 1 - - --use-dynamic-batch-size - --max-tokens-per-gpu 4608 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - 
--eps-clip-high 0.28 - - --use-tis - --calculate-per-token-loss -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 2 - - --use-miles-router -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export no_proxy="127.0.0.1,${MASTER_ADDR}" -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats - -# Build the runtime environment JSON with proper variable substitution -RUNTIME_ENV_JSON="{ - \"env_vars\": { - \"PYTHONPATH\": \"/root/Megatron-LM/\", - \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", - \"no_proxy\": \"${no_proxy}\" - } -}" - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --rollout-num-gpus 4 \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} \ - --ci-test diff --git a/tests/test_quick_start_glm4_9B.py b/tests/test_quick_start_glm4_9B.py new file mode 100644 index 000000000..b8176b7b9 --- /dev/null +++ b/tests/test_quick_start_glm4_9B.py @@ -0,0 +1,127 @@ +import os + +import command_utils as U + +MODEL_NAME = "GLM-Z1-9B-0414" +MODEL_TYPE = "glm4-9B" + +ENABLE_EVAL = bool(int(os.environ.get("MILES_TEST_ENABLE_EVAL", "1"))) +TIGHT_HOST_MEMORY = bool(int(os.environ.get("MILES_TEST_TIGHT_HOST_MEMORY", "1"))) + + +def prepare(): + U.exec_command("mkdir -p /root/models /root/datasets") + U.exec_command("hf download zai-org/GLM-Z1-9B-0414 --local-dir /root/models/GLM-Z1-9B-0414") + U.exec_command("hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k") + U.exec_command("hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024") + + U.convert_checkpoint(model_name=MODEL_NAME, model_type=MODEL_TYPE) + + +def execute(): + ckpt_args = ( + f"--hf-checkpoint /root/models/{MODEL_NAME}/ " + f"--ref-load /root/{MODEL_NAME}_torch_dist " + "--fp8-format e4m3 " + "--fp8-recipe blockwise " + ) + + rollout_args = ( + "--prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl " + "--input-key prompt " + "--label-key label " + "--apply-chat-template " + "--rollout-shuffle " + "--rm-type deepscaler " + "--num-rollout 3 " + "--rollout-batch-size 8 " + "--n-samples-per-prompt 8 " + f"--rollout-max-response-len 8192 " + "--rollout-temperature 0.8 " + "--global-batch-size 32 " + "--balance-data " + ) + + eval_args = ( + f"{'--eval-interval 20 ' if ENABLE_EVAL else ''}" + "--eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl " + "--n-samples-per-eval-prompt 1 " + "--eval-max-response-len 16384 " + "--eval-top-k 1 " + ) + + perf_args = ( + "--tensor-model-parallel-size 2 " + "--sequence-parallel " + "--pipeline-model-parallel-size 1 " + "--context-parallel-size 2 " + "--expert-model-parallel-size 1 " + "--expert-tensor-parallel-size 1 " + "--recompute-granularity full 
" + "--recompute-method uniform " + "--recompute-num-layers 1 " + "--use-dynamic-batch-size " + f"--max-tokens-per-gpu {2048 if TIGHT_HOST_MEMORY else 4608} " + ) + + grpo_args = ( + "--advantage-estimator grpo " + "--use-kl-loss " + "--kl-loss-coef 0.00 " + "--kl-loss-type low_var_kl " + "--entropy-coef 0.00 " + "--eps-clip 0.2 " + "--eps-clip-high 0.28 " + "--use-tis " + "--calculate-per-token-loss " + ) + + optimizer_args = ( + "--optimizer adam " + "--lr 1e-6 " + "--lr-decay-style constant " + "--weight-decay 0.1 " + "--adam-beta1 0.9 " + "--adam-beta2 0.98 " + ) + + sglang_args = "--rollout-num-gpus-per-engine 2 " "--use-miles-router " + + misc_args = ( + # default dropout in megatron is 0.1 + "--attention-dropout 0.0 " + "--hidden-dropout 0.0 " + # should be good for model performance + "--accumulate-allreduce-grads-in-fp32 " + "--attention-softmax-in-fp32 " + # need to comment this when using model with MLA + "--attention-backend flash " + "--ci-test " + "--actor-num-nodes 1 " + "--actor-num-gpus-per-node 4 " + "--rollout-num-gpus 4 " + ) + + train_args = ( + f"{ckpt_args} " + f"{rollout_args} " + f"{optimizer_args} " + f"{grpo_args} " + f"{U.get_default_wandb_args(__file__)} " + f"{perf_args} " + f"{eval_args} " + f"{sglang_args} " + f"{misc_args} " + ) + + U.execute_train( + train_args=train_args, + num_gpus=8, + model_type=MODEL_TYPE, + ) + + +if __name__ == "__main__": + # TODO also use typer + prepare() + execute() diff --git a/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh b/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh deleted file mode 100755 index edf96f374..000000000 --- a/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# FSDP Colocated 2GPU Training Script with Weights & Biases Support -# -# This script runs FSDP training with wandb logging enabled. -# -# Wandb Configuration: -# - Rank and world size are automatically detected from distributed context -# - Only rank 0 will log to wandb to avoid duplicate entries -# - Distributed coordination handled by torch.distributed in FSDP actors -# -# To customize wandb settings: -# 1. Uncomment and set --wandb-team if you're using a team/organization (optional for personal accounts) -# 2. Set your wandb API key if needed (or use 'wandb login' beforehand) -# 3. Modify project name and group as needed -# 4. Change wandb mode to 'offline' for local logging only -# 5. 
Uncomment --wandb-dir to specify custom log directory - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen3-0.6B -) - -ROLLOUT_ARGS=( - --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type deepscaler - --num-rollout 1000 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 4096 - --rollout-temperature 0.8 - - --global-batch-size 64 -) - -GRPO_ARGS=( - --advantage-estimator grpo - #--use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - # Set equal to the number of GPUs per node for colocated mode - --rollout-num-gpus-per-engine 2 - --sglang-decode-log-interval 1000 -) - - -WANDB_ARGS=( - --use-wandb - --wandb-project "gsm8k_async_rl" - --wandb-group "fsdp-2gpu-colocated" - --wandb-mode "online" # Change to "offline" for local logging only -) - -FSDP_ARGS=( - # Set to true for FULL_STATE_DICT mode, false for SHARDED_STATE_DICT mode (default) - # --fsdp-full-params # Uncomment this line to enable full params mode - - # Set the bucket size for weight update - --update-weights-bucket-size 512 * 1024 * 1024 # 512MB -) - -# launch the master node of ray in container -ray start --head --node-ip-address 127.0.0.1 --num-gpus 2 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "no_proxy": "localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}", - "MILES_BACKEND": "fsdp" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 2 \ - --colocate \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${WANDB_ARGS[@]} \ No newline at end of file diff --git a/tests/test_qwen3-0.6B_fsdp_distributed.sh b/tests/test_qwen3-0.6B_fsdp_distributed.sh deleted file mode 100644 index 0ea417684..000000000 --- a/tests/test_qwen3-0.6B_fsdp_distributed.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen3-0.6B -) - -ROLLOUT_ARGS=( - --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type deepscaler - --num-rollout 3000 - --rollout-batch-size 16 - --n-samples-per-prompt 16 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 128 -) - -GRPO_ARGS=( - --advantage-estimator grpo - #--use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 -) - -# launch the master node of ray in container -ray start --head 
--node-ip-address 127.0.0.1 --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "no_proxy": "localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}", - "MILES_BACKEND": "fsdp" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --colocate \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${SGLANG_ARGS[@]} diff --git a/tests/test_qwen3-30B-A3B.sh b/tests/test_qwen3-30B-A3B.sh deleted file mode 100644 index 125efc827..000000000 --- a/tests/test_qwen3-30B-A3B.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash - -set -e - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) -if [ "$NVLINK_COUNT" -gt 0 ]; then - HAS_NVLINK=1 -else - HAS_NVLINK=0 -fi -echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen3-30B-A3B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/models/Qwen3-30B-A3B-FP8 - --ref-load /root/Qwen3-30B-A3B_torch_dist -) - -ROLLOUT_ARGS=( - --prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - - --rm-type deepscaler - - --num-rollout 3 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 32 - --balance-data -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl - --n-samples-per-eval-prompt 1 - --eval-max-response-len 16384 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 4 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 2 - --expert-model-parallel-size 8 - --expert-tensor-parallel-size 1 - - --recompute-granularity full - --recompute-method uniform - --recompute-num-layers 1 - - --use-dynamic-batch-size - --max-tokens-per-gpu 16384 -) - -GRPO_ARGS=( - --advantage-estimator gspo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 4e-4 - - --use-tis - --use-routing-replay -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 - - --optimizer-cpu-offload - --overlap-cpu-optimizer-d2h-h2d - --use-precision-aware-optimizer -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 8 - --sglang-mem-fraction-static 0.8 - - --sglang-moe-a2a-backend deepep - --sglang-deepep-mode auto - - --sglang-max-running-requests 512 - --sglang-disable-radix-cache -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash - - --moe-token-dispatcher-type flex - --moe-enable-deepep -) - -# launch the master node of ray in container -export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export no_proxy="127.0.0.1,${MASTER_ADDR}" -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats - -export CUDA_HOME=${CUDA_HOME:-"/usr/local/cuda"} -# Build the runtime environment JSON with proper variable substitution 
-RUNTIME_ENV_JSON="{ - \"env_vars\": { - \"PYTHONPATH\": \"/root/Megatron-LM/\", - \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", - \"no_proxy\": \"${no_proxy}\", - \"CUDA_HOME\": \"${CUDA_HOME}\" - } -}" - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 8 \ - --colocate \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} \ - --ci-test diff --git a/tests/test_qwen3_30B_A3B.py b/tests/test_qwen3_30B_A3B.py new file mode 100644 index 000000000..db4f01d41 --- /dev/null +++ b/tests/test_qwen3_30B_A3B.py @@ -0,0 +1,136 @@ +import os + +import command_utils as U + +MODEL_NAME = "Qwen3-30B-A3B" +MODEL_TYPE = "qwen3-30B-A3B" + + +ENABLE_EVAL = bool(int(os.environ.get("MILES_TEST_ENABLE_EVAL", "1"))) +TIGHT_HOST_MEMORY = bool(int(os.environ.get("MILES_TEST_TIGHT_HOST_MEMORY", "1"))) + + +def prepare(): + U.exec_command("mkdir -p /root/models /root/datasets") + U.exec_command("hf download Qwen/Qwen3-30B-A3B --local-dir /root/models/Qwen3-30B-A3B") + U.exec_command("hf download Qwen/Qwen3-30B-A3B-FP8 --local-dir /root/models/Qwen3-30B-A3B-FP8") + U.exec_command("hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k") + U.exec_command("hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024") + + U.convert_checkpoint(model_name=MODEL_NAME, model_type=MODEL_TYPE) + + +def execute(): + ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME}-FP8 " f"--ref-load /root/{MODEL_NAME}_torch_dist " + + rollout_args = ( + "--prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl " + "--input-key prompt " + "--label-key label " + "--apply-chat-template " + "--rollout-shuffle " + "--rm-type deepscaler " + "--num-rollout 3 " + "--rollout-batch-size 8 " + "--n-samples-per-prompt 8 " + "--rollout-max-response-len 8192 " + "--rollout-temperature 0.8 " + "--global-batch-size 32 " + "--balance-data " + ) + + eval_args = ( + f"{'--eval-interval 20 ' if ENABLE_EVAL else ''}" + "--eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl " + "--n-samples-per-eval-prompt 1 " + "--eval-max-response-len 16384 " + "--eval-top-k 1 " + ) + + perf_args = ( + "--tensor-model-parallel-size 4 " + "--sequence-parallel " + "--pipeline-model-parallel-size 1 " + "--context-parallel-size 2 " + "--expert-model-parallel-size 8 " + "--expert-tensor-parallel-size 1 " + "--recompute-granularity full " + "--recompute-method uniform " + "--recompute-num-layers 1 " + "--use-dynamic-batch-size " + f"--max-tokens-per-gpu {2048 if TIGHT_HOST_MEMORY else 16384} " + ) + + grpo_args = ( + "--advantage-estimator gspo " + f"{'' if TIGHT_HOST_MEMORY else '--use-kl-loss '}" + "--kl-loss-coef 0.00 " + "--kl-loss-type low_var_kl " + "--kl-coef 0.00 " + "--entropy-coef 0.00 " + "--eps-clip 4e-4 " + "--use-tis " + "--use-routing-replay " + ) + + optimizer_args = ( + "--optimizer adam " + "--lr 1e-6 " + "--lr-decay-style constant " + "--weight-decay 0.1 " + "--adam-beta1 0.9 " + "--adam-beta2 0.98 " + "--optimizer-cpu-offload " + "--overlap-cpu-optimizer-d2h-h2d " + "--use-precision-aware-optimizer " + ) + + sglang_args = ( + "--rollout-num-gpus-per-engine 8 " + "--sglang-mem-fraction-static 0.8 " + "--sglang-moe-a2a-backend deepep " 
+ "--sglang-deepep-mode auto " + "--sglang-max-running-requests 512 " + "--sglang-disable-radix-cache " + ) + + misc_args = ( + # default dropout in megatron is 0.1 + "--attention-dropout 0.0 " + "--hidden-dropout 0.0 " + # should be good for model performance + "--accumulate-allreduce-grads-in-fp32 " + "--attention-softmax-in-fp32 " + # need to comment this when using model with MLA + "--attention-backend flash " + "--moe-token-dispatcher-type flex " + "--moe-enable-deepep " + "--ci-test " + "--actor-num-nodes 1 " + "--actor-num-gpus-per-node 8 " + "--colocate " + ) + + train_args = ( + f"{ckpt_args} " + f"{rollout_args} " + f"{optimizer_args} " + f"{grpo_args} " + f"{U.get_default_wandb_args(__file__)} " + f"{perf_args} " + f"{eval_args} " + f"{sglang_args} " + f"{misc_args} " + ) + + U.execute_train( + train_args=train_args, + num_gpus=8, + model_type=MODEL_TYPE, + ) + + +if __name__ == "__main__": + # TODO also use typer + prepare() + execute()
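
For reference, the new workflow runs each matrix entry as a plain Python script inside the `slimerl/slime:latest` container. A minimal sketch of reproducing one entry locally, assuming the same `/root/models` and `/root/datasets` mounts that the CI runner provides (wandb logging is skipped automatically when `WANDB_API_KEY` is unset):

```shell
# Mirrors the "Install" and "Execute" steps of .github/workflows/pr-test.yml,
# run from the repository root inside the training container.
pip install -e .

# Pick one file from the workflow matrix, e.g. the GLM-4 9B quick-start test;
# its prepare() step downloads the model and datasets and converts the checkpoint,
# and execute() starts ray and submits the training job.
python tests/test_quick_start_glm4_9B.py
```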