diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 885127aa9..177103b49 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -48,7 +48,7 @@ jobs: strategy: fail-fast: false matrix: - info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}] + info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}] defaults: run: working-directory: ${{ github.workspace }} @@ -95,7 +95,7 @@ jobs: strategy: fail-fast: false matrix: - info: [{"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] + info: [{"num_gpus": 4, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py --colocated"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] defaults: run: working-directory: ${{ github.workspace }} @@ -283,7 +283,7 @@ jobs: strategy: fail-fast: false matrix: - info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}] + info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}] defaults: run: working-directory: ${{ github.workspace }} diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 507270af0..93be071c1 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -4,13 +4,12 @@ 'tests': [ {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4}, - {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, ], }, 'e2e-test-fsdp': { 'label': 'run-ci-fsdp', 'tests': [ - {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py --colocated', 'num_gpus': 4}, {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8}, {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, @@ -48,6 +47,8 @@ 'tests': [ {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2}, {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, ], }, 'e2e-test-image': { diff --git a/build_conda.sh b/build_conda.sh index 69ea1818e..43564311a 100644 --- a/build_conda.sh +++ b/build_conda.sh @@ -74,6 +74,7 @@ fi # https://github.com/pytorch/pytorch/issues/168167 pip install nvidia-cudnn-cu12==9.16.0.29 +pip install "numpy<2" # apply patch cd $BASE_DIR/sglang diff --git a/setup.py b/setup.py index d0f3a7894..91df81c5e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_tag(self): setup( author="slime Team", name="slime", - version="0.2.1", + version="0.2.2", packages=find_packages(include=["slime*", "slime_plugins*"]), include_package_data=True, install_requires=_fetch_requirements("requirements.txt"), diff --git a/tests/test_qwen3_4B_fsdp_true_on_policy.py b/tests/test_qwen3_4B_fsdp_true_on_policy.py index b3c9c7007..3a042ccef 100644 --- a/tests/test_qwen3_4B_fsdp_true_on_policy.py +++ b/tests/test_qwen3_4B_fsdp_true_on_policy.py @@ -1,11 +1,15 @@ import os +from argparse import ArgumentParser import slime.utils.external_utils.command_utils as U ENABLE_EVAL = bool(int(os.environ.get("SLIME_TEST_ENABLE_EVAL", "1"))) -NUM_GPUS = 2 +NUM_GPUS = 4 MODEL_NAME = "Qwen3-4B" +parser = ArgumentParser() +parser.add_argument("--colocated", action="store_true", help="Whether to run with colocate.") + def prepare(): U.exec_command("mkdir -p /root/models /root/datasets") @@ -14,7 +18,7 @@ def prepare(): U.hf_download_dataset("zhuzilin/aime-2024") -def execute(): +def execute(args): ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} " rollout_args = ( @@ -29,7 +33,7 @@ def execute(): "--n-samples-per-prompt 8 " "--rollout-max-response-len 4096 " "--rollout-temperature 1 " - "--global-batch-size 32 " + "--global-batch-size 64 " ) eval_args = ( @@ -75,7 +79,12 @@ def execute(): ci_args = "--ci-test " - misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate " + if args.colocated: + misc_args = f"--actor-num-nodes 1 --actor-num-gpus-per-node {NUM_GPUS} --colocate " + else: + misc_args = ( + f"--actor-num-nodes 1 --actor-num-gpus-per-node {NUM_GPUS // 2} --rollout-num-gpus {NUM_GPUS // 2} " + ) train_args = ( f"{ckpt_args} " @@ -106,7 +115,8 @@ def execute(): if __name__ == "__main__": + args = parser.parse_args() prepare() for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): os.environ.pop(proxy_var, None) - execute() + execute(args) diff --git a/tests/test_qwen3_vl_4B_fsdp.py b/tests/test_qwen3_vl_4B_fsdp.py index 7137deb54..9b780432a 100644 --- a/tests/test_qwen3_vl_4B_fsdp.py +++ b/tests/test_qwen3_vl_4B_fsdp.py @@ -68,8 +68,13 @@ def execute(): "--sglang-mem-fraction-static 0.6 " "--sglang-decode-log-interval 1000 " "--sglang-enable-metrics " + # "--sglang-enable-deterministic-inference " + # "--sglang-rl-on-policy-target fsdp " "--sglang-attention-backend fa3 " "--attn-implementation flash_attention_3 " + "--sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 " + # "--deterministic-mode " + # "--true-on-policy-mode " ) ci_args = "--ci-test " @@ -91,6 +96,9 @@ def execute(): ) extra_env_vars = { + # "NCCL_ALGO": "allreduce:tree", + # "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0", + # "CUBLAS_WORKSPACE_CONFIG": ":4096:8", "CUDA_DEVICE_MAX_CONNECTIONS": "1", } @@ -104,8 +112,6 @@ def execute(): if __name__ == "__main__": prepare() - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - os.environ.pop("HTTP_PROXY", None) - os.environ.pop("HTTPS_PROXY", None) + for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): + os.environ.pop(proxy_var, None) execute()