THUDM · zhuzilin · Jan 18, 2026 · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -48,7 +48,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}]
+        info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}]
     defaults:
       run:
         working-directory: ${{ github.workspace }}
@@ -95,7 +95,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        info: [{"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}]
+        info: [{"num_gpus": 4, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py --colocated"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}]
     defaults:
       run:
         working-directory: ${{ github.workspace }}
@@ -283,7 +283,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}]
+        info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}]
     defaults:
       run:
         working-directory: ${{ github.workspace }}

diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2
@@ -4,13 +4,12 @@
       'tests': [
         {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4},
         {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4},
-        {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2},
       ],
     },
     'e2e-test-fsdp': {
       'label': 'run-ci-fsdp',
       'tests': [
-        {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2},
+        {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py --colocated', 'num_gpus': 4},
         {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8},
         {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2},
         {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4},
@@ -48,6 +47,8 @@
       'tests': [
         {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2},
         {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2},
+        {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2},
+        {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2},
       ],
     },
     'e2e-test-image': {

diff --git a/build_conda.sh b/build_conda.sh
@@ -74,6 +74,7 @@ fi
 
 # https://github.com/pytorch/pytorch/issues/168167
 pip install nvidia-cudnn-cu12==9.16.0.29
+pip install "numpy<2"
 
 # apply patch
 cd $BASE_DIR/sglang

diff --git a/setup.py b/setup.py
@@ -32,7 +32,7 @@ def get_tag(self):
 setup(
     author="slime Team",
     name="slime",
-    version="0.2.1",
+    version="0.2.2",
     packages=find_packages(include=["slime*", "slime_plugins*"]),
     include_package_data=True,
     install_requires=_fetch_requirements("requirements.txt"),

diff --git a/tests/test_qwen3_4B_fsdp_true_on_policy.py b/tests/test_qwen3_4B_fsdp_true_on_policy.py
@@ -1,11 +1,15 @@
 import os
+from argparse import ArgumentParser
 import slime.utils.external_utils.command_utils as U
 
 ENABLE_EVAL = bool(int(os.environ.get("SLIME_TEST_ENABLE_EVAL", "1")))
-NUM_GPUS = 2
+NUM_GPUS = 4
 
 MODEL_NAME = "Qwen3-4B"
 
+parser = ArgumentParser()
+parser.add_argument("--colocated", action="store_true", help="Whether to run with colocate.")
+
 
 def prepare():
     U.exec_command("mkdir -p /root/models /root/datasets")
@@ -14,7 +18,7 @@ def prepare():
     U.hf_download_dataset("zhuzilin/aime-2024")
 
 
-def execute():
+def execute(args):
     ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} "
 
     rollout_args = (
@@ -29,7 +33,7 @@ def execute():
         "--n-samples-per-prompt 8 "
         "--rollout-max-response-len 4096 "
         "--rollout-temperature 1 "
-        "--global-batch-size 32 "
+        "--global-batch-size 64 "
     )
 
     eval_args = (
@@ -75,7 +79,12 @@ def execute():
 
     ci_args = "--ci-test "
 
-    misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate "
+    if args.colocated:
+        misc_args = f"--actor-num-nodes 1 --actor-num-gpus-per-node {NUM_GPUS} --colocate "
+    else:
+        misc_args = (
+            f"--actor-num-nodes 1 --actor-num-gpus-per-node {NUM_GPUS // 2} --rollout-num-gpus {NUM_GPUS // 2} "
+        )
 
     train_args = (
         f"{ckpt_args} "
@@ -106,7 +115,8 @@ def execute():
 
 
 if __name__ == "__main__":
+    args = parser.parse_args()
     prepare()
     for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"):
         os.environ.pop(proxy_var, None)
-    execute()
+    execute(args)
diff --git a/tests/test_qwen3_vl_4B_fsdp.py b/tests/test_qwen3_vl_4B_fsdp.py
@@ -68,8 +68,13 @@ def execute():
         "--sglang-mem-fraction-static 0.6 "
         "--sglang-decode-log-interval 1000 "
         "--sglang-enable-metrics "
+        # "--sglang-enable-deterministic-inference "
+        # "--sglang-rl-on-policy-target fsdp "
         "--sglang-attention-backend fa3 "
         "--attn-implementation flash_attention_3 "
+        "--sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 "
+        # "--deterministic-mode "
+        # "--true-on-policy-mode "
     )
 
     ci_args = "--ci-test "
@@ -91,6 +96,9 @@ def execute():
     )
 
     extra_env_vars = {
+        # "NCCL_ALGO": "allreduce:tree",
+        # "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
+        # "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
         "CUDA_DEVICE_MAX_CONNECTIONS": "1",
     }
 
@@ -104,8 +112,6 @@ def execute():
 
 if __name__ == "__main__":
     prepare()
-    os.environ.pop("http_proxy", None)
-    os.environ.pop("https_proxy", None)
-    os.environ.pop("HTTP_PROXY", None)
-    os.environ.pop("HTTPS_PROXY", None)
+    for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"):
+        os.environ.pop(proxy_var, None)
     execute()