diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml new file mode 100644 index 000000000..f1c0dc7e6 --- /dev/null +++ b/.github/workflows/pr-test.yml @@ -0,0 +1,56 @@ +name: PR Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + types: [synchronize, labeled] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + # TODO may use run-ci label etc + if: github.event.pull_request.draft == false + runs-on: self-hosted + container: + image: slimerl/slime:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -v /data/miles_ci:/data/miles_ci + -v /data/miles_ci/models:/root/models + -v /data/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: + - {test_file: test_quick_start_glm4_9B.py} + - {test_file: test_qwen3_30B_A3B.py} + # TODO use deterministic kernel + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . + + - name: Execute + shell: bash + run: python tests/${{ matrix.info.test_file }} diff --git a/miles/backends/megatron_utils/model.py b/miles/backends/megatron_utils/model.py index e0b939121..0fa9f9145 100644 --- a/miles/backends/megatron_utils/model.py +++ b/miles/backends/megatron_utils/model.py @@ -488,9 +488,9 @@ def train(rollout_id, model, optimizer, opt_param_scheduler, data_iterator, num_ if args.ci_test: if step_id == 0 and "train/ppo_kl" in log_dict and "train/pg_clipfrac" in log_dict: - assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0 + assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0, f"{loss_dict=}" if accumulated_step_id == 0 and "train/kl_loss" in log_dict: - assert log_dict["train/kl_loss"] == 0.0 + assert log_dict["train/kl_loss"] == 0.0, f"{loss_dict=}" print(f"{role_tag}step {accumulated_step_id}: {log_dict}") # Close out pre-hooks if using distributed optimizer and overlapped param gather. diff --git a/tests/ci/README.md b/tests/ci/README.md new file mode 100644 index 000000000..d6941e7b5 --- /dev/null +++ b/tests/ci/README.md @@ -0,0 +1,47 @@ +# Doc about CI + +## Configure GitHub secrets + +https://github.com/radixark/miles/settings/secrets/actions + +* `WANDB_API_KEY`: get from https://wandb.ai/authorize + +## Setup new GitHub runners + +### Step 1: Env + +Write `.env` mimicking `.env.example`. +The token can be found at https://github.com/radixark/miles/settings/actions/runners/new?arch=x64&os=linux. + +WARN: The `GITHUB_RUNNER_TOKEN` changes after a while. + +### Step 2: Prepare `/home/runner/externals` + +```shell +docker run --rm -it --privileged --pid=host -v /:/host_root ubuntu /bin/bash -c 'rm -rf /host_root/home/runner/externals && mkdir -p /host_root/home/runner/externals && chmod -R 777 /host_root/home/runner/externals' +docker run -d --name temp-runner ghcr.io/actions/actions-runner:2.328.0 tail -f /dev/null +docker cp temp-runner:/home/runner/externals/. 
/home/runner/externals +docker rm -f temp-runner +ls -alh /home/runner/externals +``` + +### Step 3: Run + +```shell +cd /data/tom/primary_synced/miles/tests/ci/github_runner +docker compose up -d +``` + +### Debugging + +Logs + +```shell +docker compose logs -f +``` + +Exec + +```shell +docker exec -it github_runner-runner-1 /bin/bash +``` diff --git a/tests/ci/github_runner/.env.example b/tests/ci/github_runner/.env.example new file mode 100644 index 000000000..114a1a80c --- /dev/null +++ b/tests/ci/github_runner/.env.example @@ -0,0 +1,2 @@ +GITHUB_RUNNER_URL=https://github.com/radixark/miles +GITHUB_RUNNER_TOKEN=paste-your-token-here \ No newline at end of file diff --git a/tests/ci/github_runner/.gitignore b/tests/ci/github_runner/.gitignore new file mode 100644 index 000000000..2eea525d8 --- /dev/null +++ b/tests/ci/github_runner/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/tests/ci/github_runner/docker-compose.yml b/tests/ci/github_runner/docker-compose.yml new file mode 100644 index 000000000..abe2c64bc --- /dev/null +++ b/tests/ci/github_runner/docker-compose.yml @@ -0,0 +1,29 @@ +# Please refer to `README.md` for how to setup this GitHub action runner +version: "3.9" + +services: + runner: + image: ghcr.io/actions/actions-runner:2.328.0 + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - /data/miles_ci:/data/miles_ci + # it requires this folder + - /home/runner/externals:/home/runner/externals + deploy: + # TODO 4 runner + lock gpu when running + replicas: 1 + restart: always + environment: + RUNNER_ALLOW_RUNASROOT: "1" + privileged: true + user: root + # ref: https://github.com/actions/runner/issues/367#issuecomment-2007558723 + # ref: https://github.com/actions/runner + # args ref: https://github.com/actions/runner/blob/68ff57dbc4c836d50f46602a8a53301fb9513eb4/src/Runner.Listener/CommandSettings.cs#L53 + # TODO seems we should not run config.sh repeatedly + entrypoint: > + sh -c " + cd /data/miles_ci && + /home/runner/config.sh --url ${GITHUB_RUNNER_URL} --token ${GITHUB_RUNNER_TOKEN} --unattended --work /data/miles_ci/runner_$(hostname) && + /home/runner/run.sh + " diff --git a/tests/command_utils.py b/tests/command_utils.py new file mode 100644 index 000000000..f5772ae2c --- /dev/null +++ b/tests/command_utils.py @@ -0,0 +1,108 @@ +import datetime +import json +import os +import random +import subprocess +from pathlib import Path + +repo_base_dir = Path(os.path.abspath(__file__)).resolve().parents[1] + + +def convert_checkpoint(model_name, model_type): + # TODO shall we make it in host-mapped folder and thus can cache it to speedup CI + path_dst = f"/root/{model_name}_torch_dist" + if Path(path_dst).exists(): + print(f"convert_checkpoint skip {path_dst} since exists") + return + + exec_command( + f"source {repo_base_dir}/scripts/models/{model_type}.sh && " + "PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py " + "${MODEL_ARGS[@]} " + f"--hf-checkpoint /root/models/{model_name} " + f"--save {path_dst}" + ) + + +def execute_train( + train_args: str, + num_gpus: int, + model_type: str, + master_addr: str = "127.0.0.1", +): + exec_command( + "pkill -9 sglang; " + "sleep 3; " + "ray stop --force; " + "pkill -9 ray; " + # cannot be run in CI, o/w kill the parent script + # TODO: do we really need this kill? 
(or can we instead kill miles) + # "pkill -9 python; " + "pkill -9 miles; " + "sleep 3; " + "pkill -9 ray; " + # "pkill -9 python; " + "pkill -9 miles; " + "pkill -9 redis; " + "true; " + ) + + exec_command( + # will prevent ray from buffering stdout/stderr + f"export PYTHONBUFFERED=16 && " + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus} --disable-usage-stats" + ) + + runtime_env_json = json.dumps( + { + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "NCCL_NVLS_ENABLE": str(int(check_has_nvlink())), + "no_proxy": f"127.0.0.1,{master_addr}", + } + } + ) + + exec_command( + f"export PYTHONBUFFERED=16 && " + f'source "{repo_base_dir}/scripts/models/{model_type}.sh" && ' + # TODO should this 127.0.0.1 be `master_addr` instead + f'ray job submit --address="http://127.0.0.1:8265" ' + f"--runtime-env-json='{runtime_env_json}' " + "-- python3 train.py " + "${MODEL_ARGS[@]} " + f"{train_args}" + ) + + +def check_has_nvlink(): + output = exec_command("nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l", capture_output=True) + return int(output) > 0 + + +def get_default_wandb_args(test_file: str): + if not os.environ.get("WANDB_API_KEY"): + print("Skip wandb configuration since WANDB_API_KEY is not found") + return "" + + test_name = Path(test_file).stem + + run_name = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{random.randint(0, 1000000000)}" + if (x := os.environ.get("GITHUB_COMMIT_NAME")) is not None: + run_name += f"_{x}" + + # do not put wandb_api_key value here to avoid leaking to logs explicitly + return ( + "--use-wandb " + f"--wandb-project miles-ci-{test_name} " + f"--wandb-group {run_name} " + f"--wandb-key ${{WANDB_API_KEY}} " + ) + + +def exec_command(cmd: str, capture_output: bool = False): + print(f"EXEC: {cmd}", flush=True) + result = subprocess.run(["bash", "-c", cmd], shell=False, check=True, capture_output=capture_output) + if capture_output: + return result.stdout diff --git a/tests/test-qwen2.5-0.5B-gsm8k-async.sh b/tests/test-qwen2.5-0.5B-gsm8k-async.sh deleted file mode 100644 index 8b04ab2c4..000000000 --- a/tests/test-qwen2.5-0.5B-gsm8k-async.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - - -huggingface-cli download --repo-type dataset zhuzilin/gsm8k --local-dir gsm8k - - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen2.5-0.5B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen2.5-0.5B-Instruct/ - --ref-load /root/Qwen2.5-0.5B-Instruct_torch_dist/ -) - -ROLLOUT_ARGS=( - --prompt-data gsm8k/train.parquet - --input-key messages - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type math - --num-rollout 3000 - --rollout-batch-size 32 - --n-samples-per-prompt 8 - --rollout-max-response-len 1024 - --rollout-temperature 0.8 - --rollout-num-gpus 2 - - --over-sampling-batch-size 64 - --dynamic-sampling-filter-path miles.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std - - --global-batch-size 256 -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data gsm8k gsm8k/test.parquet - --n-samples-per-eval-prompt 1 - --eval-max-response-len 1024 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 1 - --sequence-parallel - 
--pipeline-model-parallel-size 1 - --context-parallel-size 1 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - # --micro-batch-size 1 - --use-dynamic-batch-size - --max-tokens-per-gpu 9216 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -WANDB_ARGS=( - --use-wandb - --wandb-project miles-test - --wandb-group test-qwen2.5-0.5B-gsm8k -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 - --sglang-mem-fraction-static 0.7 -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM", - "CUDA_DEVICE_MAX_CONNECTIONS": "1" - } - }' \ - -- python3 train_async.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 2 \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} diff --git a/tests/test-qwen2.5-0.5B-gsm8k.sh b/tests/test-qwen2.5-0.5B-gsm8k.sh deleted file mode 100644 index fd598c454..000000000 --- a/tests/test-qwen2.5-0.5B-gsm8k.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - - -huggingface-cli download --repo-type dataset zhuzilin/gsm8k --local-dir gsm8k - - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen2.5-0.5B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen2.5-0.5B-Instruct/ - --ref-load /root/Qwen2.5-0.5B-Instruct_torch_dist/ -) - -ROLLOUT_ARGS=( - --prompt-data gsm8k/train.parquet - --input-key messages - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type math - --num-rollout 3000 - --rollout-batch-size 32 - --n-samples-per-prompt 8 - --rollout-max-response-len 1024 - --rollout-temperature 0.8 - - --over-sampling-batch-size 64 - --dynamic-sampling-filter-path miles.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std - - --global-batch-size 256 -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data gsm8k gsm8k/test.parquet - --n-samples-per-eval-prompt 1 - --eval-max-response-len 1024 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 1 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 1 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - # --micro-batch-size 1 - --use-dynamic-batch-size - --max-tokens-per-gpu 9216 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - 
--optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -WANDB_ARGS=( - --use-wandb - --wandb-project miles-test - --wandb-group test-qwen2.5-0.5B-gsm8k -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 - --sglang-mem-fraction-static 0.7 -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM", - "CUDA_DEVICE_MAX_CONNECTIONS": "1" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --colocate \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} diff --git a/tests/test_fsdp_import.py b/tests/test_fsdp_import.py deleted file mode 100644 index 66b6861ed..000000000 --- a/tests/test_fsdp_import.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - - -def test_fsdp_import(): - try: - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - except ImportError: - pytest.skip("FSDP not available in this environment") - assert FSDP is not None diff --git a/tests/test_quick_start_glm4-9B.sh b/tests/test_quick_start_glm4-9B.sh deleted file mode 100644 index 2af9e24fc..000000000 --- a/tests/test_quick_start_glm4-9B.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/bin/bash - -set -e - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) -if [ "$NVLINK_COUNT" -gt 0 ]; then - HAS_NVLINK=1 -else - HAS_NVLINK=0 -fi -echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/glm4-9B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/models/GLM-Z1-9B-0414/ - --ref-load /root/GLM-Z1-9B-0414_torch_dist - - --fp8-format e4m3 - --fp8-recipe blockwise -) - -ROLLOUT_ARGS=( - --prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - - --rm-type deepscaler - - --num-rollout 3 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 32 - --balance-data -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl - --n-samples-per-eval-prompt 1 - --eval-max-response-len 16384 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 2 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 2 - --expert-model-parallel-size 1 - --expert-tensor-parallel-size 1 - - --recompute-granularity full - --recompute-method uniform - --recompute-num-layers 1 - - --use-dynamic-batch-size - --max-tokens-per-gpu 4608 -) - -GRPO_ARGS=( - --advantage-estimator grpo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --entropy-coef 0.00 - --eps-clip 0.2 - 
--eps-clip-high 0.28 - - --use-tis - --calculate-per-token-loss -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 2 - - --use-miles-router -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash -) - -# launch the master node of ray in container -export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export no_proxy="127.0.0.1,${MASTER_ADDR}" -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats - -# Build the runtime environment JSON with proper variable substitution -RUNTIME_ENV_JSON="{ - \"env_vars\": { - \"PYTHONPATH\": \"/root/Megatron-LM/\", - \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", - \"no_proxy\": \"${no_proxy}\" - } -}" - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --rollout-num-gpus 4 \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} \ - --ci-test diff --git a/tests/test_quick_start_glm4_9B.py b/tests/test_quick_start_glm4_9B.py new file mode 100644 index 000000000..b8176b7b9 --- /dev/null +++ b/tests/test_quick_start_glm4_9B.py @@ -0,0 +1,127 @@ +import os + +import command_utils as U + +MODEL_NAME = "GLM-Z1-9B-0414" +MODEL_TYPE = "glm4-9B" + +ENABLE_EVAL = bool(int(os.environ.get("MILES_TEST_ENABLE_EVAL", "1"))) +TIGHT_HOST_MEMORY = bool(int(os.environ.get("MILES_TEST_TIGHT_HOST_MEMORY", "1"))) + + +def prepare(): + U.exec_command("mkdir -p /root/models /root/datasets") + U.exec_command("hf download zai-org/GLM-Z1-9B-0414 --local-dir /root/models/GLM-Z1-9B-0414") + U.exec_command("hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k") + U.exec_command("hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024") + + U.convert_checkpoint(model_name=MODEL_NAME, model_type=MODEL_TYPE) + + +def execute(): + ckpt_args = ( + f"--hf-checkpoint /root/models/{MODEL_NAME}/ " + f"--ref-load /root/{MODEL_NAME}_torch_dist " + "--fp8-format e4m3 " + "--fp8-recipe blockwise " + ) + + rollout_args = ( + "--prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl " + "--input-key prompt " + "--label-key label " + "--apply-chat-template " + "--rollout-shuffle " + "--rm-type deepscaler " + "--num-rollout 3 " + "--rollout-batch-size 8 " + "--n-samples-per-prompt 8 " + f"--rollout-max-response-len 8192 " + "--rollout-temperature 0.8 " + "--global-batch-size 32 " + "--balance-data " + ) + + eval_args = ( + f"{'--eval-interval 20 ' if ENABLE_EVAL else ''}" + "--eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl " + "--n-samples-per-eval-prompt 1 " + "--eval-max-response-len 16384 " + "--eval-top-k 1 " + ) + + perf_args = ( + "--tensor-model-parallel-size 2 " + "--sequence-parallel " + "--pipeline-model-parallel-size 1 " + "--context-parallel-size 2 " + "--expert-model-parallel-size 1 " + "--expert-tensor-parallel-size 1 " + "--recompute-granularity full 
" + "--recompute-method uniform " + "--recompute-num-layers 1 " + "--use-dynamic-batch-size " + f"--max-tokens-per-gpu {2048 if TIGHT_HOST_MEMORY else 4608} " + ) + + grpo_args = ( + "--advantage-estimator grpo " + "--use-kl-loss " + "--kl-loss-coef 0.00 " + "--kl-loss-type low_var_kl " + "--entropy-coef 0.00 " + "--eps-clip 0.2 " + "--eps-clip-high 0.28 " + "--use-tis " + "--calculate-per-token-loss " + ) + + optimizer_args = ( + "--optimizer adam " + "--lr 1e-6 " + "--lr-decay-style constant " + "--weight-decay 0.1 " + "--adam-beta1 0.9 " + "--adam-beta2 0.98 " + ) + + sglang_args = "--rollout-num-gpus-per-engine 2 " "--use-miles-router " + + misc_args = ( + # default dropout in megatron is 0.1 + "--attention-dropout 0.0 " + "--hidden-dropout 0.0 " + # should be good for model performance + "--accumulate-allreduce-grads-in-fp32 " + "--attention-softmax-in-fp32 " + # need to comment this when using model with MLA + "--attention-backend flash " + "--ci-test " + "--actor-num-nodes 1 " + "--actor-num-gpus-per-node 4 " + "--rollout-num-gpus 4 " + ) + + train_args = ( + f"{ckpt_args} " + f"{rollout_args} " + f"{optimizer_args} " + f"{grpo_args} " + f"{U.get_default_wandb_args(__file__)} " + f"{perf_args} " + f"{eval_args} " + f"{sglang_args} " + f"{misc_args} " + ) + + U.execute_train( + train_args=train_args, + num_gpus=8, + model_type=MODEL_TYPE, + ) + + +if __name__ == "__main__": + # TODO also use typer + prepare() + execute() diff --git a/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh b/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh deleted file mode 100755 index edf96f374..000000000 --- a/tests/test_qwen3-0.6B_fsdp_colocated_2xGPU.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# FSDP Colocated 2GPU Training Script with Weights & Biases Support -# -# This script runs FSDP training with wandb logging enabled. -# -# Wandb Configuration: -# - Rank and world size are automatically detected from distributed context -# - Only rank 0 will log to wandb to avoid duplicate entries -# - Distributed coordination handled by torch.distributed in FSDP actors -# -# To customize wandb settings: -# 1. Uncomment and set --wandb-team if you're using a team/organization (optional for personal accounts) -# 2. Set your wandb API key if needed (or use 'wandb login' beforehand) -# 3. Modify project name and group as needed -# 4. Change wandb mode to 'offline' for local logging only -# 5. 
Uncomment --wandb-dir to specify custom log directory - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen3-0.6B -) - -ROLLOUT_ARGS=( - --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type deepscaler - --num-rollout 1000 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 4096 - --rollout-temperature 0.8 - - --global-batch-size 64 -) - -GRPO_ARGS=( - --advantage-estimator grpo - #--use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - # Set equal to the number of GPUs per node for colocated mode - --rollout-num-gpus-per-engine 2 - --sglang-decode-log-interval 1000 -) - - -WANDB_ARGS=( - --use-wandb - --wandb-project "gsm8k_async_rl" - --wandb-group "fsdp-2gpu-colocated" - --wandb-mode "online" # Change to "offline" for local logging only -) - -FSDP_ARGS=( - # Set to true for FULL_STATE_DICT mode, false for SHARDED_STATE_DICT mode (default) - # --fsdp-full-params # Uncomment this line to enable full params mode - - # Set the bucket size for weight update - --update-weights-bucket-size 512 * 1024 * 1024 # 512MB -) - -# launch the master node of ray in container -ray start --head --node-ip-address 127.0.0.1 --num-gpus 2 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "no_proxy": "localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}", - "MILES_BACKEND": "fsdp" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 2 \ - --colocate \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${WANDB_ARGS[@]} \ No newline at end of file diff --git a/tests/test_qwen3-0.6B_fsdp_distributed.sh b/tests/test_qwen3-0.6B_fsdp_distributed.sh deleted file mode 100644 index 0ea417684..000000000 --- a/tests/test_qwen3-0.6B_fsdp_distributed.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -# for rerun the task -pkill -9 sglang -sleep 3 -ray stop --force -pkill -9 ray -pkill -9 python -sleep 3 -pkill -9 ray -pkill -9 python - -set -ex - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -CKPT_ARGS=( - --hf-checkpoint /root/Qwen3-0.6B -) - -ROLLOUT_ARGS=( - --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - --rm-type deepscaler - --num-rollout 3000 - --rollout-batch-size 16 - --n-samples-per-prompt 16 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 128 -) - -GRPO_ARGS=( - --advantage-estimator grpo - #--use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 0.2 - --eps-clip-high 0.28 -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 1 -) - -# launch the master node of ray in container -ray start --head 
--node-ip-address 127.0.0.1 --num-gpus 4 --disable-usage-stats - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json='{ - "env_vars": { - "no_proxy": "localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}", - "MILES_BACKEND": "fsdp" - } - }' \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 4 \ - --colocate \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${SGLANG_ARGS[@]} diff --git a/tests/test_qwen3-30B-A3B.sh b/tests/test_qwen3-30B-A3B.sh deleted file mode 100644 index 125efc827..000000000 --- a/tests/test_qwen3-30B-A3B.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash - -set -e - -# will prevent ray from buffering stdout/stderr -export PYTHONBUFFERED=16 - -NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) -if [ "$NVLINK_COUNT" -gt 0 ]; then - HAS_NVLINK=1 -else - HAS_NVLINK=0 -fi -echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" - -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -source "${SCRIPT_DIR}/../scripts/models/qwen3-30B-A3B.sh" - -CKPT_ARGS=( - --hf-checkpoint /root/models/Qwen3-30B-A3B-FP8 - --ref-load /root/Qwen3-30B-A3B_torch_dist -) - -ROLLOUT_ARGS=( - --prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl - --input-key prompt - --label-key label - --apply-chat-template - --rollout-shuffle - - --rm-type deepscaler - - --num-rollout 3 - --rollout-batch-size 8 - --n-samples-per-prompt 8 - --rollout-max-response-len 8192 - --rollout-temperature 0.8 - - --global-batch-size 32 - --balance-data -) - -EVAL_ARGS=( - --eval-interval 20 - --eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl - --n-samples-per-eval-prompt 1 - --eval-max-response-len 16384 - --eval-top-k 1 -) - -PERF_ARGS=( - --tensor-model-parallel-size 4 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 2 - --expert-model-parallel-size 8 - --expert-tensor-parallel-size 1 - - --recompute-granularity full - --recompute-method uniform - --recompute-num-layers 1 - - --use-dynamic-batch-size - --max-tokens-per-gpu 16384 -) - -GRPO_ARGS=( - --advantage-estimator gspo - --use-kl-loss - --kl-loss-coef 0.00 - --kl-loss-type low_var_kl - --kl-coef 0.00 - --entropy-coef 0.00 - --eps-clip 4e-4 - - --use-tis - --use-routing-replay -) - -OPTIMIZER_ARGS=( - --optimizer adam - --lr 1e-6 - --lr-decay-style constant - --weight-decay 0.1 - --adam-beta1 0.9 - --adam-beta2 0.98 - - --optimizer-cpu-offload - --overlap-cpu-optimizer-d2h-h2d - --use-precision-aware-optimizer -) - -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 8 - --sglang-mem-fraction-static 0.8 - - --sglang-moe-a2a-backend deepep - --sglang-deepep-mode auto - - --sglang-max-running-requests 512 - --sglang-disable-radix-cache -) - -MISC_ARGS=( - # default dropout in megatron is 0.1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - # should be good for model performance - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - # need to comment this when using model with MLA - --attention-backend flash - - --moe-token-dispatcher-type flex - --moe-enable-deepep -) - -# launch the master node of ray in container -export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export no_proxy="127.0.0.1,${MASTER_ADDR}" -ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats - -export CUDA_HOME=${CUDA_HOME:-"/usr/local/cuda"} -# Build the runtime environment JSON with proper variable substitution 
-RUNTIME_ENV_JSON="{ - \"env_vars\": { - \"PYTHONPATH\": \"/root/Megatron-LM/\", - \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", - \"no_proxy\": \"${no_proxy}\", - \"CUDA_HOME\": \"${CUDA_HOME}\" - } -}" - -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - -- python3 train.py \ - --actor-num-nodes 1 \ - --actor-num-gpus-per-node 8 \ - --colocate \ - ${MODEL_ARGS[@]} \ - ${CKPT_ARGS[@]} \ - ${ROLLOUT_ARGS[@]} \ - ${OPTIMIZER_ARGS[@]} \ - ${GRPO_ARGS[@]} \ - ${DISTRIBUTED_ARGS[@]} \ - ${WANDB_ARGS[@]} \ - ${PERF_ARGS[@]} \ - ${EVAL_ARGS[@]} \ - ${SGLANG_ARGS[@]} \ - ${MISC_ARGS[@]} \ - --ci-test diff --git a/tests/test_qwen3_30B_A3B.py b/tests/test_qwen3_30B_A3B.py new file mode 100644 index 000000000..db4f01d41 --- /dev/null +++ b/tests/test_qwen3_30B_A3B.py @@ -0,0 +1,136 @@ +import os + +import command_utils as U + +MODEL_NAME = "Qwen3-30B-A3B" +MODEL_TYPE = "qwen3-30B-A3B" + + +ENABLE_EVAL = bool(int(os.environ.get("MILES_TEST_ENABLE_EVAL", "1"))) +TIGHT_HOST_MEMORY = bool(int(os.environ.get("MILES_TEST_TIGHT_HOST_MEMORY", "1"))) + + +def prepare(): + U.exec_command("mkdir -p /root/models /root/datasets") + U.exec_command("hf download Qwen/Qwen3-30B-A3B --local-dir /root/models/Qwen3-30B-A3B") + U.exec_command("hf download Qwen/Qwen3-30B-A3B-FP8 --local-dir /root/models/Qwen3-30B-A3B-FP8") + U.exec_command("hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k") + U.exec_command("hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024") + + U.convert_checkpoint(model_name=MODEL_NAME, model_type=MODEL_TYPE) + + +def execute(): + ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME}-FP8 " f"--ref-load /root/{MODEL_NAME}_torch_dist " + + rollout_args = ( + "--prompt-data /root/datasets/dapo-math-17k/dapo-math-17k.jsonl " + "--input-key prompt " + "--label-key label " + "--apply-chat-template " + "--rollout-shuffle " + "--rm-type deepscaler " + "--num-rollout 3 " + "--rollout-batch-size 8 " + "--n-samples-per-prompt 8 " + "--rollout-max-response-len 8192 " + "--rollout-temperature 0.8 " + "--global-batch-size 32 " + "--balance-data " + ) + + eval_args = ( + f"{'--eval-interval 20 ' if ENABLE_EVAL else ''}" + "--eval-prompt-data aime24 /root/datasets/aime-2024/aime-2024.jsonl " + "--n-samples-per-eval-prompt 1 " + "--eval-max-response-len 16384 " + "--eval-top-k 1 " + ) + + perf_args = ( + "--tensor-model-parallel-size 4 " + "--sequence-parallel " + "--pipeline-model-parallel-size 1 " + "--context-parallel-size 2 " + "--expert-model-parallel-size 8 " + "--expert-tensor-parallel-size 1 " + "--recompute-granularity full " + "--recompute-method uniform " + "--recompute-num-layers 1 " + "--use-dynamic-batch-size " + f"--max-tokens-per-gpu {2048 if TIGHT_HOST_MEMORY else 16384} " + ) + + grpo_args = ( + "--advantage-estimator gspo " + f"{'' if TIGHT_HOST_MEMORY else '--use-kl-loss '}" + "--kl-loss-coef 0.00 " + "--kl-loss-type low_var_kl " + "--kl-coef 0.00 " + "--entropy-coef 0.00 " + "--eps-clip 4e-4 " + "--use-tis " + "--use-routing-replay " + ) + + optimizer_args = ( + "--optimizer adam " + "--lr 1e-6 " + "--lr-decay-style constant " + "--weight-decay 0.1 " + "--adam-beta1 0.9 " + "--adam-beta2 0.98 " + "--optimizer-cpu-offload " + "--overlap-cpu-optimizer-d2h-h2d " + "--use-precision-aware-optimizer " + ) + + sglang_args = ( + "--rollout-num-gpus-per-engine 8 " + "--sglang-mem-fraction-static 0.8 " + "--sglang-moe-a2a-backend deepep " 
+ "--sglang-deepep-mode auto " + "--sglang-max-running-requests 512 " + "--sglang-disable-radix-cache " + ) + + misc_args = ( + # default dropout in megatron is 0.1 + "--attention-dropout 0.0 " + "--hidden-dropout 0.0 " + # should be good for model performance + "--accumulate-allreduce-grads-in-fp32 " + "--attention-softmax-in-fp32 " + # need to comment this when using model with MLA + "--attention-backend flash " + "--moe-token-dispatcher-type flex " + "--moe-enable-deepep " + "--ci-test " + "--actor-num-nodes 1 " + "--actor-num-gpus-per-node 8 " + "--colocate " + ) + + train_args = ( + f"{ckpt_args} " + f"{rollout_args} " + f"{optimizer_args} " + f"{grpo_args} " + f"{U.get_default_wandb_args(__file__)} " + f"{perf_args} " + f"{eval_args} " + f"{sglang_args} " + f"{misc_args} " + ) + + U.execute_train( + train_args=train_args, + num_gpus=8, + model_type=MODEL_TYPE, + ) + + +if __name__ == "__main__": + # TODO also use typer + prepare() + execute()
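
For reference, the new workflow runs each matrix entry as a plain Python script inside the `slimerl/slime:latest` container. A minimal sketch of reproducing one entry locally, assuming the same `/root/models` and `/root/datasets` mounts that the CI runner provides (wandb logging is skipped automatically when `WANDB_API_KEY` is unset):

```shell
# Mirrors the "Install" and "Execute" steps of .github/workflows/pr-test.yml,
# run from the repository root inside the training container.
pip install -e .

# Pick one file from the workflow matrix, e.g. the GLM-4 9B quick-start test;
# its prepare() step downloads the model and datasets and converts the checkpoint,
# and execute() starts ray and submits the training job.
python tests/test_quick_start_glm4_9B.py
```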