Merged
105 commits
c4cd0b6
more
fzyzcjy Oct 9, 2025
6cb02ea
more
fzyzcjy Oct 9, 2025
1ab13dc
temp rm old
fzyzcjy Oct 9, 2025
5855b02
more
fzyzcjy Oct 9, 2025
eb09a90
more
fzyzcjy Oct 9, 2025
b0e3b6e
more
fzyzcjy Oct 9, 2025
010a0b1
more
fzyzcjy Oct 9, 2025
8c00ffa
more
fzyzcjy Oct 9, 2025
9d15f75
more
fzyzcjy Oct 9, 2025
00d1295
more
fzyzcjy Oct 9, 2025
0f5637b
more
fzyzcjy Oct 9, 2025
72821a5
more
fzyzcjy Oct 9, 2025
a806cbb
more
fzyzcjy Oct 9, 2025
0fa6c51
more
fzyzcjy Oct 9, 2025
21e6704
more
fzyzcjy Oct 9, 2025
65674f1
more
fzyzcjy Oct 9, 2025
2786a8d
more
fzyzcjy Oct 9, 2025
28fe6bb
more
fzyzcjy Oct 9, 2025
7a949c5
more
fzyzcjy Oct 9, 2025
2039b35
more
fzyzcjy Oct 9, 2025
cc7f9fb
fmt
fzyzcjy Oct 9, 2025
bf2358b
more
fzyzcjy Oct 9, 2025
13c3883
more
fzyzcjy Oct 9, 2025
442a242
more
fzyzcjy Oct 9, 2025
e50f08a
more
fzyzcjy Oct 9, 2025
3594b10
more
fzyzcjy Oct 9, 2025
ed251c7
more
fzyzcjy Oct 9, 2025
5f3958b
more
fzyzcjy Oct 9, 2025
3819d46
more
fzyzcjy Oct 9, 2025
458f3bd
more
fzyzcjy Oct 9, 2025
43f6c15
more
fzyzcjy Oct 9, 2025
9dfe983
more
fzyzcjy Oct 9, 2025
91e1efa
more
fzyzcjy Oct 9, 2025
8c7ae9d
more
fzyzcjy Oct 9, 2025
412860e
more
fzyzcjy Oct 9, 2025
2c69b1d
more
fzyzcjy Oct 9, 2025
bfbd362
more
fzyzcjy Oct 9, 2025
6b6c2e8
more
fzyzcjy Oct 9, 2025
fe1d634
more
fzyzcjy Oct 9, 2025
ec158eb
more
fzyzcjy Oct 9, 2025
c2843e4
more
fzyzcjy Oct 9, 2025
b187134
Revert "more"
fzyzcjy Oct 9, 2025
92d8a4d
more
fzyzcjy Oct 9, 2025
193eb92
more
fzyzcjy Oct 9, 2025
1eb82d8
more
fzyzcjy Oct 9, 2025
605d517
more
fzyzcjy Oct 9, 2025
305d126
more
fzyzcjy Oct 9, 2025
907390e
more
fzyzcjy Oct 9, 2025
ac0b62f
more
fzyzcjy Oct 9, 2025
731e859
more
fzyzcjy Oct 9, 2025
0d84d15
more
fzyzcjy Oct 9, 2025
579c49e
more
fzyzcjy Oct 9, 2025
c99e4f5
more
fzyzcjy Oct 9, 2025
a957135
more
fzyzcjy Oct 9, 2025
93e2131
more
fzyzcjy Oct 9, 2025
ed3a7f0
more
fzyzcjy Oct 9, 2025
2892f20
more
fzyzcjy Oct 9, 2025
1a1964b
more
fzyzcjy Oct 9, 2025
5a3210d
more
fzyzcjy Oct 9, 2025
5828ce1
more
fzyzcjy Oct 9, 2025
f4f5744
more
fzyzcjy Oct 9, 2025
03468a3
more
fzyzcjy Oct 9, 2025
df1e721
more
fzyzcjy Oct 9, 2025
0478ee1
more
fzyzcjy Oct 9, 2025
3ef93fe
more
fzyzcjy Oct 9, 2025
340bde7
more
fzyzcjy Oct 9, 2025
4db96ec
more
fzyzcjy Oct 9, 2025
f04ca6c
more
fzyzcjy Oct 9, 2025
c82973c
more
fzyzcjy Oct 9, 2025
04d6872
more
fzyzcjy Oct 9, 2025
0a1b8af
fmt
fzyzcjy Oct 9, 2025
2467f25
more
fzyzcjy Oct 9, 2025
2f48468
more
fzyzcjy Oct 9, 2025
f8e1896
more
fzyzcjy Oct 9, 2025
2264a71
more
fzyzcjy Oct 9, 2025
c66eadd
more
fzyzcjy Oct 9, 2025
679b570
more
fzyzcjy Oct 9, 2025
fb1352a
Revert "more"
fzyzcjy Oct 9, 2025
e3790f2
more
fzyzcjy Oct 9, 2025
8c85ddd
more
fzyzcjy Oct 9, 2025
48cde22
more
fzyzcjy Oct 9, 2025
c0a92d0
more
fzyzcjy Oct 9, 2025
065690b
more
fzyzcjy Oct 9, 2025
e9c761a
typo
fzyzcjy Oct 9, 2025
79a632e
more
fzyzcjy Oct 9, 2025
678ea84
more
fzyzcjy Oct 9, 2025
da6ad85
more
fzyzcjy Oct 9, 2025
db79015
more
fzyzcjy Oct 9, 2025
2f1234c
more
fzyzcjy Oct 9, 2025
3a35c50
more
fzyzcjy Oct 9, 2025
2f7ccd8
more
fzyzcjy Oct 9, 2025
ddcfc21
more
fzyzcjy Oct 9, 2025
62c8db2
more
fzyzcjy Oct 9, 2025
0b8c86e
more
fzyzcjy Oct 9, 2025
e0a9ec3
more
fzyzcjy Oct 9, 2025
535e158
fmt
fzyzcjy Oct 9, 2025
962c5f1
more
fzyzcjy Oct 9, 2025
d97317e
more
fzyzcjy Oct 9, 2025
8a61d34
more
fzyzcjy Oct 9, 2025
1999b75
more
fzyzcjy Oct 9, 2025
42f0543
more
fzyzcjy Oct 9, 2025
99bd779
more
fzyzcjy Oct 9, 2025
08be50c
more
fzyzcjy Oct 9, 2025
c57f4c8
disable use_kl_loss
fzyzcjy Oct 9, 2025
7ed3dd7
more
fzyzcjy Oct 9, 2025
56 changes: 56 additions & 0 deletions .github/workflows/pr-test.yml
@@ -0,0 +1,56 @@
name: PR Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    types: [synchronize, labeled]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    # TODO may use run-ci label etc
    if: github.event.pull_request.draft == false
    runs-on: self-hosted
    container:
      image: slimerl/slime:latest
      options: >
        --gpus all
        --ipc=host
        --shm-size=16g
        --ulimit memlock=-1
        --ulimit stack=67108864
        --memory=0
        --memory-swap=0
        -v /data/miles_ci:/data/miles_ci
        -v /data/miles_ci/models:/root/models
        -v /data/miles_ci/datasets:/root/datasets
    strategy:
      fail-fast: false
      matrix:
        info:
          - {test_file: test_quick_start_glm4_9B.py}
          - {test_file: test_qwen3_30B_A3B.py}
          # TODO use deterministic kernel
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    env:
      GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install
        shell: bash
        run: cd $GITHUB_WORKSPACE && pip install -e .

      - name: Execute
        shell: bash
        run: python tests/${{ matrix.info.test_file }}
4 changes: 2 additions & 2 deletions miles/backends/megatron_utils/model.py
@@ -488,9 +488,9 @@ def train(rollout_id, model, optimizer, opt_param_scheduler, data_iterator, num_

if args.ci_test:
    if step_id == 0 and "train/ppo_kl" in log_dict and "train/pg_clipfrac" in log_dict:
-       assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0
+       assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0, f"{loss_dict=}"
    if accumulated_step_id == 0 and "train/kl_loss" in log_dict:
-       assert log_dict["train/kl_loss"] == 0.0
+       assert log_dict["train/kl_loss"] == 0.0, f"{loss_dict=}"

print(f"{role_tag}step {accumulated_step_id}: {log_dict}")
# Close out pre-hooks if using distributed optimizer and overlapped param gather.
47 changes: 47 additions & 0 deletions tests/ci/README.md
@@ -0,0 +1,47 @@
# CI Documentation

## Configure GitHub secrets

https://github.com/radixark/miles/settings/secrets/actions

* `WANDB_API_KEY`: get from https://wandb.ai/authorize
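
For example, the secret can also be set from a terminal with the GitHub CLI (a minimal sketch; it assumes `gh` is installed, authenticated, and that you have admin access to the repository):

```shell
# Read the key from the local environment so the value never appears in shell history or logs.
gh secret set WANDB_API_KEY --repo radixark/miles --body "$WANDB_API_KEY"
```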

## Set up new GitHub runners

### Step 1: Env

Write a `.env` file modeled on `.env.example`.
The registration token can be found at https://github.com/radixark/miles/settings/actions/runners/new?arch=x64&os=linux.

WARN: The `GITHUB_RUNNER_TOKEN` expires after a while, so generate a fresh one when registering a new runner.
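
As a concrete sketch (assuming you start from the repository root):

```shell
cd tests/ci/github_runner
cp .env.example .env
# Paste a fresh registration token from the "new runner" page linked above
# into GITHUB_RUNNER_TOKEN before starting the runner.
```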

### Step 2: Prepare `/home/runner/externals`

```shell
docker run --rm -it --privileged --pid=host -v /:/host_root ubuntu /bin/bash -c 'rm -rf /host_root/home/runner/externals && mkdir -p /host_root/home/runner/externals && chmod -R 777 /host_root/home/runner/externals'
docker run -d --name temp-runner ghcr.io/actions/actions-runner:2.328.0 tail -f /dev/null
docker cp temp-runner:/home/runner/externals/. /home/runner/externals
docker rm -f temp-runner
ls -alh /home/runner/externals
```

### Step 3: Run

```shell
cd /data/tom/primary_synced/miles/tests/ci/github_runner
docker compose up -d
```

### Debugging

Logs

```shell
docker compose logs -f
```

Exec

```shell
docker exec -it github_runner-runner-1 /bin/bash
```
2 changes: 2 additions & 0 deletions tests/ci/github_runner/.env.example
@@ -0,0 +1,2 @@
GITHUB_RUNNER_URL=https://github.com/radixark/miles
GITHUB_RUNNER_TOKEN=paste-your-token-here
1 change: 1 addition & 0 deletions tests/ci/github_runner/.gitignore
@@ -0,0 +1 @@
.env
29 changes: 29 additions & 0 deletions tests/ci/github_runner/docker-compose.yml
@@ -0,0 +1,29 @@
# Please refer to `README.md` for how to setup this GitHub action runner
version: "3.9"

services:
  runner:
    image: ghcr.io/actions/actions-runner:2.328.0
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - /data/miles_ci:/data/miles_ci
      # it requires this folder
      - /home/runner/externals:/home/runner/externals
    deploy:
      # TODO 4 runner + lock gpu when running
      replicas: 1
    restart: always
    environment:
      RUNNER_ALLOW_RUNASROOT: "1"
    privileged: true
    user: root
    # ref: https://github.com/actions/runner/issues/367#issuecomment-2007558723
    # ref: https://github.com/actions/runner
    # args ref: https://github.com/actions/runner/blob/68ff57dbc4c836d50f46602a8a53301fb9513eb4/src/Runner.Listener/CommandSettings.cs#L53
    # TODO seems we should not run config.sh repeatedly
    entrypoint: >
      sh -c "
      cd /data/miles_ci &&
      /home/runner/config.sh --url ${GITHUB_RUNNER_URL} --token ${GITHUB_RUNNER_TOKEN} --unattended --work /data/miles_ci/runner_$(hostname) &&
      /home/runner/run.sh
      "
108 changes: 108 additions & 0 deletions tests/command_utils.py
@@ -0,0 +1,108 @@
import datetime
import json
import os
import random
import subprocess
from pathlib import Path

repo_base_dir = Path(os.path.abspath(__file__)).resolve().parents[1]


def convert_checkpoint(model_name, model_type):
    # TODO shall we make it in host-mapped folder and thus can cache it to speedup CI
    path_dst = f"/root/{model_name}_torch_dist"
    if Path(path_dst).exists():
        print(f"convert_checkpoint skip {path_dst} since exists")
        return

    exec_command(
        f"source {repo_base_dir}/scripts/models/{model_type}.sh && "
        "PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py "
        "${MODEL_ARGS[@]} "
        f"--hf-checkpoint /root/models/{model_name} "
        f"--save {path_dst}"
    )


def execute_train(
    train_args: str,
    num_gpus: int,
    model_type: str,
    master_addr: str = "127.0.0.1",
):
    exec_command(
        "pkill -9 sglang; "
        "sleep 3; "
        "ray stop --force; "
        "pkill -9 ray; "
        # cannot be run in CI, o/w kill the parent script
        # TODO: do we really need this kill? (or can we instead kill miles)
        # "pkill -9 python; "
        "pkill -9 miles; "
        "sleep 3; "
        "pkill -9 ray; "
        # "pkill -9 python; "
        "pkill -9 miles; "
        "pkill -9 redis; "
        "true; "
    )

    exec_command(
        # will prevent ray from buffering stdout/stderr
        f"export PYTHONBUFFERED=16 && "
        f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus} --disable-usage-stats"
    )

    runtime_env_json = json.dumps(
        {
            "env_vars": {
                "PYTHONPATH": "/root/Megatron-LM/",
                "CUDA_DEVICE_MAX_CONNECTIONS": "1",
                "NCCL_NVLS_ENABLE": str(int(check_has_nvlink())),
                "no_proxy": f"127.0.0.1,{master_addr}",
            }
        }
    )

    exec_command(
        f"export PYTHONBUFFERED=16 && "
        f'source "{repo_base_dir}/scripts/models/{model_type}.sh" && '
        # TODO should this 127.0.0.1 be `master_addr` instead
        f'ray job submit --address="http://127.0.0.1:8265" '
        f"--runtime-env-json='{runtime_env_json}' "
        "-- python3 train.py "
        "${MODEL_ARGS[@]} "
        f"{train_args}"
    )


def check_has_nvlink():
    output = exec_command("nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l", capture_output=True)
    return int(output) > 0


def get_default_wandb_args(test_file: str):
    if not os.environ.get("WANDB_API_KEY"):
        print("Skip wandb configuration since WANDB_API_KEY is not found")
        return ""

    test_name = Path(test_file).stem

    run_name = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{random.randint(0, 1000000000)}"
    if (x := os.environ.get("GITHUB_COMMIT_NAME")) is not None:
        run_name += f"_{x}"

    # do not put wandb_api_key value here to avoid leaking to logs explicitly
    return (
        "--use-wandb "
        f"--wandb-project miles-ci-{test_name} "
        f"--wandb-group {run_name} "
        f"--wandb-key ${{WANDB_API_KEY}} "
    )


def exec_command(cmd: str, capture_output: bool = False):
    print(f"EXEC: {cmd}", flush=True)
    result = subprocess.run(["bash", "-c", cmd], shell=False, check=True, capture_output=capture_output)
    if capture_output:
        return result.stdout
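
For readers skimming the diff, the shell below is a rough sketch of the command sequence that `execute_train` assembles (cleanup, Ray startup, job submission). The model type, GPU count, and training arguments are placeholders rather than values from this PR:

```shell
# Best-effort cleanup of any previous run (the real code also kills sglang/miles/redis processes).
ray stop --force
# Start a local Ray head node.
ray start --head --node-ip-address 127.0.0.1 --num-gpus 8 --disable-usage-stats
# Load MODEL_ARGS for a placeholder model type, then submit the training job.
# (The real runtime env also sets NCCL_NVLS_ENABLE and no_proxy.)
source scripts/models/<MODEL_TYPE>.sh
ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json='{"env_vars": {"PYTHONPATH": "/root/Megatron-LM/", "CUDA_DEVICE_MAX_CONNECTIONS": "1"}}' \
    -- python3 train.py "${MODEL_ARGS[@]}" <TRAIN_ARGS>
```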
135 changes: 0 additions & 135 deletions tests/test-qwen2.5-0.5B-gsm8k-async.sh

This file was deleted.
