Merged
105 commits
c4cd0b6
more
fzyzcjy Oct 9, 2025
6cb02ea
more
fzyzcjy Oct 9, 2025
1ab13dc
temp rm old
fzyzcjy Oct 9, 2025
5855b02
more
fzyzcjy Oct 9, 2025
eb09a90
more
fzyzcjy Oct 9, 2025
b0e3b6e
more
fzyzcjy Oct 9, 2025
010a0b1
more
fzyzcjy Oct 9, 2025
8c00ffa
more
fzyzcjy Oct 9, 2025
9d15f75
more
fzyzcjy Oct 9, 2025
00d1295
more
fzyzcjy Oct 9, 2025
0f5637b
more
fzyzcjy Oct 9, 2025
72821a5
more
fzyzcjy Oct 9, 2025
a806cbb
more
fzyzcjy Oct 9, 2025
0fa6c51
more
fzyzcjy Oct 9, 2025
21e6704
more
fzyzcjy Oct 9, 2025
65674f1
more
fzyzcjy Oct 9, 2025
2786a8d
more
fzyzcjy Oct 9, 2025
28fe6bb
more
fzyzcjy Oct 9, 2025
7a949c5
more
fzyzcjy Oct 9, 2025
2039b35
more
fzyzcjy Oct 9, 2025
cc7f9fb
fmt
fzyzcjy Oct 9, 2025
bf2358b
more
fzyzcjy Oct 9, 2025
13c3883
more
fzyzcjy Oct 9, 2025
442a242
more
fzyzcjy Oct 9, 2025
e50f08a
more
fzyzcjy Oct 9, 2025
3594b10
more
fzyzcjy Oct 9, 2025
ed251c7
more
fzyzcjy Oct 9, 2025
5f3958b
more
fzyzcjy Oct 9, 2025
3819d46
more
fzyzcjy Oct 9, 2025
458f3bd
more
fzyzcjy Oct 9, 2025
43f6c15
more
fzyzcjy Oct 9, 2025
9dfe983
more
fzyzcjy Oct 9, 2025
91e1efa
more
fzyzcjy Oct 9, 2025
8c7ae9d
more
fzyzcjy Oct 9, 2025
412860e
more
fzyzcjy Oct 9, 2025
2c69b1d
more
fzyzcjy Oct 9, 2025
bfbd362
more
fzyzcjy Oct 9, 2025
6b6c2e8
more
fzyzcjy Oct 9, 2025
fe1d634
more
fzyzcjy Oct 9, 2025
ec158eb
more
fzyzcjy Oct 9, 2025
c2843e4
more
fzyzcjy Oct 9, 2025
b187134
Revert "more"
fzyzcjy Oct 9, 2025
92d8a4d
more
fzyzcjy Oct 9, 2025
193eb92
more
fzyzcjy Oct 9, 2025
1eb82d8
more
fzyzcjy Oct 9, 2025
605d517
more
fzyzcjy Oct 9, 2025
305d126
more
fzyzcjy Oct 9, 2025
907390e
more
fzyzcjy Oct 9, 2025
ac0b62f
more
fzyzcjy Oct 9, 2025
731e859
more
fzyzcjy Oct 9, 2025
0d84d15
more
fzyzcjy Oct 9, 2025
579c49e
more
fzyzcjy Oct 9, 2025
c99e4f5
more
fzyzcjy Oct 9, 2025
a957135
more
fzyzcjy Oct 9, 2025
93e2131
more
fzyzcjy Oct 9, 2025
ed3a7f0
more
fzyzcjy Oct 9, 2025
2892f20
more
fzyzcjy Oct 9, 2025
1a1964b
more
fzyzcjy Oct 9, 2025
5a3210d
more
fzyzcjy Oct 9, 2025
5828ce1
more
fzyzcjy Oct 9, 2025
f4f5744
more
fzyzcjy Oct 9, 2025
03468a3
more
fzyzcjy Oct 9, 2025
df1e721
more
fzyzcjy Oct 9, 2025
0478ee1
more
fzyzcjy Oct 9, 2025
3ef93fe
more
fzyzcjy Oct 9, 2025
340bde7
more
fzyzcjy Oct 9, 2025
4db96ec
more
fzyzcjy Oct 9, 2025
f04ca6c
more
fzyzcjy Oct 9, 2025
c82973c
more
fzyzcjy Oct 9, 2025
04d6872
more
fzyzcjy Oct 9, 2025
0a1b8af
fmt
fzyzcjy Oct 9, 2025
2467f25
more
fzyzcjy Oct 9, 2025
2f48468
more
fzyzcjy Oct 9, 2025
f8e1896
more
fzyzcjy Oct 9, 2025
2264a71
more
fzyzcjy Oct 9, 2025
c66eadd
more
fzyzcjy Oct 9, 2025
679b570
more
fzyzcjy Oct 9, 2025
fb1352a
Revert "more"
fzyzcjy Oct 9, 2025
e3790f2
more
fzyzcjy Oct 9, 2025
8c85ddd
more
fzyzcjy Oct 9, 2025
48cde22
more
fzyzcjy Oct 9, 2025
c0a92d0
more
fzyzcjy Oct 9, 2025
065690b
more
fzyzcjy Oct 9, 2025
e9c761a
typo
fzyzcjy Oct 9, 2025
79a632e
more
fzyzcjy Oct 9, 2025
678ea84
more
fzyzcjy Oct 9, 2025
da6ad85
more
fzyzcjy Oct 9, 2025
db79015
more
fzyzcjy Oct 9, 2025
2f1234c
more
fzyzcjy Oct 9, 2025
3a35c50
more
fzyzcjy Oct 9, 2025
2f7ccd8
more
fzyzcjy Oct 9, 2025
ddcfc21
more
fzyzcjy Oct 9, 2025
62c8db2
more
fzyzcjy Oct 9, 2025
0b8c86e
more
fzyzcjy Oct 9, 2025
e0a9ec3
more
fzyzcjy Oct 9, 2025
535e158
fmt
fzyzcjy Oct 9, 2025
962c5f1
more
fzyzcjy Oct 9, 2025
d97317e
more
fzyzcjy Oct 9, 2025
8a61d34
more
fzyzcjy Oct 9, 2025
1999b75
more
fzyzcjy Oct 9, 2025
42f0543
more
fzyzcjy Oct 9, 2025
99bd779
more
fzyzcjy Oct 9, 2025
08be50c
more
fzyzcjy Oct 9, 2025
c57f4c8
disable use_kl_loss
fzyzcjy Oct 9, 2025
7ed3dd7
more
fzyzcjy Oct 9, 2025
56 changes: 56 additions & 0 deletions .github/workflows/pr-test.yml
@@ -0,0 +1,56 @@
name: PR Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    types: [synchronize, labeled]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    # TODO may use run-ci label etc
    if: github.event.pull_request.draft == false
    runs-on: self-hosted
    container:
      image: slimerl/slime:latest
      options: >
        --gpus all
        --ipc=host
        --shm-size=16g
        --ulimit memlock=-1
        --ulimit stack=67108864
        --memory=0
        --memory-swap=0
        -v /data/miles_ci:/data/miles_ci
        -v /data/miles_ci/models:/root/models
        -v /data/miles_ci/datasets:/root/datasets
    strategy:
      fail-fast: false
      matrix:
        info:
          - {test_file: test_quick_start_glm4_9B.py}
          - {test_file: test_qwen3_30B_A3B.py}
          # TODO use deterministic kernel
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    env:
      GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install
        shell: bash
        run: cd $GITHUB_WORKSPACE && pip install -e .

      - name: Execute
        shell: bash
        run: python tests/${{ matrix.info.test_file }}
4 changes: 2 additions & 2 deletions miles/backends/megatron_utils/model.py
@@ -488,9 +488,9 @@ def train(rollout_id, model, optimizer, opt_param_scheduler, data_iterator, num_

if args.ci_test:
    if step_id == 0 and "train/ppo_kl" in log_dict and "train/pg_clipfrac" in log_dict:
-       assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0
+       assert log_dict["train/ppo_kl"] == 0.0 and log_dict["train/pg_clipfrac"] == 0.0, f"{loss_dict=}"
    if accumulated_step_id == 0 and "train/kl_loss" in log_dict:
-       assert log_dict["train/kl_loss"] == 0.0
+       assert log_dict["train/kl_loss"] == 0.0, f"{loss_dict=}"

print(f"{role_tag}step {accumulated_step_id}: {log_dict}")
# Close out pre-hooks if using distributed optimizer and overlapped param gather.
47 changes: 47 additions & 0 deletions tests/ci/README.md
@@ -0,0 +1,47 @@
# CI Documentation

## Configure GitHub secrets

https://github.com/radixark/miles/settings/secrets/actions

* `WANDB_API_KEY`: get from https://wandb.ai/authorize
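
For example, the secret can also be set from a terminal with the GitHub CLI (a minimal sketch; it assumes `gh` is installed, authenticated, and that you have admin access to the repository):

```shell
# Read the key from the local environment so the value never appears in shell history or logs.
gh secret set WANDB_API_KEY --repo radixark/miles --body "$WANDB_API_KEY"
```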

## Set up new GitHub runners

### Step 1: Env

Write a `.env` file modeled on `.env.example`.
The registration token can be found at https://github.com/radixark/miles/settings/actions/runners/new?arch=x64&os=linux.

WARN: The `GITHUB_RUNNER_TOKEN` expires after a while, so generate a fresh one when registering a new runner.
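
As a concrete sketch (assuming you start from the repository root):

```shell
cd tests/ci/github_runner
cp .env.example .env
# Paste a fresh registration token from the "new runner" page linked above
# into GITHUB_RUNNER_TOKEN before starting the runner.
```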

### Step 2: Prepare `/home/runner/externals`

```shell
docker run --rm -it --privileged --pid=host -v /:/host_root ubuntu /bin/bash -c 'rm -rf /host_root/home/runner/externals && mkdir -p /host_root/home/runner/externals && chmod -R 777 /host_root/home/runner/externals'
docker run -d --name temp-runner ghcr.io/actions/actions-runner:2.328.0 tail -f /dev/null
docker cp temp-runner:/home/runner/externals/. /home/runner/externals
docker rm -f temp-runner
ls -alh /home/runner/externals
```

### Step 3: Run

```shell
cd /data/tom/primary_synced/miles/tests/ci/github_runner
docker compose up -d
```

### Debugging

Logs

```shell
docker compose logs -f
```

Exec

```shell
docker exec -it github_runner-runner-1 /bin/bash
```
2 changes: 2 additions & 0 deletions tests/ci/github_runner/.env.example
@@ -0,0 +1,2 @@
GITHUB_RUNNER_URL=https://github.com/radixark/miles
GITHUB_RUNNER_TOKEN=paste-your-token-here
1 change: 1 addition & 0 deletions tests/ci/github_runner/.gitignore
@@ -0,0 +1 @@
.env
29 changes: 29 additions & 0 deletions tests/ci/github_runner/docker-compose.yml
@@ -0,0 +1,29 @@
# Please refer to `README.md` for how to setup this GitHub action runner
version: "3.9"

services:
  runner:
    image: ghcr.io/actions/actions-runner:2.328.0
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - /data/miles_ci:/data/miles_ci
      # it requires this folder
      - /home/runner/externals:/home/runner/externals
    deploy:
      # TODO 4 runner + lock gpu when running
      replicas: 1
    restart: always
    environment:
      RUNNER_ALLOW_RUNASROOT: "1"
    privileged: true
    user: root
    # ref: https://github.com/actions/runner/issues/367#issuecomment-2007558723
    # ref: https://github.com/actions/runner
    # args ref: https://github.com/actions/runner/blob/68ff57dbc4c836d50f46602a8a53301fb9513eb4/src/Runner.Listener/CommandSettings.cs#L53
    # TODO seems we should not run config.sh repeatedly
    entrypoint: >
      sh -c "
      cd /data/miles_ci &&
      /home/runner/config.sh --url ${GITHUB_RUNNER_URL} --token ${GITHUB_RUNNER_TOKEN} --unattended --work /data/miles_ci/runner_$(hostname) &&
      /home/runner/run.sh
      "
108 changes: 108 additions & 0 deletions tests/command_utils.py
@@ -0,0 +1,108 @@
import datetime
import json
import os
import random
import subprocess
from pathlib import Path

repo_base_dir = Path(os.path.abspath(__file__)).resolve().parents[1]


def convert_checkpoint(model_name, model_type):
    # TODO shall we make it in host-mapped folder and thus can cache it to speedup CI
    path_dst = f"/root/{model_name}_torch_dist"
    if Path(path_dst).exists():
        print(f"convert_checkpoint skip {path_dst} since exists")
        return

    exec_command(
        f"source {repo_base_dir}/scripts/models/{model_type}.sh && "
        "PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py "
        "${MODEL_ARGS[@]} "
        f"--hf-checkpoint /root/models/{model_name} "
        f"--save {path_dst}"
    )


def execute_train(
    train_args: str,
    num_gpus: int,
    model_type: str,
    master_addr: str = "127.0.0.1",
):
    exec_command(
        "pkill -9 sglang; "
        "sleep 3; "
        "ray stop --force; "
        "pkill -9 ray; "
        # cannot be run in CI, o/w kill the parent script
        # TODO: do we really need this kill? (or can we instead kill miles)
        # "pkill -9 python; "
        "pkill -9 miles; "
        "sleep 3; "
        "pkill -9 ray; "
        # "pkill -9 python; "
        "pkill -9 miles; "
        "pkill -9 redis; "
        "true; "
    )

    exec_command(
        # will prevent ray from buffering stdout/stderr
        f"export PYTHONBUFFERED=16 && "
        f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus} --disable-usage-stats"
    )

    runtime_env_json = json.dumps(
        {
            "env_vars": {
                "PYTHONPATH": "/root/Megatron-LM/",
                "CUDA_DEVICE_MAX_CONNECTIONS": "1",
                "NCCL_NVLS_ENABLE": str(int(check_has_nvlink())),
                "no_proxy": f"127.0.0.1,{master_addr}",
            }
        }
    )

    exec_command(
        f"export PYTHONBUFFERED=16 && "
        f'source "{repo_base_dir}/scripts/models/{model_type}.sh" && '
        # TODO should this 127.0.0.1 be `master_addr` instead
        f'ray job submit --address="http://127.0.0.1:8265" '
        f"--runtime-env-json='{runtime_env_json}' "
        "-- python3 train.py "
        "${MODEL_ARGS[@]} "
        f"{train_args}"
    )


def check_has_nvlink():
    output = exec_command("nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l", capture_output=True)
    return int(output) > 0


def get_default_wandb_args(test_file: str):
    if not os.environ.get("WANDB_API_KEY"):
        print("Skip wandb configuration since WANDB_API_KEY is not found")
        return ""

    test_name = Path(test_file).stem

    run_name = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{random.randint(0, 1000000000)}"
    if (x := os.environ.get("GITHUB_COMMIT_NAME")) is not None:
        run_name += f"_{x}"

    # do not put wandb_api_key value here to avoid leaking to logs explicitly
    return (
        "--use-wandb "
        f"--wandb-project miles-ci-{test_name} "
        f"--wandb-group {run_name} "
        f"--wandb-key ${{WANDB_API_KEY}} "
    )


def exec_command(cmd: str, capture_output: bool = False):
    print(f"EXEC: {cmd}", flush=True)
    result = subprocess.run(["bash", "-c", cmd], shell=False, check=True, capture_output=capture_output)
    if capture_output:
        return result.stdout
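
For readers skimming the diff, the shell below is a rough sketch of the command sequence that `execute_train` assembles (cleanup, Ray startup, job submission). The model type, GPU count, and training arguments are placeholders rather than values from this PR:

```shell
# Best-effort cleanup of any previous run (the real code also kills sglang/miles/redis processes).
ray stop --force
# Start a local Ray head node.
ray start --head --node-ip-address 127.0.0.1 --num-gpus 8 --disable-usage-stats
# Load MODEL_ARGS for a placeholder model type, then submit the training job.
# (The real runtime env also sets NCCL_NVLS_ENABLE and no_proxy.)
source scripts/models/<MODEL_TYPE>.sh
ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json='{"env_vars": {"PYTHONPATH": "/root/Megatron-LM/", "CUDA_DEVICE_MAX_CONNECTIONS": "1"}}' \
    -- python3 train.py "${MODEL_ARGS[@]}" <TRAIN_ARGS>
```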
135 changes: 0 additions & 135 deletions tests/test-qwen2.5-0.5B-gsm8k-async.sh

This file was deleted.
