diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 44e7604a..aa6f1152 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -12,7 +12,7 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver | Workflow | Trigger | Description | | -------------------------------------------------- | ---------------------------------------- | ----------------------------------------------------- | | [ci-checks.yml](ci-checks.yml) | Push to `main`, PRs, manual | Format, typecheck, unit tests, and CPU smoke tests | -| [gpu-tests.yml](gpu-tests.yml) | Push to `main`/`pull-request/*`, manual | GPU smoke tests (required) and E2E tests (A100) | +| [gpu-tests.yml](gpu-tests.yml) | Nightly schedule, push to `pull-request/*`, manual | GPU smoke tests (required) and E2E tests | | [conventional-commit.yml](conventional-commit.yml) | PRs | Validates PR titles follow conventional commit format | | [docs.yml](docs.yml) | Push to `main` (docs paths) | Builds and deploys documentation to GitHub Pages | | [release.yml](release.yml) | Manual dispatch | Builds and publishes package to Test PyPI or PyPI (production) | @@ -133,10 +133,10 @@ All jobs run on `ubuntu-latest` (GitHub-hosted). ## GPU Tests Workflow -The `gpu-tests.yml` workflow runs on pushes to `main` and `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`: +The `gpu-tests.yml` workflow runs nightly on a schedule and on pushes to `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`: -- GPU Smoke Tests: Quick smoke tests on `linux-amd64-gpu-a100-latest-1` (A100) with a 30-minute job timeout and 20-minute step timeout. Required for merge. -- GPU E2E Tests: End-to-end tests on `linux-amd64-gpu-a100-latest-1` (A100) with a 55-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge. 
+- GPU Smoke Tests: Quick smoke tests on a GPU runner with a 30-minute job timeout and 20-minute step timeout. Required for merge. +- GPU E2E Tests: End-to-end tests on a GPU runner with a 55-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge. - GPU CI Status: Aggregation job -- single required check for branch protection. Fails if smoke tests fail; warns if E2E tests fail. The `changes` (Detect Changes) job always runs, including on `workflow_dispatch`. `dorny/paths-filter` outputs `true` for all filters when there is no base commit to diff against, so downstream jobs always run on a manual dispatch. The job must not be conditionally skipped: a skipped `needs` dependency causes downstream jobs to be skipped even when their own `if` condition would pass. @@ -154,8 +154,8 @@ To trigger from the PR UI and get a status check result, use `/sync` -- see [On- | Workflow | Job | Runner Label | Type | | --- | --- | --- | --- | | CI Checks | All jobs | `ubuntu-latest` | GitHub-hosted | -| GPU Tests | GPU Smoke Tests | `linux-amd64-gpu-a100-latest-1` | NVIDIA self-hosted GPU (A100) | -| GPU Tests | GPU E2E Tests | `linux-amd64-gpu-a100-latest-1` | NVIDIA self-hosted GPU (A100) | +| GPU Tests | GPU Smoke Tests | `nemo-ci-aws-gpu-x2` | NVIDIA self-hosted GPU | +| GPU Tests | GPU E2E Tests | `nemo-ci-aws-gpu-x2` | NVIDIA self-hosted GPU | | GPU Tests | Detect Changes, GPU CI Status | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) | | Dev Wheel | All jobs | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) | | Internal Release | All jobs | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) | diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 7a0d580a..99e959c6 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -22,9 +22,10 @@ name: GPU Tests on: + schedule: + - cron: '0 2 * * *' push: branches: - - main - "pull-request/[0-9]+" 
workflow_dispatch: @@ -60,7 +61,7 @@ jobs: needs: changes if: ${{ needs.changes.outputs.src == 'true' || needs.changes.outputs.test == 'true' || github.event_name == 'workflow_dispatch' }} timeout-minutes: 30 - runs-on: linux-amd64-gpu-a100-latest-1 + runs-on: nemo-ci-aws-gpu-x2 strategy: fail-fast: false matrix: @@ -71,6 +72,9 @@ jobs: with: fetch-depth: 0 + - name: Install make + run: apt-get update && apt-get install -y --no-install-recommends make + - name: Setup Python environment uses: ./.github/actions/setup-python-env with: @@ -80,6 +84,10 @@ jobs: - name: Bootstrap CUDA environment run: make bootstrap-nss cu128 + - name: Check GPU availability + run: | + uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())" + - name: Run GPU smoke tests timeout-minutes: 20 run: make test-smoke-gpu @@ -89,13 +97,16 @@ jobs: needs: changes if: ${{ needs.changes.outputs.src == 'true' || needs.changes.outputs.test == 'true' || github.event_name == 'workflow_dispatch' }} timeout-minutes: 55 - runs-on: linux-amd64-gpu-a100-latest-1 + runs-on: nemo-ci-aws-gpu-x2 steps: - name: checkout uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Install make + run: apt-get update && apt-get install -y --no-install-recommends make + - name: Setup Python environment uses: ./.github/actions/setup-python-env with: @@ -105,6 +116,10 @@ jobs: - name: Bootstrap CUDA environment run: make bootstrap-nss cu128 + - name: Check GPU availability + run: | + uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())" + - name: Run GPU E2E tests timeout-minutes: 45 run: make test-e2e