NVIDIA-NeMo · mckornfield · Apr 3, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
@@ -12,7 +12,7 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
 | Workflow                                           | Trigger                                  | Description                                           |
 | -------------------------------------------------- | ---------------------------------------- | ----------------------------------------------------- |
 | [ci-checks.yml](ci-checks.yml)                     | Push to `main`, PRs, manual              | Format, typecheck, unit tests, and CPU smoke tests    |
-| [gpu-tests.yml](gpu-tests.yml)                     | Push to `main`/`pull-request/*`, manual  | GPU smoke tests (required) and E2E tests (A100)       |
+| [gpu-tests.yml](gpu-tests.yml)                     | Nightly , manual                         | GPU smoke tests (required) and E2E tests              |
-| [gpu-tests.yml](gpu-tests.yml)                     | Nightly , manual                         | GPU smoke tests (required) and E2E tests              |
+| [gpu-tests.yml](gpu-tests.yml)                     | Nightly, push to `pull-request/*`, manual | GPU smoke tests (required) and E2E tests             |
-| [gpu-tests.yml](gpu-tests.yml)                     | Nightly , manual                         | GPU smoke tests (required) and E2E tests              |
+| [gpu-tests.yml](gpu-tests.yml)                     | Nightly, push to `pull-request/*`, manual | GPU smoke tests (required) and E2E tests             |
 | [conventional-commit.yml](conventional-commit.yml) | PRs                                      | Validates PR titles follow conventional commit format |
 | [docs.yml](docs.yml)                               | Push to `main` (docs paths)              | Builds and deploys documentation to GitHub Pages      |
 | [release.yml](release.yml)                         | Manual dispatch                          | Builds and publishes package to Test PyPI or PyPI (production)     |
@@ -133,10 +133,10 @@ All jobs run on `ubuntu-latest` (GitHub-hosted).
 
 ## GPU Tests Workflow
 
-The `gpu-tests.yml` workflow runs on pushes to `main` and `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
+The `gpu-tests.yml` workflow runs on a schedule and using `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
-The `gpu-tests.yml` workflow runs on a schedule and using `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
+The `gpu-tests.yml` workflow runs on a schedule and on pushes to `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
-The `gpu-tests.yml` workflow runs on a schedule and using `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
+The `gpu-tests.yml` workflow runs on a schedule and on pushes to `pull-request/*` branches (via copy-pr-bot), and can also be triggered manually via `workflow_dispatch`:
 
-- GPU Smoke Tests: Quick smoke tests on `linux-amd64-gpu-a100-latest-1` (A100) with a 30-minute job timeout and 20-minute step timeout. Required for merge.
-- GPU E2E Tests: End-to-end tests on `linux-amd64-gpu-a100-latest-1` (A100) with a 55-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge.
+- GPU Smoke Tests: Quick smoke tests on a GPU runner with a 30-minute job timeout and 20-minute step timeout. Required for merge.
+- GPU E2E Tests: End-to-end tests on a GPU runner with a 55-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge.
 - GPU CI Status: Aggregation job -- single required check for branch protection. Fails if smoke tests fail; warns if E2E tests fail.
 
 The `changes` (Detect Changes) job always runs, including on `workflow_dispatch`. `dorny/paths-filter` outputs `true` for all filters when there is no base commit to diff against, so downstream jobs always run on a manual dispatch. The job must not be conditionally skipped: a skipped `needs` dependency causes downstream jobs to be skipped even when their own `if` condition would pass.
@@ -154,8 +154,8 @@ To trigger from the PR UI and get a status check result, use `/sync` -- see [On-
 | Workflow | Job | Runner Label | Type |
 | --- | --- | --- | --- |
 | CI Checks | All jobs | `ubuntu-latest` | GitHub-hosted |
-| GPU Tests | GPU Smoke Tests | `linux-amd64-gpu-a100-latest-1` | NVIDIA self-hosted GPU (A100) |
-| GPU Tests | GPU E2E Tests | `linux-amd64-gpu-a100-latest-1` | NVIDIA self-hosted GPU (A100) |
+| GPU Tests | GPU Smoke Tests | `nemo-ci-aws-gpu-x2` | NVIDIA self-hosted GPU |
+| GPU Tests | GPU E2E Tests | `nemo-ci-aws-gpu-x2` | NVIDIA self-hosted GPU |
 | GPU Tests | Detect Changes, GPU CI Status | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) |
 | Dev Wheel | All jobs | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) |
 | Internal Release | All jobs | `linux-amd64-cpu4` | NVIDIA self-hosted CPU (4-core) |

@@ -22,9 +22,10 @@
 name: GPU Tests
 
 on:
+  schedule:
+    - cron: '0 2 * * *'
   push:
     branches:
-      - main
       - "pull-request/[0-9]+"
-      - "pull-request/[0-9]+"
+      - "pull-request/*"
-      - "pull-request/[0-9]+"
+      - "pull-request/*"
   workflow_dispatch:
 
@@ -60,7 +61,7 @@ jobs:
     needs: changes
     # Nightly runs are meant to exercise the GPU suite regardless of which paths changed,
     # so bypass the change filter for `schedule` the same way as for `workflow_dispatch`.
     if: ${{ needs.changes.outputs.src == 'true' || needs.changes.outputs.test == 'true' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }}
     timeout-minutes: 30
-    runs-on: linux-amd64-gpu-a100-latest-1
+    runs-on: nemo-ci-aws-gpu-x2
     strategy:
       fail-fast: false
       matrix:
@@ -71,6 +72,9 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Install make
+        run: apt-get update && apt-get install -y --no-install-recommends make
-        run: apt-get update && apt-get install -y --no-install-recommends make
+        run: |
+          if ! command -v make >/dev/null 2>&1; then
+            apt-get update
+            apt-get install -y --no-install-recommends make
+          fi
-        run: apt-get update && apt-get install -y --no-install-recommends make
+        run: |
+          if ! command -v make >/dev/null 2>&1; then
+            apt-get update
+            apt-get install -y --no-install-recommends make
+          fi
+
       - name: Setup Python environment
         uses: ./.github/actions/setup-python-env
         with:
@@ -80,6 +84,10 @@ jobs:
       - name: Bootstrap CUDA environment
         run: make bootstrap-nss cu128
 
+      - name: Check GPU availability
+        run: |
+          uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
-          uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
+          uv run python -c "import sys, torch; available = torch.cuda.is_available(); count = torch.cuda.device_count(); print('cuda available:', available); print('device count:', count); sys.exit('CUDA is not available on this runner') if not available else sys.exit('No CUDA devices detected on this runner') if count < 1 else None"
-          uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
+          uv run python -c "import sys, torch; available = torch.cuda.is_available(); count = torch.cuda.device_count(); print('cuda available:', available); print('device count:', count); sys.exit('CUDA is not available on this runner') if not available else sys.exit('No CUDA devices detected on this runner') if count < 1 else None"
+
       - name: Run GPU smoke tests
         timeout-minutes: 20
         run: make test-smoke-gpu
@@ -89,13 +97,16 @@ jobs:
     needs: changes
     # Nightly runs are meant to exercise the GPU suite regardless of which paths changed,
     # so bypass the change filter for `schedule` the same way as for `workflow_dispatch`.
     if: ${{ needs.changes.outputs.src == 'true' || needs.changes.outputs.test == 'true' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }}
     timeout-minutes: 55
-    runs-on: linux-amd64-gpu-a100-latest-1
+    runs-on: nemo-ci-aws-gpu-x2
-    runs-on: nemo-ci-aws-gpu-x2
+    runs-on: linux-amd64-gpu-a100-latest-1
-    runs-on: nemo-ci-aws-gpu-x2
+    runs-on: linux-amd64-gpu-a100-latest-1
     steps:
       - name: checkout
         uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
+      # Only touch apt when `make` is missing, so runner images that already ship it
+      # skip the package-index refresh (and its network dependency) entirely.
+      - name: Install make
+        run: |
+          if ! command -v make >/dev/null 2>&1; then
+            apt-get update
+            apt-get install -y --no-install-recommends make
+          fi
+
       - name: Setup Python environment
         uses: ./.github/actions/setup-python-env
         with:
@@ -105,6 +116,10 @@ jobs:
       - name: Bootstrap CUDA environment
         run: make bootstrap-nss cu128
 
+      # Fail the job right away when the runner exposes no usable CUDA device, instead of
+      # only printing the status and letting the E2E tests fail later for a less obvious reason.
+      - name: Check GPU availability
+        run: |
+          uv run python - <<'PY'
+          import sys
+          import torch
+
+          available = torch.cuda.is_available()
+          count = torch.cuda.device_count()
+          print('cuda available:', available)
+          print('device count:', count)
+          # sys.exit with a string writes it to stderr and exits with status 1.
+          if not available:
+              sys.exit('CUDA is not available on this runner')
+          if count < 1:
+              sys.exit('No CUDA devices detected on this runner')
+          PY
+
       - name: Run GPU E2E tests
         timeout-minutes: 45
         run: make test-e2e