From 72bd191ca6ac7cf1ec49611da10bf96563f78738 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Thu, 30 Oct 2025 13:20:46 -0400 Subject: [PATCH 1/5] Test using new CI images Signed-off-by: Dillon Cullinan --- .github/actions/docker-build/action.yml | 6 --- .../container-validation-backends.yml | 48 +++++-------------- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index ed4617bd9c..b42a4bb773 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -62,12 +62,6 @@ runs: uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 #v3.11.1 with: driver: docker - - name: Install awscli - shell: bash - run: | - curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - name: Login to ECR shell: bash env: diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index b197838242..1d2598a39d 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -44,8 +44,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: cpu-amd-m5-2xlarge } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + - { arch: amd64, runner: cpu-amd-m5-2xlarge-test } + - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } name: operator (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -55,12 +55,6 @@ jobs: uses: docker/setup-buildx-action@v3 with: driver: docker - - name: Install awscli - shell: bash - run: | - curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - name: Login to ECR shell: bash env: @@ -113,8 +107,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64 } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + - { arch: amd64, runner: gpu-l40-amd64-test } + - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } name: vllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -178,8 +172,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64 } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + - { arch: amd64, runner: gpu-l40-amd64-test } + - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } name: sglang (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -241,8 +235,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64 } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + - { arch: amd64, runner: gpu-l40-amd64-test } + - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } name: trtllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -348,7 +342,7 @@ jobs: python3 .github/workflows/upload_complete_workflow_metrics.py deploy-test-vllm: - runs-on: cpu-amd-m5-2xlarge + runs-on: cpu-amd-m5-2xlarge-test if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, vllm] permissions: @@ -369,7 +363,7 @@ jobs: MODEL_NAME: "Qwen/Qwen3-0.6B" steps: &deploy-test-steps - uses: actions/checkout@v4 - - name: Set namespace and install dependencies + - name: Set namespace run: | # Set namespace using FRAMEWORK env var PROFILE_SANITIZED="${{ matrix.profile }}" @@ -377,24 +371,6 @@ jobs: echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV set -x - # Install dependencies - sudo apt-get update && sudo apt-get install -y curl bash openssl gettext git jq - - # Install yq - echo "Installing yq..." - curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o yq - sudo chmod 755 yq - sudo mv yq /usr/local/bin/ - # Install Helm - echo "Installing Helm..." - curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - sudo chmod 700 get_helm.sh - sudo ./get_helm.sh - # Install kubectl - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - sudo chmod 755 kubectl - sudo mv kubectl /usr/local/bin/ - # Setup kubeconfig echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig chmod 600 .kubeconfig @@ -568,7 +544,7 @@ jobs: echo "Namespace $NAMESPACE completed." deploy-test-sglang: - runs-on: cpu-amd-m5-2xlarge + runs-on: cpu-amd-m5-2xlarge-test if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, sglang] permissions: @@ -588,7 +564,7 @@ jobs: steps: *deploy-test-steps deploy-test-trtllm: - runs-on: cpu-amd-m5-2xlarge + runs-on: cpu-amd-m5-2xlarge-test if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, trtllm] permissions: From a16e99df4610d2bc7f4686f108f8c73f04804b18 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Thu, 30 Oct 2025 13:37:57 -0400 Subject: [PATCH 2/5] Fix arm runners Signed-off-by: Dillon Cullinan --- .github/workflows/container-validation-backends.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index 1d2598a39d..23a7bc0074 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -45,7 +45,7 @@ jobs: matrix: platform: - { arch: amd64, runner: cpu-amd-m5-2xlarge-test } - - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } name: operator (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -108,7 +108,7 @@ jobs: matrix: platform: - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } name: vllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -173,7 +173,7 @@ jobs: matrix: platform: - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } name: sglang (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -236,7 +236,7 @@ jobs: matrix: platform: - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-amd-m5-2xlarge-test } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } name: trtllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: From 251103547b0995116e59b3cd52a6c7445057673b Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Thu, 30 Oct 2025 14:53:19 -0400 Subject: [PATCH 3/5] Debug docker runtime Signed-off-by: Dillon Cullinan --- .github/actions/pytest/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index 0037129a5e..97521e92b4 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -54,6 +54,7 @@ runs: # Run pytest with detailed output and JUnit XML set +e # Don't exit on test failures + sudo cat /etc/docker/daemon.json docker run --runtime=nvidia --rm --gpus all -w /workspace \ --cpus=${NUM_CPUS} \ --network host \ From 54c50149270d79e9f326c6e4a20bb1fbf3494a82 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Fri, 31 Oct 2025 11:19:09 -0400 Subject: [PATCH 4/5] Revert testing commits Signed-off-by: Dillon Cullinan --- .github/actions/pytest/action.yml | 1 - .../container-validation-backends.yml | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index 97521e92b4..0037129a5e 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -54,7 +54,6 @@ runs: # Run pytest with detailed output and JUnit XML set +e # Don't exit on test failures - sudo cat /etc/docker/daemon.json docker run --runtime=nvidia --rm --gpus all -w /workspace \ --cpus=${NUM_CPUS} \ --network host \ diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index 23a7bc0074..7c1bf3d2b9 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -107,8 +107,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } + - { arch: amd64, runner: gpu-l40-amd64 } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } name: vllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -172,8 +172,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } + - { arch: amd64, runner: gpu-l40-amd64 } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } name: sglang (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -235,8 +235,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: gpu-l40-amd64-test } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } + - { arch: amd64, runner: gpu-l40-amd64 } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } name: trtllm (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: @@ -342,7 +342,7 @@ jobs: python3 .github/workflows/upload_complete_workflow_metrics.py deploy-test-vllm: - runs-on: cpu-amd-m5-2xlarge-test + runs-on: cpu-amd-m5-2xlarge if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, vllm] permissions: @@ -544,7 +544,7 @@ jobs: echo "Namespace $NAMESPACE completed." deploy-test-sglang: - runs-on: cpu-amd-m5-2xlarge-test + runs-on: cpu-amd-m5-2xlarge if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, sglang] permissions: @@ -564,7 +564,7 @@ jobs: steps: *deploy-test-steps deploy-test-trtllm: - runs-on: cpu-amd-m5-2xlarge-test + runs-on: cpu-amd-m5-2xlarge if: needs.changed-files.outputs.has_code_changes == 'true' needs: [changed-files, operator, trtllm] permissions: From 797be9c23b45bcddaa3ee24f8e8e2122270b8c1a Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Fri, 31 Oct 2025 11:55:35 -0400 Subject: [PATCH 5/5] Small removal fix Signed-off-by: Dillon Cullinan --- .github/workflows/container-validation-backends.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index 7c1bf3d2b9..e2f4c82216 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -44,8 +44,8 @@ jobs: fail-fast: false matrix: platform: - - { arch: amd64, runner: cpu-amd-m5-2xlarge-test } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge-test } + - { arch: amd64, runner: cpu-amd-m5-2xlarge } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } name: operator (${{ matrix.platform.arch }}) runs-on: ${{ matrix.platform.runner }} steps: