diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index b2eb13489e6..631a60f9e05 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -268,6 +268,13 @@ def get_gpu(gpu_string): result = matrix_yaml["gpus"][gpu_string] result["id"] = gpu_string + required_fields = ["name", "runner", "sm"] + missing_fields = [field for field in required_fields if field not in result] + if missing_fields: + raise Exception( + f"GPU '{gpu_string}' is missing required field(s): {', '.join(missing_fields)}" + ) + if "testing" not in result: result["testing"] = False @@ -426,7 +433,11 @@ def generate_dispatch_group_name(matrix_job): def generate_dispatch_job_name(matrix_job, job_type): job_info = get_job_type_info(job_type) cpu_str = matrix_job["cpu"] - gpu_str = (", " + matrix_job["gpu"].upper()) if job_info["gpu"] else "" + if job_info["gpu"]: + gpu = get_gpu(matrix_job["gpu"]) + gpu_str = ", " + gpu["name"] + else: + gpu_str = "" cuda_compile_arch = ( (" sm{" + str(matrix_job["sm"]) + "}") if "sm" in matrix_job else "" ) @@ -470,7 +481,7 @@ def generate_dispatch_job_runner(matrix_job, job_type): gpu = get_gpu(matrix_job["gpu"]) suffix = "-testing" if gpu["testing"] else "" - return f"{runner_os}-{cpu}-gpu-{gpu['id']}-latest-1{suffix}" + return f"{runner_os}-{cpu}-gpu-{gpu['runner']}{suffix}" def generate_dispatch_job_ctk_version(matrix_job, job_type): diff --git a/.github/actions/workflow-run-job-linux/action.yml b/.github/actions/workflow-run-job-linux/action.yml index b2daee41d2a..bfe4898e4ec 100644 --- a/.github/actions/workflow-run-job-linux/action.yml +++ b/.github/actions/workflow-run-job-linux/action.yml @@ -169,9 +169,14 @@ runs: declare -a gpu_request=() - # Explicitly pass which GPU to use if on a GPU runner + # Explicitly pass which GPU to use if on a GPU runner. 
if [[ "${JOB_RUNNER}" = *"-gpu-"* ]]; then - gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}") + # Multi-GPU runner labels end with "-N" (N > 1), optionally followed by "-testing"; expose the full set. + if [[ "${JOB_RUNNER}" =~ -gpu-.+-([0-9]+)(-testing)?$ ]] && (( BASH_REMATCH[1] > 1 )); then + gpu_request+=(--gpus all) + else + gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}") + fi fi # If the image contains "cudaXX.Yext"... diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 6146d863c87..daf13f7ffd9 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -54,6 +54,8 @@ workflows: - {jobs: ['test_nolid', 'test_lid0'], project: 'cub', std: 'max', gpu: 'h100' } - {jobs: ['test_gpu'], project: 'thrust', std: 'max', gpu: 'h100' } - {jobs: ['test'], project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' } + # Multi-GPU coverage: + - {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'} # RTX PRO 6000 coverage (limited due to small number of runners): - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'} # Misc: @@ -645,17 +647,20 @@ projects: name: 'clang-tidy' stds: [17] -# testing -> Runner with GPU is in a nv-gh-runners testing pool +# name -> Display name for generated job labels. +# runner -> GPU/driver/count segment of the GHA runner label. +# testing -> Runner with GPU is in a nv-gh-runners testing pool. 
gpus: - t4: { sm: 75 } # 16 GB, 10 runners - rtx2080: { sm: 75 } # 8 GB, 12 runners - rtxa6000: { sm: 86 } # 48 GB, 12 runners - l4: { sm: 89 } # 24 GB, 48 runners - rtx4090: { sm: 89 } # 24 GB, 10 runners - h100: { sm: 90 } # 80 GB, 16 runners + t4: { name: 'T4', sm: 75, runner: 't4-latest-1' } # 16 GB, 10 runners + rtx2080: { name: 'RTX2080', sm: 75, runner: 'rtx2080-latest-1' } # 8 GB, 12 runners + rtxa6000: { name: 'RTXA6000', sm: 86, runner: 'rtxa6000-latest-1' } # 48 GB, 12 runners + l4: { name: 'L4', sm: 89, runner: 'l4-latest-1' } # 24 GB, 48 runners + rtx4090: { name: 'RTX4090', sm: 89, runner: 'rtx4090-latest-1' } # 24 GB, 10 runners + h100: { name: 'H100', sm: 90, runner: 'h100-latest-1' } # 80 GB, 16 runners + h100_2gpu: { name: 'H100 x2', sm: 90, runner: 'h100-latest-2' } # 2 x 80 GB # Very small number of runners on loan from cuda-python while we wait for our order to arrive. # Limit jobs on these: - rtxpro6000: { sm: 120 } + rtxpro6000: { name: 'RTXPRO6000', sm: 120, runner: 'rtxpro6000-latest-1' } # Tags are used to define a `matrix job` in the workflow section. #