NVIDIA · pciolkosz · Jun 12, 2026 · Jun 14, 2026
@@ -268,6 +268,13 @@ def get_gpu(gpu_string):
     result = matrix_yaml["gpus"][gpu_string]
     result["id"] = gpu_string
 
+    required_fields = ["name", "runner", "sm"]
+    missing_fields = [field for field in required_fields if field not in result]
+    if missing_fields:
+        raise Exception(
+            f"GPU '{gpu_string}' is missing required field(s): {', '.join(missing_fields)}"
+        )
+
     if "testing" not in result:
         result["testing"] = False
 
@@ -426,7 +433,11 @@ def generate_dispatch_group_name(matrix_job):
 def generate_dispatch_job_name(matrix_job, job_type):
     job_info = get_job_type_info(job_type)
     cpu_str = matrix_job["cpu"]
-    gpu_str = (", " + matrix_job["gpu"].upper()) if job_info["gpu"] else ""
+    if job_info["gpu"]:
+        gpu = get_gpu(matrix_job["gpu"])
+        gpu_str = ", " + gpu["name"]
+    else:
+        gpu_str = ""
     cuda_compile_arch = (
         (" sm{" + str(matrix_job["sm"]) + "}") if "sm" in matrix_job else ""
     )
@@ -470,7 +481,7 @@ def generate_dispatch_job_runner(matrix_job, job_type):
     gpu = get_gpu(matrix_job["gpu"])
     suffix = "-testing" if gpu["testing"] else ""
 
-    return f"{runner_os}-{cpu}-gpu-{gpu['id']}-latest-1{suffix}"
+    return f"{runner_os}-{cpu}-gpu-{gpu['runner']}{suffix}"
 
 
 def generate_dispatch_job_ctk_version(matrix_job, job_type):

@@ -169,9 +169,14 @@ runs:
 
         declare -a gpu_request=()
 
-        # Explicitly pass which GPU to use if on a GPU runner
+        # Explicitly pass which GPU to use if on a GPU runner.
         if [[ "${JOB_RUNNER}" = *"-gpu-"* ]]; then
-          gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}")
+          # Runner labels end with "-<gpu-count>", optionally followed by "-testing"; when the count is > 1, expose the full set.
+          if [[ "${JOB_RUNNER}" =~ -gpu-.+-([0-9]+)(-testing)?$ ]] && (( BASH_REMATCH[1] > 1 )); then
+            gpu_request+=(--gpus all)
+          else
+            gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}")
+          fi
         fi
 
         # If the image contains "cudaXX.Yext"...

@@ -54,6 +54,8 @@ workflows:
     - {jobs: ['test_nolid', 'test_lid0'], project: 'cub',                   std: 'max', gpu: 'h100' }
     - {jobs: ['test_gpu'],                project: 'thrust',                std: 'max', gpu: 'h100' }
     - {jobs: ['test'],                    project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' }
+    # Multi-GPU coverage:
+    - {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
     # RTX PRO 6000 coverage (limited due to small number of runners):
     - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'}
     # Misc:
@@ -645,17 +647,20 @@ projects:
     name: 'clang-tidy'
     stds: [17]
 
-# testing -> Runner with GPU is in a nv-gh-runners testing pool
+# name -> Display name for generated job labels.
+# runner -> GPU/driver/count segment of the GHA runner label.
+# testing -> Runner with GPU is in a nv-gh-runners testing pool.
 gpus:
-  t4:         { sm: 75  } # 16 GB,  10 runners
-  rtx2080:    { sm: 75  } #  8 GB,  12 runners
-  rtxa6000:   { sm: 86  } # 48 GB,  12 runners
-  l4:         { sm: 89  } # 24 GB,  48 runners
-  rtx4090:    { sm: 89  } # 24 GB,  10 runners
-  h100:       { sm: 90  } # 80 GB,  16 runners
+  t4:         { name: 'T4',          sm: 75,  runner: 't4-latest-1'         } # 16 GB,  10 runners
+  rtx2080:    { name: 'RTX2080',     sm: 75,  runner: 'rtx2080-latest-1'    } #  8 GB,  12 runners
+  rtxa6000:   { name: 'RTXA6000',    sm: 86,  runner: 'rtxa6000-latest-1'   } # 48 GB,  12 runners
+  l4:         { name: 'L4',          sm: 89,  runner: 'l4-latest-1'         } # 24 GB,  48 runners
+  rtx4090:    { name: 'RTX4090',     sm: 89,  runner: 'rtx4090-latest-1'    } # 24 GB,  10 runners
+  h100:       { name: 'H100',        sm: 90,  runner: 'h100-latest-1'       } # 80 GB,  16 runners
+  h100_2gpu:  { name: 'H100 x2',     sm: 90,  runner: 'h100-latest-2'       } # 2 x 80 GB
   # Very small number of runners on loan from cuda-python while we wait for our order to arrive.
   # Limit jobs on these:
-  rtxpro6000: { sm: 120 }
+  rtxpro6000: { name: 'RTXPRO6000',  sm: 120, runner: 'rtxpro6000-latest-1' }
 
 # Tags are used to define a `matrix job` in the workflow section.
 #