Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions .github/actions/workflow-build/build-workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,13 @@ def get_gpu(gpu_string):
result = matrix_yaml["gpus"][gpu_string]
result["id"] = gpu_string

required_fields = ["name", "runner", "sm"]
missing_fields = [field for field in required_fields if field not in result]
if missing_fields:
raise Exception(
f"GPU '{gpu_string}' is missing required field(s): {', '.join(missing_fields)}"
)

if "testing" not in result:
result["testing"] = False

Expand Down Expand Up @@ -426,7 +433,11 @@ def generate_dispatch_group_name(matrix_job):
def generate_dispatch_job_name(matrix_job, job_type):
job_info = get_job_type_info(job_type)
cpu_str = matrix_job["cpu"]
gpu_str = (", " + matrix_job["gpu"].upper()) if job_info["gpu"] else ""
if job_info["gpu"]:
gpu = get_gpu(matrix_job["gpu"])
gpu_str = ", " + gpu["name"]
else:
gpu_str = ""
cuda_compile_arch = (
(" sm{" + str(matrix_job["sm"]) + "}") if "sm" in matrix_job else ""
)
Expand Down Expand Up @@ -470,7 +481,7 @@ def generate_dispatch_job_runner(matrix_job, job_type):
gpu = get_gpu(matrix_job["gpu"])
suffix = "-testing" if gpu["testing"] else ""

return f"{runner_os}-{cpu}-gpu-{gpu['id']}-latest-1{suffix}"
return f"{runner_os}-{cpu}-gpu-{gpu['runner']}{suffix}"


def generate_dispatch_job_ctk_version(matrix_job, job_type):
Expand Down
9 changes: 7 additions & 2 deletions .github/actions/workflow-run-job-linux/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,14 @@ runs:

declare -a gpu_request=()

# Explicitly pass which GPU to use if on a GPU runner
# Explicitly pass which GPU to use if on a GPU runner.
if [[ "${JOB_RUNNER}" = *"-gpu-"* ]]; then
gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}")
# Multi-GPU runner labels end with "-<gpu-count>" (optionally followed by "-testing"); when the count is > 1, expose all GPUs.
if [[ "${JOB_RUNNER}" =~ -gpu-.+-([0-9]+)(-testing)?$ ]] && (( BASH_REMATCH[1] > 1 )); then
gpu_request+=(--gpus all)
else
gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES:-}")
fi
fi

# If the image contains "cudaXX.Yext"...
Expand Down
21 changes: 13 additions & 8 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ workflows:
- {jobs: ['test_nolid', 'test_lid0'], project: 'cub', std: 'max', gpu: 'h100' }
- {jobs: ['test_gpu'], project: 'thrust', std: 'max', gpu: 'h100' }
- {jobs: ['test'], project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' }
# Multi-GPU coverage:
- {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
# RTX PRO 6000 coverage (limited due to small number of runners):
- {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'}
# Misc:
Expand Down Expand Up @@ -645,17 +647,20 @@ projects:
name: 'clang-tidy'
stds: [17]

# testing -> Runner with GPU is in a nv-gh-runners testing pool
# name -> Display name for generated job labels.
# runner -> GPU/driver/count segment of the GHA runner label.
# testing -> Runner with GPU is in an nv-gh-runners testing pool.
gpus:
t4: { sm: 75 } # 16 GB, 10 runners
rtx2080: { sm: 75 } # 8 GB, 12 runners
rtxa6000: { sm: 86 } # 48 GB, 12 runners
l4: { sm: 89 } # 24 GB, 48 runners
rtx4090: { sm: 89 } # 24 GB, 10 runners
h100: { sm: 90 } # 80 GB, 16 runners
t4: { name: 'T4', sm: 75, runner: 't4-latest-1' } # 16 GB, 10 runners
rtx2080: { name: 'RTX2080', sm: 75, runner: 'rtx2080-latest-1' } # 8 GB, 12 runners
rtxa6000: { name: 'RTXA6000', sm: 86, runner: 'rtxa6000-latest-1' } # 48 GB, 12 runners
l4: { name: 'L4', sm: 89, runner: 'l4-latest-1' } # 24 GB, 48 runners
rtx4090: { name: 'RTX4090', sm: 89, runner: 'rtx4090-latest-1' } # 24 GB, 10 runners
h100: { name: 'H100', sm: 90, runner: 'h100-latest-1' } # 80 GB, 16 runners
h100_2gpu: { name: 'H100 x2', sm: 90, runner: 'h100-latest-2' } # 2 x 80 GB
# Very small number of runners on loan from cuda-python while we wait for our order to arrive.
# Limit jobs on these:
rtxpro6000: { sm: 120 }
rtxpro6000: { name: 'RTXPRO6000', sm: 120, runner: 'rtxpro6000-latest-1' }

# Tags are used to define a `matrix job` in the workflow section.
#
Expand Down
Loading