Skip to content

Commit

Permalink
feat: add ability to enable CDI in containerd.
Browse files Browse the repository at this point in the history
This is needed to support NVIDIA's DRA (Dynamic Resource Allocation) plugin, which relies on CDI (Container Device Interface) and will be used in tandem with upcoming GPU hardware.
  • Loading branch information
Alex Benn committed Mar 7, 2025
1 parent 47b9aa9 commit ef31941
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 10 deletions.
35 changes: 35 additions & 0 deletions aks-node-controller/parser/helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,41 @@ oom_score = -999
X-Meta-Source-Client = ["azure/aks"]
[metrics]
address = "0.0.0.0:10257"
`)),
},
{
name: "Containerd Configurations with NVIDIA and CDI",
args: args{
aksnodeconfig: &aksnodeconfigv1.Configuration{
NeedsCgroupv2: ToPtr(true),
GpuConfig: &aksnodeconfigv1.GpuConfig{
EnableNvidia: ToPtr(true),
RequiresCdi: ToPtr(true),
},
},
noGpu: false,
},
want: base64.StdEncoding.EncodeToString([]byte(`version = 2
oom_score = -999
[plugins."io.containerd.grpc.v1.cri"]
enable_cdi = true
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
sandbox_image = ""
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia-container-runtime"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
[plugins."io.containerd.grpc.v1.cri".registry.headers]
X-Meta-Source-Client = ["azure/aks"]
[metrics]
address = "0.0.0.0:10257"
`)),
},
}
Expand Down
4 changes: 4 additions & 0 deletions aks-node-controller/parser/templates/containerd.toml.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ version = 2
oom_score = -999{{if getHasDataDir .KubeletConfig}}
root = "{{.KubeletConfig.GetContainerDataDir}}"{{- end}}
[plugins."io.containerd.grpc.v1.cri"]
{{- if and ( .GetGpuConfig ) ( .GpuConfig.GetRequiresCdi ) }}
enable_cdi = true
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
{{- end }}
sandbox_image = "{{ .KubeBinaryConfig.GetPodInfraContainerImageUrl }}"
[plugins."io.containerd.grpc.v1.cri".containerd]
{{- if .TeleportConfig.GetStatus }}
Expand Down
33 changes: 23 additions & 10 deletions aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ message GpuConfig {

// Same as enable_nvidia, but for AMD GPUs.
optional bool enable_amd_gpu = 5;

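// Whether this GPU configuration requires the Container Device Interface (CDI). When true, the generated containerd config sets enable_cdi and cdi_spec_dirs for the CRI plugin (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html).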
optional bool requires_cdi = 6;
}

0 comments on commit ef31941

Please sign in to comment.