diff --git a/aks-node-controller/parser/helper_test.go b/aks-node-controller/parser/helper_test.go index 2c950eee6f4..69323fcd9d8 100644 --- a/aks-node-controller/parser/helper_test.go +++ b/aks-node-controller/parser/helper_test.go @@ -343,6 +343,41 @@ oom_score = -999 X-Meta-Source-Client = ["azure/aks"] [metrics] address = "0.0.0.0:10257" +`)), + }, + { + name: "Containerd Configurations with NVIDIA and CDI", + args: args{ + aksnodeconfig: &aksnodeconfigv1.Configuration{ + NeedsCgroupv2: ToPtr(true), + GpuConfig: &aksnodeconfigv1.GpuConfig{ + EnableNvidia: ToPtr(true), + RequiresCdi: ToPtr(true), + }, + }, + noGpu: false, + }, + want: base64.StdEncoding.EncodeToString([]byte(`version = 2 +oom_score = -999 +[plugins."io.containerd.grpc.v1.cri"] + enable_cdi = true + cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"] + sandbox_image = "" + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia-container-runtime" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-container-runtime.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + [plugins."io.containerd.grpc.v1.cri".registry.headers] + X-Meta-Source-Client = ["azure/aks"] +[metrics] + address = "0.0.0.0:10257" `)), }, } diff --git a/aks-node-controller/parser/templates/containerd.toml.gtpl b/aks-node-controller/parser/templates/containerd.toml.gtpl index 400c543e0f2..177739af91d 100644 --- a/aks-node-controller/parser/templates/containerd.toml.gtpl +++ b/aks-node-controller/parser/templates/containerd.toml.gtpl @@ -2,6 +2,10 @@ version = 2 oom_score = -999{{if getHasDataDir .KubeletConfig}} root = "{{.KubeletConfig.GetContainerDataDir}}"{{- end}} [plugins."io.containerd.grpc.v1.cri"] + {{- if and ( .GetGpuConfig ) ( .GpuConfig.GetRequiresCdi ) }} + enable_cdi = true + cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"] + {{- end }} sandbox_image = "{{ .KubeBinaryConfig.GetPodInfraContainerImageUrl }}" [plugins."io.containerd.grpc.v1.cri".containerd] {{- if .TeleportConfig.GetStatus }} diff --git a/aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go b/aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go index d86efa5560a..720e15124be 100644 --- a/aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go +++ b/aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go @@ -36,6 +36,8 @@ type GpuConfig struct { GpuInstanceProfile string `protobuf:"bytes,4,opt,name=gpu_instance_profile,json=gpuInstanceProfile,proto3" json:"gpu_instance_profile,omitempty"` // Same as enable_nvidia, but for AMD GPUs. EnableAmdGpu *bool `protobuf:"varint,5,opt,name=enable_amd_gpu,json=enableAmdGpu,proto3,oneof" json:"enable_amd_gpu,omitempty"` + // Whether this GPU configuration requires CDI (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html). + RequiresCdi *bool `protobuf:"varint,6,opt,name=requires_cdi,json=requiresCdi,proto3,oneof" json:"requires_cdi,omitempty"` } func (x *GpuConfig) Reset() { @@ -103,13 +105,20 @@ func (x *GpuConfig) GetEnableAmdGpu() bool { return false } +func (x *GpuConfig) GetRequiresCdi() bool { + if x != nil && x.RequiresCdi != nil { + return *x.RequiresCdi + } + return false +} + var File_aksnodeconfig_v1_gpu_config_proto protoreflect.FileDescriptor var file_aksnodeconfig_v1_gpu_config_proto_rawDesc = []byte{ 0x0a, 0x21, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x76, 0x31, 0x2f, 0x67, 0x70, 0x75, 0x5f, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x10, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, - 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0x8f, 0x02, 0x0a, 0x09, 0x47, 0x70, 0x75, 0x43, 0x6f, 0x6e, + 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0xc8, 0x02, 0x0a, 0x09, 0x47, 0x70, 0x75, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x28, 0x0a, 0x0d, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6e, 0x76, 0x69, 0x64, 0x69, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x48, 0x00, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x4e, 0x76, 0x69, 0x64, 0x69, 0x61, 0x88, 0x01, 0x01, 0x12, 0x2a, 0x0a, @@ -124,15 +133,19 @@ var file_aksnodeconfig_v1_gpu_config_proto_rawDesc = []byte{ 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x29, 0x0a, 0x0e, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x6d, 0x64, 0x5f, 0x67, 0x70, 0x75, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x48, 0x01, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x41, 0x6d, 0x64, 0x47, 0x70, 0x75, 0x88, - 0x01, 0x01, 0x42, 0x10, 0x0a, 0x0e, 0x5f, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6e, 0x76, - 0x69, 0x64, 0x69, 0x61, 0x42, 0x11, 0x0a, 0x0f, 0x5f, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, - 0x61, 0x6d, 0x64, 0x5f, 0x67, 0x70, 0x75, 0x42, 0x5a, 0x5a, 0x58, 0x67, 0x69, 0x74, 0x68, 0x75, - 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x41, 0x7a, 0x75, 0x72, 0x65, 0x2f, 0x61, 0x67, 0x65, 0x6e, - 0x74, 0x62, 0x61, 0x6b, 0x65, 0x72, 0x2f, 0x61, 0x6b, 0x73, 0x2d, 0x6e, 0x6f, 0x64, 0x65, 0x2d, - 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x67, - 0x65, 0x6e, 0x2f, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, - 0x2f, 0x76, 0x31, 0x3b, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, - 0x67, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x01, 0x01, 0x12, 0x26, 0x0a, 0x0c, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, 0x5f, 0x63, + 0x64, 0x69, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x48, 0x02, 0x52, 0x0b, 0x72, 0x65, 0x71, 0x75, + 0x69, 0x72, 0x65, 0x73, 0x43, 0x64, 0x69, 0x88, 0x01, 0x01, 0x42, 0x10, 0x0a, 0x0e, 0x5f, 0x65, + 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6e, 0x76, 0x69, 0x64, 0x69, 0x61, 0x42, 0x11, 0x0a, 0x0f, + 0x5f, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x6d, 0x64, 0x5f, 0x67, 0x70, 0x75, 0x42, + 0x0f, 0x0a, 0x0d, 0x5f, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, 0x5f, 0x63, 0x64, 0x69, + 0x42, 0x5a, 0x5a, 0x58, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x41, + 0x7a, 0x75, 0x72, 0x65, 0x2f, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x62, 0x61, 0x6b, 0x65, 0x72, 0x2f, + 0x61, 0x6b, 0x73, 0x2d, 0x6e, 0x6f, 0x64, 0x65, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x61, 0x6b, 0x73, 0x6e, + 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x76, 0x31, 0x3b, 0x61, 0x6b, 0x73, + 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto b/aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto index 1d645eef7dc..7ee7a4a7c3f 100644 --- a/aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto +++ b/aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto @@ -19,4 +19,7 @@ message GpuConfig { // Same as enable_nvidia, but for AMD GPUs. optional bool enable_amd_gpu = 5; + + // Whether this GPU configuration requires CDI (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html). + optional bool requires_cdi = 6; }