diff --git a/e2e/config/azure.go b/e2e/config/azure.go
index 4a2fcc48e54..2b31d463e9a 100644
--- a/e2e/config/azure.go
+++ b/e2e/config/azure.go
@@ -290,25 +290,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin
 }
 
 func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
-	identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
-		Location: to.Ptr(Config.Location),
-	}, nil)
-	if err != nil {
-		return "", fmt.Errorf("create managed identity: %w", err)
-	}
-	err = a.createBlobStorageAccount(ctx)
-	if err != nil {
-		return "", err
-	}
-	err = a.createBlobStorageContainer(ctx)
-	if err != nil {
-		return "", err
-	}
-
-	if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
-		return "", err
-	}
-	return *identity.Properties.ClientID, nil
+	// HACK: temporarily disabled so the tests can run in a different subscription without sufficient permissions
+	return "", nil
+	// identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
+	// 	Location: to.Ptr(Config.Location),
+	// }, nil)
+	// if err != nil {
+	// 	return "", fmt.Errorf("create managed identity: %w", err)
+	// }
+	// err = a.createBlobStorageAccount(ctx)
+	// if err != nil {
+	// 	return "", err
+	// }
+	// err = a.createBlobStorageContainer(ctx)
+	// if err != nil {
+	// 	return "", err
+	// }
+
+	// if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
+	// 	return "", err
+	// }
+	// return *identity.Properties.ClientID, nil
 }
 
 func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error {
diff --git a/e2e/kube.go b/e2e/kube.go
index 5f7455b236c..d19bc99ff28 100644
--- a/e2e/kube.go
+++ b/e2e/kube.go
@@ -637,3 +637,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet {
 		},
 	}
 }
+
+func podEnableAMDGPUResource(s *Scenario) *corev1.Pod {
+	return &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName),
+			Namespace: defaultNamespace,
+		},
+		Spec: corev1.PodSpec{
+			PriorityClassName: "system-node-critical",
+			NodeSelector: map[string]string{
+				"kubernetes.io/hostname": s.Runtime.KubeNodeName,
+			},
+			Containers: []corev1.Container{
+				{
+					Name:  "amdgpu-device-plugin-container",
+					Image: "rocm/k8s-device-plugin",
+					VolumeMounts: []corev1.VolumeMount{
+						{
+							Name:      "device-plugin",
+							MountPath: "/var/lib/kubelet/device-plugins",
+						},
+						{
+							Name:      "sys",
+							MountPath: "/sys",
+						},
+					},
+				},
+			},
+			Volumes: []corev1.Volume{
+				{
+					Name: "device-plugin",
+					VolumeSource: corev1.VolumeSource{
+						HostPath: &corev1.HostPathVolumeSource{
+							Path: "/var/lib/kubelet/device-plugins",
+						},
+					},
+				},
+				{
+					Name: "sys",
+					VolumeSource: corev1.VolumeSource{
+						HostPath: &corev1.HostPathVolumeSource{
+							Path: "/sys",
+						},
+					},
+				},
+			},
+		},
+	}
+}
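The `ensureJob(ctx, s, jobAMDGPUWorkload(s))` call left commented out in `ValidateAMDGPU` below references a helper this diff never defines. A minimal sketch of what it could look like alongside `podEnableAMDGPUResource` in kube.go, assuming a `batchv1.Job` ("k8s.io/api/batch/v1") and the `resource` package ("k8s.io/apimachinery/pkg/api/resource") are imported; the `rocm/rocm-terminal` image and `rocm-smi` command are illustrative assumptions, not part of this change:

// jobAMDGPUWorkload is a hypothetical sketch of the helper that
// ValidateAMDGPU (below) leaves commented out. It schedules a one-shot
// workload that can only run once an AMD GPU is advertised on the node.
func jobAMDGPUWorkload(s *Scenario) *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-amdgpu-workload", s.Runtime.KubeNodeName),
			Namespace: defaultNamespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					// Pin the job to the node under test, mirroring podEnableAMDGPUResource.
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": s.Runtime.KubeNodeName,
					},
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{
						{
							Name:    "amdgpu-workload",
							Image:   "rocm/rocm-terminal", // illustrative ROCm userspace image
							Command: []string{"rocm-smi"}, // fails unless a GPU is actually visible
							Resources: corev1.ResourceRequirements{
								// Requesting the extended resource keeps the pod Pending
								// until the kubelet has registered amd.com/gpu.
								Limits: corev1.ResourceList{
									"amd.com/gpu": resource.MustParse("1"),
								},
							},
						},
					},
				},
			},
		},
	}
}

Requesting the extended resource in the limits is what ties the job to the device plugin: the pod stays Pending until the kubelet advertises at least one `amd.com/gpu`, so a completed job doubles as proof that both plugin registration and scheduling work.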
diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go
index 748e6ba116f..47c91357fbb 100644
--- a/e2e/scenario_test.go
+++ b/e2e/scenario_test.go
@@ -1664,3 +1664,63 @@ func Test_Ubuntu2404ARM(t *testing.T) {
 		},
 	})
 }
+
+func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) {
+	// t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet")
+	RunScenario(t, &Scenario{
+		Description: "Tests that a GPU-enabled node using an Ubuntu 2204 VHD can be properly bootstrapped",
+		Tags: Tags{
+			GPU: true,
+		},
+		Config: Config{
+			Cluster: ClusterKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd, // TODO: add support for older VHDs
+			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
+				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5"
+				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
+				nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5"
+				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
+				nbc.EnableAMDGPU = true
+				nbc.ConfigGPUDriverIfNeeded = true
+			},
+			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
+				vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5")
+				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and GPU images are huge; give us some headroom
+			},
+			Validator: func(ctx context.Context, s *Scenario) {
+				ValidateAMDGPU(ctx, s)
+			},
+		},
+	})
+}
+
+func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) {
+	// The SKU isn't available in the subscription/region where we run the tests.
+	// t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet")
+	// LOCATION=southcentralus
+	RunScenario(t, &Scenario{
+		Description: "Tests that a GPU-enabled node using an Ubuntu 2204 VHD can be properly bootstrapped",
+		Tags: Tags{
+			GPU: true,
+		},
+		Config: Config{
+			Cluster: ClusterKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd,
+			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
+				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5"
+				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
+				nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5"
+				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
+				nbc.EnableAMDGPU = true
+				nbc.ConfigGPUDriverIfNeeded = true
+			},
+			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
+				vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5")
+				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and GPU images are huge; give us some headroom
+			},
+			Validator: func(ctx context.Context, s *Scenario) {
+				ValidateAMDGPU(ctx, s)
+			},
+		},
+	})
+}
diff --git a/e2e/validators.go b/e2e/validators.go
index 65b50da03dd..8d397fa16b6 100644
--- a/e2e/validators.go
+++ b/e2e/validators.go
@@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str
 
 	return podExecResult.stdout.String()
 }
+
+func ValidateAMDGPU(ctx context.Context, s *Scenario) {
+	s.T.Logf("validating that the node has an allocatable AMD GPU")
+
+	execResult := execScriptOnVMForScenario(ctx, s, "lspci -k")
+	require.Equal(s.T, "0", execResult.exitCode, "expected lspci to exit with code 0, but it did not")
+	assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see the amdgpu kernel module managing a PCI device, but did not")
+
+	ensurePod(ctx, s, podEnableAMDGPUResource(s))
+	waitUntilResourceAvailable(ctx, s, "amd.com/gpu")
+	// ensureJob(ctx, s, jobAMDGPUWorkload(s))
+}
\ No newline at end of file
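Once the ROCm device plugin registers with the kubelet, the node advertises `amd.com/gpu` under `status.allocatable`, which is what `waitUntilResourceAvailable` polls for. If that wait times out, a plain kubectl query (standard kubectl JSONPath, nothing repo-specific; `<node-name>` is a placeholder) shows whether registration ever happened:

kubectl get node <node-name> -o jsonpath='{.status.allocatable.amd\.com/gpu}'

An empty result means the plugin has not registered; a count (for example 8 on a Standard_ND96isr_MI300X_v5 node, which carries eight MI300X GPUs) means the resource is schedulable.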