Skip to content

Commit

Permalink
add amd gpu test
Browse files Browse the repository at this point in the history
  • Loading branch information
r2k1 committed Feb 17, 2025
1 parent 2f2391e commit 1cbeee1
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 19 deletions.
40 changes: 21 additions & 19 deletions e2e/config/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,25 +290,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin
}

// CreateVMManagedIdentity creates or updates the user-assigned identity named
// VMIdentityName in ResourceGroupName (located in Config.Location), then calls
// createBlobStorageAccount, createBlobStorageContainer and
// assignRolesToVMIdentity for the identity's principal ID.
//
// It returns the identity's client ID, or an empty string and the first error
// encountered. Only the identity-creation error is wrapped with extra context;
// the other errors are returned as-is.
func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
Location: to.Ptr(Config.Location),
}, nil)
if err != nil {
return "", fmt.Errorf("create managed identity: %w", err)
}
err = a.createBlobStorageAccount(ctx)
if err != nil {
return "", err
}
err = a.createBlobStorageContainer(ctx)
if err != nil {
return "", err
}

// NOTE(review): identity.Properties and its PrincipalID/ClientID fields are
// used without nil checks — confirm the SDK guarantees they are set on success.
if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
return "", err
}
return *identity.Properties.ClientID, nil
// NOTE(review): everything below is unreachable as written. This span looks
// like both sides of a diff (the original body above, followed by its
// disabled replacement) — confirm which version is intended.
// HACK: temporarily disabled to allow running the test in a different
// subscription that does not have enough permissions.
return "", nil
// identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
// Location: to.Ptr(Config.Location),
// }, nil)
// if err != nil {
// return "", fmt.Errorf("create managed identity: %w", err)
// }
// err = a.createBlobStorageAccount(ctx)
// if err != nil {
// return "", err
// }
// err = a.createBlobStorageContainer(ctx)
// if err != nil {
// return "", err
// }

// if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
// return "", err
// }
// return *identity.Properties.ClientID, nil
}

func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error {
Expand Down
49 changes: 49 additions & 0 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -637,3 +637,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet {
},
}
}

// podEnableAMDGPUResource builds the pod spec that runs the
// rocm/k8s-device-plugin image on the scenario's node.
//
// The pod is named "<node name>-amdgpu-device-plugin", placed in
// defaultNamespace, pinned to the node through a kubernetes.io/hostname node
// selector, and given the "system-node-critical" priority class. The kubelet
// device-plugin directory and /sys are mounted from the host at the same
// paths inside the container.
func podEnableAMDGPUResource(s *Scenario) *corev1.Pod {
	const (
		devicePluginVolume = "device-plugin"
		devicePluginPath   = "/var/lib/kubelet/device-plugins"
		sysVolume          = "sys"
		sysPath            = "/sys"
	)

	// hostPathVolume describes a volume backed by a directory on the host.
	hostPathVolume := func(name, path string) corev1.Volume {
		return corev1.Volume{
			Name: name,
			VolumeSource: corev1.VolumeSource{
				HostPath: &corev1.HostPathVolumeSource{Path: path},
			},
		}
	}

	nodeName := s.Runtime.KubeNodeName

	plugin := corev1.Container{
		Name:  "amdgpu-device-plugin-container",
		Image: "rocm/k8s-device-plugin",
		VolumeMounts: []corev1.VolumeMount{
			{Name: devicePluginVolume, MountPath: devicePluginPath},
			{Name: sysVolume, MountPath: sysPath},
		},
	}

	pod := &corev1.Pod{}
	pod.ObjectMeta = metav1.ObjectMeta{
		Name:      fmt.Sprintf("%s-amdgpu-device-plugin", nodeName),
		Namespace: defaultNamespace,
	}
	pod.Spec = corev1.PodSpec{
		PriorityClassName: "system-node-critical",
		NodeSelector:      map[string]string{"kubernetes.io/hostname": nodeName},
		Containers:        []corev1.Container{plugin},
		Volumes: []corev1.Volume{
			hostPathVolume(devicePluginVolume, devicePluginPath),
			hostPathVolume(sysVolume, sysPath),
		},
	}
	return pod
}
61 changes: 61 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1664,3 +1664,64 @@ func Test_Ubuntu2404ARM(t *testing.T) {
},
})
}

// Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300 runs a scenario on the
// Standard_ND96isr_MI300X_v5 SKU using the Ubuntu 22.04 Gen2 containerd VHD
// with AMD GPU support enabled, and validates the node with ValidateAMDGPU.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) {
	//t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet")

	// The same SKU name must be set on the bootstrap config and on the VMSS.
	const vmSize = "Standard_ND96isr_MI300X_v5"

	RunScenario(t, &Scenario{
		Description: "Tests that an AMD GPU-enabled node (MI300X) using an Ubuntu 2204 Gen2 VHD can be properly bootstrapped",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd, //TODO: add support for older
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				// NOTE(review): the Distro below is a Mariner value while the VHD is
				// Ubuntu 22.04 — left unchanged; confirm whether this is intentional.
				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = vmSize
				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
				nbc.AgentPoolProfile.VMSize = vmSize
				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
				nbc.EnableAMDGPU = true
				nbc.ConfigGPUDriverIfNeeded = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr(vmSize)
				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
			},
			Validator: ValidateAMDGPU,
		},
	})
}

// Test_Ubuntu2204Gen2Containerd_AMDGPU_V710 runs a scenario on the
// Standard_NV4ads_V710_v5 SKU using the Ubuntu 22.04 Gen2 containerd VHD
// with AMD GPU support enabled, and validates the node with ValidateAMDGPU.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) {
	// the SKU isn't available in the subscription/region we run tests in
	//t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet")
	// LOCATION=southcentralus

	// The same SKU name must be set on the bootstrap config and on the VMSS.
	const vmSize = "Standard_NV4ads_V710_v5"

	RunScenario(t, &Scenario{
		Description: "Tests that an AMD GPU-enabled node (V710) using an Ubuntu 2204 Gen2 VHD can be properly bootstrapped",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				// NOTE(review): the Distro below is a Mariner value while the VHD is
				// Ubuntu 22.04 — left unchanged; confirm whether this is intentional.
				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = vmSize
				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
				nbc.AgentPoolProfile.VMSize = vmSize
				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
				nbc.EnableAMDGPU = true
				nbc.ConfigGPUDriverIfNeeded = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr(vmSize)
				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
			},
			Validator: ValidateAMDGPU,
		},
	})
}
12 changes: 12 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str

return podExecResult.stdout.String()
}

// ValidateAMDGPU checks that the scenario's VM exposes an AMD GPU.
//
// It runs "lspci -k" on the VM, requires exit code "0", and asserts that the
// output mentions "amdgpu". It then ensures the pod built by
// podEnableAMDGPUResource and waits for the "amd.com/gpu" resource to become
// available.
func ValidateAMDGPU(ctx context.Context, s *Scenario) {
	s.T.Logf("validating pod using AMD GPU")

	lspci := execScriptOnVMForScenario(ctx, s, "lspci -k")
	require.Equal(s.T, "0", lspci.exitCode, "expected to find lspci command, but did not")

	pciListing := lspci.stdout.String()
	assert.Contains(s.T, pciListing, "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not")

	devicePluginPod := podEnableAMDGPUResource(s)
	ensurePod(ctx, s, devicePluginPod)
	waitUntilResourceAvailable(ctx, s, "amd.com/gpu")
	//ensureJob(ctx, s, jobAMDGPUWorkload(s))
}

0 comments on commit 1cbeee1

Please sign in to comment.