diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 8b5e0dbaf62..5fcc03e0416 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -83,6 +83,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(), "IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)), "GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)), + "AMD_GPU_NODE": fmt.Sprintf("%v", config.GetGpuConfig().GetEnableAmdGpu()), "SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())), "MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())), "CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()), diff --git a/e2e/aks_model.go b/e2e/aks_model.go index 0058b2d4c8a..b65f3719bf7 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -46,7 +46,7 @@ func getBaseClusterModel(clusterName string) *armcontainerservice.ManagedCluster { Name: to.Ptr("nodepool1"), Count: to.Ptr[int32](1), - VMSize: to.Ptr("standard_d2ds_v5"), + VMSize: to.Ptr(config.Config.DefaultVMSKU), MaxPods: to.Ptr[int32](110), OSType: to.Ptr(armcontainerservice.OSTypeLinux), Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), diff --git a/e2e/cluster.go b/e2e/cluster.go index 2a88b285a5d..ccd00fa46d6 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -173,6 +173,8 @@ func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerserv return nil, fmt.Errorf("get host network debug pod: %w", err) } + t.Logf("cluster %q is ready", *cluster.Name) + return &Cluster{ Model: cluster, Kube: kube, diff --git a/e2e/config/azure.go b/e2e/config/azure.go index 276dc7176bf..2c7a0f948f8 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin } func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) { - identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ - Location: to.Ptr(Config.Location), - }, nil) - if err != nil { - return "", fmt.Errorf("create managed identity: %w", err) - } - err = a.createBlobStorageAccount(ctx) - if err != nil { - return "", err - } - err = a.createBlobStorageContainer(ctx) - if err != nil { - return "", err - } - - if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { - return "", err - } - return *identity.Properties.ClientID, nil + // HACK: temporary disable to allow running test in different subscription, without enough permissions + return "", nil + // identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ + // Location: to.Ptr(Config.Location), + // }, nil) + // if err != nil { + // return "", fmt.Errorf("create managed identity: %w", err) + // } + // err = a.createBlobStorageAccount(ctx) + // if err != nil { + // return "", err + // } + // err = a.createBlobStorageContainer(ctx) + // if err != nil { + // return "", err + // } + + // if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { + // return "", err + // } + // return *identity.Properties.ClientID, nil } func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { @@ -365,7 +367,7 @@ func (a *AzureClient) assignRolesToVMIdentity(ctx context.Context, principalID * return nil } -func 
(a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Image, tagName, tagValue string) (VHDResourceID, error) { +func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, t *testing.T, image *Image, tagName, tagValue string) (VHDResourceID, error) { galleryImageVersion, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) if err != nil { return "", fmt.Errorf("create a new images client: %v", err) @@ -407,6 +409,8 @@ func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Ima return "", fmt.Errorf("ensuring image replication: %w", err) } + t.Logf("found the latest image version for %s, %s", image.Name, *latestVersion.Name) + return VHDResourceID(*latestVersion.ID), nil } diff --git a/e2e/config/vhd.go b/e2e/config/vhd.go index 52ab5f0eea4..b7e2a4c632a 100644 --- a/e2e/config/vhd.go +++ b/e2e/config/vhd.go @@ -208,18 +208,20 @@ func (i *Image) String() string { func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, error) { i.vhdOnce.Do(func() { + t.Logf("finding the latest image version for %s, %s", i.Name, i.Version) switch { case i.Latest: - i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, "", "") + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, "", "") case i.Version != "": i.vhd, i.vhdErr = Azure.EnsureSIGImageVersion(ctx, i) default: - i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) } if i.vhdErr != nil { i.vhdErr = fmt.Errorf("img: %s, tag %s=%s, err %w", i.Name, Config.SIGVersionTagName, Config.SIGVersionTagValue, i.vhdErr) t.Logf("failed to find the latest image version for %s", i.vhdErr) } + t.Logf("found the latest image version for %s, %s", i.Name, i.vhd) }) return i.vhd, i.vhdErr } diff --git a/e2e/exec.go b/e2e/exec.go index 6b2ad8985c2..dcd03a6032b 100644 --- a/e2e/exec.go +++ b/e2e/exec.go @@ -8,6 +8,7 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/google/uuid" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/remotecommand" @@ -54,7 +55,7 @@ type Script struct { interpreter Interpreter } -func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey string, script Script) (*podExecResult, error) { +func execScriptOnVm(ctx context.Context, s *Scenario, script Script) (*podExecResult, error) { /* This works in a way that doesn't rely on the node having joined the cluster: * We create a linux pod on a different node. 
@@ -77,13 +78,11 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam } steps := []string{ - fmt.Sprintf("echo '%[1]s' > %[2]s", sshPrivateKey, sshKeyName(vmPrivateIP)), "set -x", fmt.Sprintf("echo %[1]s > %[2]s", quoteForBash(script.script), scriptFileName), - fmt.Sprintf("chmod 0600 %s", sshKeyName(vmPrivateIP)), fmt.Sprintf("chmod 0755 %s", scriptFileName), - fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(vmPrivateIP), vmPrivateIP, scriptFileName, remoteScriptFileName), - fmt.Sprintf("%s %s %s", sshString(vmPrivateIP), interpreter, remoteScriptFileName), + fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(s.Runtime.VMPrivateIP), s.Runtime.VMPrivateIP, scriptFileName, remoteScriptFileName), + fmt.Sprintf("%s %s %s", sshString(s.Runtime.VMPrivateIP), interpreter, remoteScriptFileName), } joinedSteps := strings.Join(steps, " && ") @@ -91,7 +90,7 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam s.T.Logf("Executing script %[1]s using %[2]s:\n---START-SCRIPT---\n%[3]s\n---END-SCRIPT---\n", scriptFileName, interpreter, script.script) kube := s.Runtime.Cluster.Kube - execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, jumpboxPodName, joinedSteps) + execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, joinedSteps) if err != nil { return nil, fmt.Errorf("error executing command on pod: %w", err) } @@ -172,6 +171,13 @@ func unprivilegedCommandArray() []string { } } +func uploadSSHKey(ctx context.Context, s *Scenario) { + cmd := fmt.Sprintf("echo '%[1]s' > %[2]s && chmod 0600 %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP)) + kube := s.Runtime.Cluster.Kube + _, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, cmd) + require.NoError(s.T, err, "error uploading ssh key to pod") +} + func logSSHInstructions(s *Scenario) { result := "SSH Instructions:" if !config.Config.KeepVMSS { diff --git a/e2e/kube.go b/e2e/kube.go index fa6536ffe78..2a9a345dd91 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -171,8 +171,6 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t *testing.T, vmssN return node.Name } } - - t.Logf("node %s is not ready. 
Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) } if node == nil { @@ -454,13 +452,7 @@ func getClusterSubnetID(ctx context.Context, mcResourceGroupName string, t *test } func podHTTPServerLinux(s *Scenario) *corev1.Pod { - image := "mcr.microsoft.com/cbl-mariner/busybox:2.0" - secretName := "" - if s.Tags.Airgap { - image = fmt.Sprintf("%s.azurecr.io/cbl-mariner/busybox:2.0", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) - secretName = config.Config.ACRSecretName - } - return &corev1.Pod{ + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("%s-test-pod", s.Runtime.KubeNodeName), Namespace: "default", @@ -469,7 +461,7 @@ func podHTTPServerLinux(s *Scenario) *corev1.Pod { Containers: []corev1.Container{ { Name: "mariner", - Image: image, + Image: "mcr.microsoft.com/cbl-mariner/busybox:2.0", Ports: []corev1.ContainerPort{ { ContainerPort: 80, @@ -501,13 +493,13 @@ func podHTTPServerLinux(s *Scenario) *corev1.Pod { NodeSelector: map[string]string{ "kubernetes.io/hostname": s.Runtime.KubeNodeName, }, - ImagePullSecrets: []corev1.LocalObjectReference{ - { - Name: secretName, - }, - }, }, } + if s.Tags.Airgap { + pod.Spec.Containers[0].Image = fmt.Sprintf("%s.azurecr.io/cbl-mariner/busybox:2.0", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) + pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: config.Config.ACRSecretName}} + } + return pod } func podHTTPServerWindows(s *Scenario) *corev1.Pod { @@ -667,3 +659,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet { }, } } + +func podEnableAMDGPUResource(s *Scenario) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName), + Namespace: defaultNamespace, + }, + Spec: corev1.PodSpec{ + PriorityClassName: "system-node-critical", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": s.Runtime.KubeNodeName, + }, + Containers: []corev1.Container{ + { + Name: "amdgpu-device-plugin-container", + Image: "rocm/k8s-device-plugin", + VolumeMounts: []corev1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + { + Name: "sys", + MountPath: "/sys", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "device-plugin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + { + Name: "sys", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys", + }, + }, + }, + }, + }, + } +} diff --git a/e2e/node_config.go b/e2e/node_config.go index 32b95655f46..01b12d9b7d7 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -60,7 +60,7 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod Version: "v0", DisableCustomData: false, LinuxAdminUsername: "azureuser", - VmSize: "Standard_D2ds_v5", + VmSize: config.Config.DefaultVMSKU, ClusterConfig: &aksnodeconfigv1.ClusterConfig{ Location: nbc.ContainerService.Location, ResourceGroup: nbc.ResourceGroupName, @@ -347,7 +347,7 @@ func baseTemplateLinux(t *testing.T, location string, k8sVersion string, arch st }, AgentPoolProfile: &datamodel.AgentPoolProfile{ Name: "nodepool2", - VMSize: "Standard_D2ds_v5", + VMSize: config.Config.DefaultVMSKU, KubeletDiskType: "", WorkloadRuntime: "", DNSPrefix: "", diff --git a/e2e/scenario_helpers_test.go b/e2e/scenario_helpers_test.go index bf777a53036..acdc6876851 100644 --- a/e2e/scenario_helpers_test.go +++ 
b/e2e/scenario_helpers_test.go @@ -98,8 +98,8 @@ func RunScenario(t *testing.T, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutVMSS) defer cancel() prepareAKSNode(ctx, s) - t.Logf("Choosing the private ACR %q for the vm validation", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) + validateVM(ctx, s) } @@ -146,10 +146,6 @@ func prepareAKSNode(ctx context.Context, s *Scenario) { s.T.Logf("vmss %s creation succeeded", s.Runtime.VMSSName) s.Runtime.KubeNodeName = s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName) - s.T.Logf("node %s is ready", s.Runtime.VMSSName) - - s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err, "failed to get VM private IP address") } func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { @@ -177,7 +173,7 @@ func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { } } - vhd, err := s.VHD.VHDResourceID(ctx, t) + _, err := s.VHD.VHDResourceID(ctx, t) if err != nil { if config.Config.IgnoreScenariosWithMissingVHD && errors.Is(err, config.ErrNotFound) { t.Skipf("skipping scenario %q: could not find image", t.Name()) } t.Fatalf("could not find image for %q: %s", t.Name(), err) } } - t.Logf("VHD: %q, TAGS %+v", vhd, s.Tags) } func validateVM(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 167c44394fa..86f0cd76071 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1661,3 +1661,98 @@ func Test_Ubuntu2404ARM(t *testing.T) { }, }) } + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { + t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet") + //E2E_LOCATION=eastus2euap + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that an AMD GPU-enabled node using the Ubuntu 2204 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5" + nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5") + // rocm images are huge, leave some space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) { + // the SKU isn't available in the subscription/region we run tests in + t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + //E2E_LOCATION=southcentralus + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that an AMD GPU-enabled node using the Ubuntu 2204 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5" + nbc.AgentPoolProfile.VMSize = 
"Standard_NV4ads_V710_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") + // rocm images are huge, need space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} + +func Test_Ubuntu2404Gen2_AMDGPU_V710(t *testing.T) { + // the SKU isn't available in subscriptrion/region we run tests + // TODO: enable once the SKU is available + t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + //E2E_LOCATION=southcentralus + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5" + nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") + // rocm images are huge, need space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} diff --git a/e2e/validation.go b/e2e/validation.go index 25d595845be..3b3287f9847 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -38,11 +38,6 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { stdout := execResult.stdout.String() require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s") - // the instructions belows expects the SSH key to be uploaded to the user pool VM. - // which happens as a side-effect of execCommandOnVMForScenario, it's ugly but works. - // maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs. 
- logSSHInstructions(s) - ValidateSysctlConfig(ctx, s, map[string]string{ "net.ipv4.tcp_retries2": "8", "net.core.message_burst": "80", diff --git a/e2e/validators.go b/e2e/validators.go index dcc5dd63efe..193ef06f989 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -174,7 +174,7 @@ func execScriptOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *po script.interpreter = Bash } - result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.Cluster.DebugPod.Name, string(s.Runtime.SSHKeyPrivate), script) + result, err := execScriptOnVm(ctx, s, script) require.NoError(s.T, err, "failed to execute command on VM") return result } @@ -334,7 +334,7 @@ func waitUntilResourceAvailable(ctx context.Context, s *Scenario, resourceName s nodeName := s.Runtime.KubeNodeName ticker := time.NewTicker(time.Second) defer ticker.Stop() - + s.T.Logf("waiting for resource %q to be available on node %q", resourceName, nodeName) for { select { case <-ctx.Done(): @@ -442,3 +442,16 @@ func ValidateTaints(ctx context.Context, s *Scenario, expectedTaints string) { } require.Equal(s.T, expectedTaints, actualTaints, "expected node %q to have taint %q, but got %q", s.Runtime.KubeNodeName, expectedTaints, actualTaints) } + +func ValidateAMDGPU(ctx context.Context, s *Scenario) { + s.T.Logf("validating pod using AMD GPU") + + execResult := execScriptOnVMForScenario(ctx, s, "lspci -k") + require.Equal(s.T, "0", execResult.exitCode, "expected to find lspci command, but did not") + assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not") + + ensurePod(ctx, s, podEnableAMDGPUResource(s)) + s.T.Logf("waiting for AMD GPU to be available") + waitUntilResourceAvailable(ctx, s, "amd.com/gpu") + //ensureJob(ctx, s, jobAMDGPUWorkload(s)) +} diff --git a/e2e/vmss.go b/e2e/vmss.go index 31d6fe17c7a..5a1f4324537 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "strings" + "sync" "testing" "time" @@ -25,6 +26,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/crypto/ssh" ) @@ -35,8 +37,8 @@ const ( ) func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScaleSet { + cluster := s.Runtime.Cluster - s.T.Logf("creating VMSS %q in resource group %q", s.Runtime.VMSSName, *cluster.Model.Properties.NodeResourceGroup) var nodeBootstrapping *datamodel.NodeBootstrapping ab, err := agent.NewAgentBaker() require.NoError(s.T, err) @@ -73,11 +75,15 @@ func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScal } s.PrepareVMSSModel(ctx, s.T, &model) - vmss, err := config.Azure.CreateVMSSWithRetry(ctx, s.T, *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, model) s.T.Cleanup(func() { cleanupVMSS(ctx, s) }) + var ipErr error + s.Runtime.VMPrivateIP, ipErr = getVMPrivateIPAddress(ctx, s) + assert.NoError(s.T, ipErr, "failed to get VM private IP address") + uploadSSHKey(ctx, s) + logSSHInstructions(s) skipTestIfSKUNotAvailableErr(s.T, err) // fail test, but continue to extract debug information require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T)) @@ -112,9 +118,6 @@ func extractLogsFromVM(ctx context.Context, s *Scenario) { } func extractLogsFromVMLinux(ctx context.Context, 
s *Scenario) { - privateIP, err := getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err) - commandList := map[string]string{ "cluster-provision.log": "sudo cat /var/log/azure/cluster-provision.log", "kubelet.log": "sudo journalctl -u kubelet", @@ -123,30 +126,34 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario) { "aks-node-controller.log": "sudo cat /var/log/azure/aks-node-controller.log", } - pod, err := s.Runtime.Cluster.Kube.GetHostNetworkDebugPod(ctx, s.T) - if err != nil { - require.NoError(s.T, err) - } - var logFiles = map[string]string{} + wg := sync.WaitGroup{} + lock := sync.Mutex{} for file, sourceCmd := range commandList { - execResult, err := execBashCommandOnVM(ctx, s, privateIP, pod.Name, string(s.Runtime.SSHKeyPrivate), sourceCmd) - if err != nil { - s.T.Logf("error executing %s: %s", sourceCmd, err) - continue - } - logFiles[file] = execResult.String() + wg.Add(1) + go func(file, sourceCmd string) { + defer wg.Done() + execResult, err := execBashCommandOnVM(ctx, s, sourceCmd) + if err != nil { + s.T.Logf("error executing %s: %s", sourceCmd, err) + return + } + lock.Lock() + logFiles[file] = execResult.String() + lock.Unlock() + }(file, sourceCmd) } - err = dumpFileMapToDir(s.T, logFiles) + wg.Wait() + err := dumpFileMapToDir(s.T, logFiles) require.NoError(s.T, err) } -func execBashCommandOnVM(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey, command string) (*podExecResult, error) { +func execBashCommandOnVM(ctx context.Context, s *Scenario, command string) (*podExecResult, error) { script := Script{ interpreter: Bash, script: command, } - return execScriptOnVm(ctx, s, vmPrivateIP, jumpboxPodName, sshPrivateKey, script) + return execScriptOnVm(ctx, s, script) } const uploadLogsPowershellScript = ` @@ -241,7 +248,6 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { return } s.T.Logf("run command executed successfully: %v", runCommandResp) - s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { @@ -427,7 +433,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual model := armcompute.VirtualMachineScaleSet{ Location: to.Ptr(config.Config.Location), SKU: &armcompute.SKU{ - Name: to.Ptr("Standard_D2ds_v5"), + Name: to.Ptr(config.Config.DefaultVMSKU), Capacity: to.Ptr[int64](1), }, Properties: &armcompute.VirtualMachineScaleSetProperties{ @@ -506,7 +512,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual Properties: &armcompute.VirtualMachineScaleSetExtensionProperties{ Publisher: to.Ptr("Microsoft.Azure.Extensions"), Type: to.Ptr("CustomScript"), - TypeHandlerVersion: to.Ptr("2.0"), + TypeHandlerVersion: to.Ptr("2.1"), AutoUpgradeMinorVersion: to.Ptr(true), Settings: map[string]interface{}{}, ProtectedSettings: map[string]interface{}{ diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 485ee8572b1..82f7a94e71c 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -60,6 +60,7 @@ USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}} API_SERVER_NAME={{GetKubernetesEndpoint}} IS_VHD={{GetVariable "isVHD"}} GPU_NODE={{GetVariable "gpuNode"}} +AMD_GPU_NODE={{GetVariable "amdGpuNode"}} SGX_NODE={{GetVariable "sgxNode"}} MIG_NODE={{GetVariable "migNode"}} CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh 
b/parts/linux/cloud-init/artifacts/cse_config.sh index 8195ef38dac..7ee66858b5e 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -844,6 +844,29 @@ ensureGPUDrivers() { fi } +ensureAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + ensureAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. skipping ensureAMDGPUDrivers" + return + fi +} + +ensureAMDGPUDriversUbuntu() { + # the amdgpu module ends up in this blacklist (reason unknown) and won't be loaded unless the entry is removed + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf + # Note: the next command may crash non-AMD-GPU machines, + # caused by the installation of the amdgpu-dkms module inside the installer. + # Takes about 6 minutes to finish. + sudo /root/rocm-offline-install.run + echo "AMD GPU drivers installed" +} + +cleanAMDGPUDrivers() { + sudo rm -f /root/rocm-offline-install.run +} + disableSSH() { systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH } diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index 61452347195..1ff5333075e 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -523,7 +523,7 @@ logs_to_events() { fi } -should_skip_nvidia_drivers() { +should_skip_gpu_drivers() { set -x body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") ret=$? diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index ee3761cb8ce..0657d721441 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -100,15 +100,15 @@ if [[ -n ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then logs_to_events "AKS.CSE.orasLogin.oras_login_with_kubelet_identity" oras_login_with_kubelet_identity "${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER%/}" $USER_ASSIGNED_IDENTITY_ID $TENANT_ID || exit $? fi -export -f should_skip_nvidia_drivers -skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers) +export -f should_skip_gpu_drivers +skip_gpu_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_gpu_drivers) ret=$?
if [[ "$ret" != "0" ]]; then echo "Failed to determine if nvidia driver install should be skipped" exit $ERR_NVIDIA_DRIVER_INSTALL fi -if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then +if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_gpu_driver_install}" == "true" ]]; then logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers fi @@ -160,7 +160,7 @@ fi REBOOTREQUIRED=false echo $(date),$(hostname), "Start configuring GPU drivers" -if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then +if [[ "${GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then @@ -209,6 +209,11 @@ EOF fi fi +if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then + logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers +fi +cleanAMDGPUDrivers + echo $(date),$(hostname), "End configuring GPU drivers" if [ "${NEEDS_DOCKER_LOGIN}" == "true" ]; then diff --git a/pkg/agent/variables.go b/pkg/agent/variables.go index 80eedb389f5..f08a09922b6 100644 --- a/pkg/agent/variables.go +++ b/pkg/agent/variables.go @@ -113,6 +113,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa "userAssignedIdentityID": config.UserAssignedIdentityClientID, "isVHD": isVHD(profile), "gpuNode": strconv.FormatBool(config.EnableNvidia), + "amdGpuNode": strconv.FormatBool(config.EnableAMDGPU), "sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)), "configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded, "enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded, diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index a0252898781..055c6e2afe2 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -619,3 +619,63 @@ rm -f ./azcopy # cleanup immediately after usage will return in two downloads echo "install-dependencies step completed successfully" capture_benchmark "${SCRIPT_NAME}_overall" true process_benchmarks + + +downloadAMDGPUDriversUbuntu() { + echo "Downloading AMD GPU drivers for Ubuntu ${UBUNTU_RELEASE}" + # Determine the appropriate Ubuntu release + if [ "${UBUNTU_RELEASE}" == "22.04" ]; then + wget https://repo.radeon.com/rocm/installer/rocm-linux-install-offline/rocm-rel-6.3.3/ubuntu/22.04/rocm-offline-creator_1.0.7.60303-1~22.04.run -O rocm-offline-creator.run + elif [ "${UBUNTU_RELEASE}" == "24.04" ]; then + wget https://repo.radeon.com/rocm/installer/rocm-linux-install-offline/rocm-rel-6.3.3/ubuntu/24.04/rocm-offline-creator_1.0.7.60303-1~24.04.run -O rocm-offline-creator.run + else + echo "Skipping AMD GPU driver setup: Unsupported Ubuntu release (${UBUNTU_RELEASE})" + return 1 + fi + cat << EOL | sudo tee /etc/amdgpu.config +# Creator/Build Options +############################### +INSTALL_PACKAGE_TYPE=0 +INSTALL_PACKAGE_NAME="rocm-offline-install.run" +INSTALL_PACKAGE_DIR=/root + +INSTALL_PACKAGE_REPO=0 + +DOWNLOAD_PKG_CONFIG_NUM=0 + +# ROCm Options +############################### +ROCM_USECASES=dkms +ROCM_VERSIONS=6.3.3 + +# Driver/amdgpu Options +############################### +AMDGPU_INSTALL_DRIVER=yes +AMDGPU_POST_INSTALL_BLACKLIST=no +AMDGPU_POST_INSTALL_START=yes + +# Post-Install Options 
+############################### +AMDGPU_POST_GPU_ACCESS_CURRENT_USER=no +AMDGPU_POST_GPU_ACCESS_ALL_USERS=no + +# Extra Package Options +############################### +EXTRA_PACKAGES_ONLY=no +EXTRA_PACKAGES="" +EOL + # This takes time and could potentially be built and cached outside of the VHD build to improve performance + # The output file /root/rocm-offline-install.run is about 200 MB + sudo bash ./rocm-offline-creator.run config=/etc/amdgpu.config +} + +downloadAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + downloadAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. skipping downloadAMDGPUDrivers" + return + fi +} + +downloadAMDGPUDrivers \ No newline at end of file diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 9189a712367..95a0823c582 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -144,4 +144,4 @@ fi capture_benchmark "${SCRIPT_NAME}_handle_azureLinux_and_cgroupV2" echo "pre-install-dependencies step finished successfully" capture_benchmark "${SCRIPT_NAME}_overall" true -process_benchmarks \ No newline at end of file +process_benchmarks
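
Reviewer note: ValidateAMDGPU in e2e/validators.go leaves ensureJob(ctx, s, jobAMDGPUWorkload(s)) commented out as a TODO. A minimal, hypothetical sketch of what such a helper could look like is below, following the builder style of podEnableAMDGPUResource in e2e/kube.go; the rocm/rocm-terminal image and the rocminfo command are illustrative placeholders, not part of this change. The essential piece is the amd.com/gpu resource limit, which only schedules once the device plugin has advertised the resource.

package e2e

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// jobAMDGPUWorkload is a sketch of a workload that would only succeed if the
// AMD device plugin exposes at least one amd.com/gpu on the scenario's node.
func jobAMDGPUWorkload(s *Scenario) *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-amdgpu-workload", s.Runtime.KubeNodeName),
			Namespace: defaultNamespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": s.Runtime.KubeNodeName,
					},
					Containers: []corev1.Container{
						{
							Name:    "amdgpu-workload",
							Image:   "rocm/rocm-terminal",       // placeholder image, not pinned by this PR
							Command: []string{"rocminfo"},       // exits non-zero if no GPU is visible
							Resources: corev1.ResourceRequirements{
								Limits: corev1.ResourceList{
									// forces scheduling onto a node with an advertised AMD GPU
									"amd.com/gpu": resource.MustParse("1"),
								},
							},
						},
					},
				},
			},
		},
	}
}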