Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AMD GPU support #5858

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions aks-node-controller/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
"API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(),
"IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)),
"GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)),
"AMD_GPU_NODE": fmt.Sprintf("%v", config.GetGpuConfig().GetEnableAmdGpu()),
"SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())),
"MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())),
"CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()),
Expand Down
40 changes: 21 additions & 19 deletions e2e/config/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin
}

// CreateVMManagedIdentity creates (or updates) the user-assigned managed identity used
// by test VMs, provisions the blob storage account/container, assigns roles, and
// returns the identity's client ID.
// NOTE(review): as of this PR the body is short-circuited (see HACK below) and the
// function ALWAYS returns ("", nil) — callers receive an empty identity client ID.
func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
// NOTE(review): diff artifact — the lines from here to the blank line are the
// implementation this PR removes; the commented-out copy below is its replacement.
identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
Location: to.Ptr(Config.Location),
}, nil)
if err != nil {
return "", fmt.Errorf("create managed identity: %w", err)
}
err = a.createBlobStorageAccount(ctx)
if err != nil {
return "", err
}
err = a.createBlobStorageContainer(ctx)
if err != nil {
return "", err
}

if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
return "", err
}
return *identity.Properties.ClientID, nil
// HACK: temporary disable to allow running test in different subscription, without enough permissions
// NOTE(review): anything downstream that consumes the returned client ID (e.g. blob
// access through this identity) will silently run without permissions. Revert this
// before merging, or gate the shortcut behind an explicit config flag.
return "", nil
// identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
// Location: to.Ptr(Config.Location),
// }, nil)
// if err != nil {
// return "", fmt.Errorf("create managed identity: %w", err)
// }
// err = a.createBlobStorageAccount(ctx)
// if err != nil {
// return "", err
// }
// err = a.createBlobStorageContainer(ctx)
// if err != nil {
// return "", err
// }

// if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
// return "", err
// }
// return *identity.Properties.ClientID, nil
}

func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error {
Expand Down
49 changes: 49 additions & 0 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -637,3 +637,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet {
},
}
}

// podEnableAMDGPUResource builds the AMD (ROCm) device-plugin pod that is pinned to the
// scenario's node so the kubelet starts advertising the amd.com/gpu extended resource.
// The plugin needs the kubelet device-plugin socket directory and /sys from the host.
func podEnableAMDGPUResource(s *Scenario) *corev1.Pod {
	// hostPath builds a host-path volume source for the given node directory.
	hostPath := func(path string) corev1.VolumeSource {
		return corev1.VolumeSource{
			HostPath: &corev1.HostPathVolumeSource{Path: path},
		}
	}

	mounts := []corev1.VolumeMount{
		{Name: "device-plugin", MountPath: "/var/lib/kubelet/device-plugins"},
		{Name: "sys", MountPath: "/sys"},
	}
	volumes := []corev1.Volume{
		{Name: "device-plugin", VolumeSource: hostPath("/var/lib/kubelet/device-plugins")},
		{Name: "sys", VolumeSource: hostPath("/sys")},
	}

	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName),
			Namespace: defaultNamespace,
		},
		Spec: corev1.PodSpec{
			// system-node-critical keeps the plugin from being evicted under pressure.
			PriorityClassName: "system-node-critical",
			// Pin the pod to the node under test.
			NodeSelector: map[string]string{
				"kubernetes.io/hostname": s.Runtime.KubeNodeName,
			},
			Containers: []corev1.Container{
				{
					Name:         "amdgpu-device-plugin-container",
					Image:        "rocm/k8s-device-plugin",
					VolumeMounts: mounts,
				},
			},
			Volumes: volumes,
		},
	}
}
61 changes: 61 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1664,3 +1664,64 @@ func Test_Ubuntu2404ARM(t *testing.T) {
},
})
}

// Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300 bootstraps a node on the
// Standard_ND96isr_MI300X_v5 (AMD MI300X) SKU with AMD GPU support enabled and
// validates that the amdgpu driver is active and amd.com/gpu becomes schedulable.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) {
	//t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet")
	RunScenario(t, &Scenario{
		// Fixed: the description previously claimed a MarinerV2 VHD, but the scenario
		// provisions config.VHDUbuntu2204Gen2Containerd below.
		Description: "Tests that an AMD GPU-enabled node using an Ubuntu 2204 VHD can be properly bootstrapped",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd, //TODO: add support for older
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5"
				// NOTE(review): distro is Mariner while the VHD above is Ubuntu 22.04 —
				// confirm this mismatch is intentional before merging.
				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
				nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5"
				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
				nbc.EnableAMDGPU = true
				nbc.ConfigGPUDriverIfNeeded = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5")
				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
			},
			Validator: func(ctx context.Context, s *Scenario) {
				ValidateAMDGPU(ctx, s)
			},
		},
	})
}

// Test_Ubuntu2204Gen2Containerd_AMDGPU_V710 bootstraps a node on the
// Standard_NV4ads_V710_v5 (AMD V710) SKU with AMD GPU support enabled and validates
// that the amdgpu driver is active and amd.com/gpu becomes schedulable.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) {
	// the SKU isn't available in the subscription/region we run tests
	//t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet")
	// LOCATION=southcentralus
	RunScenario(t, &Scenario{
		// Fixed: the description previously claimed a MarinerV2 VHD, but the scenario
		// provisions config.VHDUbuntu2204Gen2Containerd below.
		Description: "Tests that an AMD GPU-enabled node using an Ubuntu 2204 VHD can be properly bootstrapped",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5"
				// NOTE(review): distro is Mariner while the VHD above is Ubuntu 22.04 —
				// confirm this mismatch is intentional before merging.
				nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
				nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5"
				nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
				nbc.EnableAMDGPU = true
				nbc.ConfigGPUDriverIfNeeded = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5")
				vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
			},
			Validator: func(ctx context.Context, s *Scenario) {
				ValidateAMDGPU(ctx, s)
			},
		},
	})
}
12 changes: 12 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str

return podExecResult.stdout.String()
}

// ValidateAMDGPU verifies AMD GPU enablement on the scenario node: the amdgpu kernel
// module must be managing a PCI device, and deploying the device plugin must make the
// amd.com/gpu extended resource available for scheduling.
func ValidateAMDGPU(ctx context.Context, s *Scenario) {
	s.T.Logf("validating pod using AMD GPU")

	// Driver check first: without amdgpu bound to a device, the plugin can't expose GPUs.
	result := execScriptOnVMForScenario(ctx, s, "lspci -k")
	require.Equal(s.T, "0", result.exitCode, "expected to find lspci command, but did not")
	pciOutput := result.stdout.String()
	assert.Contains(s.T, pciOutput, "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not")

	// Deploy the ROCm device plugin and wait for the node to advertise the resource.
	ensurePod(ctx, s, podEnableAMDGPUResource(s))
	waitUntilResourceAvailable(ctx, s, "amd.com/gpu")
	//ensureJob(ctx, s, jobAMDGPUWorkload(s))
}
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}}
API_SERVER_NAME={{GetKubernetesEndpoint}}
IS_VHD={{GetVariable "isVHD"}}
GPU_NODE={{GetVariable "gpuNode"}}
AMD_GPU_NODE={{GetVariable "amdGpuNode"}}
SGX_NODE={{GetVariable "sgxNode"}}
MIG_NODE={{GetVariable "migNode"}}
CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
Expand Down
18 changes: 18 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,24 @@ ensureGPUDrivers() {
fi
}

# TODO: this is a temporary ubuntu-only HACK until we get a driver
# Installs the AMD amdgpu DKMS driver from repo.radeon.com and removes the module
# blacklist so the kernel can load it. Sets REBOOTREQUIRED=true so the node reboots
# into the new driver. Fails fast: every step is checked so we never report success
# (or request a reboot) after a half-finished install.
ensureAMDGPUDrivers() {
    echo "Installing AMD GPU drivers"

    # delete amdgpu module from blacklist
    sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf || { echo "failed to remove amdgpu from module blacklist"; return 1; }

    # temporary solution, until the driver is available in MCR
    sudo apt-get update || { echo "apt-get update failed"; return 1; }
    wget https://repo.radeon.com/amdgpu-install/6.3.1/ubuntu/jammy/amdgpu-install_6.3.60301-1_all.deb || { echo "failed to download amdgpu-install package"; return 1; }
    sudo apt-get install -y ./amdgpu-install_6.3.60301-1_all.deb || { echo "failed to install amdgpu-install package"; return 1; }
    # amdgpu-install registers the radeon apt repository, so refresh indexes again
    sudo apt-get update || { echo "apt-get update failed after adding radeon repo"; return 1; }
    sudo apt-get install -y amdgpu-dkms || { echo "failed to install amdgpu-dkms"; return 1; }

    REBOOTREQUIRED=true
    echo "AMD GPU drivers installed"
}

disableSSH() {
systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH
}
Expand Down
2 changes: 1 addition & 1 deletion parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ logs_to_events() {
fi
}

should_skip_nvidia_drivers() {
should_skip_gpu_drivers() {
set -x
body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01")
ret=$?
Expand Down
13 changes: 9 additions & 4 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,15 @@ if [[ -n ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then
logs_to_events "AKS.CSE.orasLogin.oras_login_with_kubelet_identity" oras_login_with_kubelet_identity "${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER%/}" $USER_ASSIGNED_IDENTITY_ID $TENANT_ID || exit $?
fi

export -f should_skip_nvidia_drivers
skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers)
export -f should_skip_gpu_drivers
skip_gpu_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_gpu_drivers)
ret=$?
if [[ "$ret" != "0" ]]; then
echo "Failed to determine if nvidia driver install should be skipped"
exit $ERR_NVIDIA_DRIVER_INSTALL
fi

if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then
if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_gpu_driver_install}" == "true" ]]; then
logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers
fi

Expand Down Expand Up @@ -157,7 +157,7 @@ fi
REBOOTREQUIRED=false

echo $(date),$(hostname), "Start configuring GPU drivers"
if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then
if [[ "${GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then
logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers
if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then
if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then
Expand Down Expand Up @@ -206,6 +206,11 @@ EOF
fi
fi

if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then
logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers
fi


echo $(date),$(hostname), "End configuring GPU drivers"

if [ "${NEEDS_DOCKER_LOGIN}" == "true" ]; then
Expand Down
1 change: 1 addition & 0 deletions pkg/agent/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa
"userAssignedIdentityID": config.UserAssignedIdentityClientID,
"isVHD": isVHD(profile),
"gpuNode": strconv.FormatBool(config.EnableNvidia),
"amdGpuNode": strconv.FormatBool(config.EnableAMDGPU),
"sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)),
"configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded,
"enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded,
Expand Down
Loading