diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 8b5e0dbaf62..5fcc03e0416 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -83,6 +83,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(), "IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)), "GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)), + "AMD_GPU_NODE": fmt.Sprintf("%v", config.GetGpuConfig().GetEnableAmdGpu()), "SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())), "MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())), "CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()), diff --git a/e2e/aks_model.go b/e2e/aks_model.go index 0058b2d4c8a..b65f3719bf7 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -46,7 +46,7 @@ func getBaseClusterModel(clusterName string) *armcontainerservice.ManagedCluster { Name: to.Ptr("nodepool1"), Count: to.Ptr[int32](1), - VMSize: to.Ptr("standard_d2ds_v5"), + VMSize: to.Ptr(config.Config.DefaultVMSKU), MaxPods: to.Ptr[int32](110), OSType: to.Ptr(armcontainerservice.OSTypeLinux), Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), diff --git a/e2e/cluster.go b/e2e/cluster.go index 2a88b285a5d..ccd00fa46d6 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -173,6 +173,8 @@ func prepareCluster(ctx context.Context, t *testing.T, cluster *armcontainerserv return nil, fmt.Errorf("get host network debug pod: %w", err) } + t.Logf("cluster %q is ready", *cluster.Name) + return &Cluster{ Model: cluster, Kube: kube, diff --git a/e2e/config/azure.go b/e2e/config/azure.go index 276dc7176bf..2c7a0f948f8 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin } func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) { - identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ - Location: to.Ptr(Config.Location), - }, nil) - if err != nil { - return "", fmt.Errorf("create managed identity: %w", err) - } - err = a.createBlobStorageAccount(ctx) - if err != nil { - return "", err - } - err = a.createBlobStorageContainer(ctx) - if err != nil { - return "", err - } - - if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { - return "", err - } - return *identity.Properties.ClientID, nil + // HACK: temporary disable to allow running test in different subscription, without enough permissions + return "", nil + // identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{ + // Location: to.Ptr(Config.Location), + // }, nil) + // if err != nil { + // return "", fmt.Errorf("create managed identity: %w", err) + // } + // err = a.createBlobStorageAccount(ctx) + // if err != nil { + // return "", err + // } + // err = a.createBlobStorageContainer(ctx) + // if err != nil { + // return "", err + // } + + // if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil { + // return "", err + // } + // return *identity.Properties.ClientID, nil } func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { @@ -365,7 +367,7 @@ func (a *AzureClient) assignRolesToVMIdentity(ctx context.Context, principalID * return nil } -func 
(a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Image, tagName, tagValue string) (VHDResourceID, error) { +func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, t *testing.T, image *Image, tagName, tagValue string) (VHDResourceID, error) { galleryImageVersion, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) if err != nil { return "", fmt.Errorf("create a new images client: %v", err) @@ -407,6 +409,8 @@ func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Ima return "", fmt.Errorf("ensuring image replication: %w", err) } + t.Logf("found the latest image version for %s, %s", image.Name, *latestVersion.Name) + return VHDResourceID(*latestVersion.ID), nil } diff --git a/e2e/config/vhd.go b/e2e/config/vhd.go index 52ab5f0eea4..b7e2a4c632a 100644 --- a/e2e/config/vhd.go +++ b/e2e/config/vhd.go @@ -208,18 +208,20 @@ func (i *Image) String() string { func (i *Image) VHDResourceID(ctx context.Context, t *testing.T) (VHDResourceID, error) { i.vhdOnce.Do(func() { + t.Logf("finding the latest image version for %s, %s", i.Name, i.Version) switch { case i.Latest: - i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, "", "") + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, "", "") case i.Version != "": i.vhd, i.vhdErr = Azure.EnsureSIGImageVersion(ctx, i) default: - i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) + i.vhd, i.vhdErr = Azure.LatestSIGImageVersionByTag(ctx, t, i, Config.SIGVersionTagName, Config.SIGVersionTagValue) } if i.vhdErr != nil { i.vhdErr = fmt.Errorf("img: %s, tag %s=%s, err %w", i.Name, Config.SIGVersionTagName, Config.SIGVersionTagValue, i.vhdErr) t.Logf("failed to find the latest image version for %s", i.vhdErr) } + t.Logf("found the latest image version for %s, %s", i.Name, i.vhd) }) return i.vhd, i.vhdErr } diff --git a/e2e/exec.go b/e2e/exec.go index 6b2ad8985c2..dcd03a6032b 100644 --- a/e2e/exec.go +++ b/e2e/exec.go @@ -8,6 +8,7 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/google/uuid" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/remotecommand" @@ -54,7 +55,7 @@ type Script struct { interpreter Interpreter } -func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey string, script Script) (*podExecResult, error) { +func execScriptOnVm(ctx context.Context, s *Scenario, script Script) (*podExecResult, error) { /* This works in a way that doesn't rely on the node having joined the cluster: * We create a linux pod on a different node. 
@@ -77,13 +78,11 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam } steps := []string{ - fmt.Sprintf("echo '%[1]s' > %[2]s", sshPrivateKey, sshKeyName(vmPrivateIP)), "set -x", fmt.Sprintf("echo %[1]s > %[2]s", quoteForBash(script.script), scriptFileName), - fmt.Sprintf("chmod 0600 %s", sshKeyName(vmPrivateIP)), fmt.Sprintf("chmod 0755 %s", scriptFileName), - fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(vmPrivateIP), vmPrivateIP, scriptFileName, remoteScriptFileName), - fmt.Sprintf("%s %s %s", sshString(vmPrivateIP), interpreter, remoteScriptFileName), + fmt.Sprintf(`scp -i %[1]s -o PasswordAuthentication=no -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=5 %[3]s azureuser@%[2]s:%[4]s`, sshKeyName(s.Runtime.VMPrivateIP), s.Runtime.VMPrivateIP, scriptFileName, remoteScriptFileName), + fmt.Sprintf("%s %s %s", sshString(s.Runtime.VMPrivateIP), interpreter, remoteScriptFileName), } joinedSteps := strings.Join(steps, " && ") @@ -91,7 +90,7 @@ func execScriptOnVm(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodNam s.T.Logf("Executing script %[1]s using %[2]s:\n---START-SCRIPT---\n%[3]s\n---END-SCRIPT---\n", scriptFileName, interpreter, script.script) kube := s.Runtime.Cluster.Kube - execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, jumpboxPodName, joinedSteps) + execResult, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, joinedSteps) if err != nil { return nil, fmt.Errorf("error executing command on pod: %w", err) } @@ -172,6 +171,13 @@ func unprivilegedCommandArray() []string { } } +func uploadSSHKey(ctx context.Context, s *Scenario) { + cmd := fmt.Sprintf("echo '%[1]s' > %[2]s && chmod 0600 %[2]s", s.Runtime.SSHKeyPrivate, sshKeyName(s.Runtime.VMPrivateIP)) + kube := s.Runtime.Cluster.Kube + _, err := execOnPrivilegedPod(ctx, kube, defaultNamespace, s.Runtime.Cluster.DebugPod.Name, cmd) + require.NoError(s.T, err, "error uploading ssh key to pod") +} + func logSSHInstructions(s *Scenario) { result := "SSH Instructions:" if !config.Config.KeepVMSS { diff --git a/e2e/kube.go b/e2e/kube.go index fa6536ffe78..2a9a345dd91 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -171,8 +171,6 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t *testing.T, vmssN return node.Name } } - - t.Logf("node %s is not ready. 
Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) } if node == nil { @@ -454,13 +452,7 @@ func getClusterSubnetID(ctx context.Context, mcResourceGroupName string, t *test } func podHTTPServerLinux(s *Scenario) *corev1.Pod { - image := "mcr.microsoft.com/cbl-mariner/busybox:2.0" - secretName := "" - if s.Tags.Airgap { - image = fmt.Sprintf("%s.azurecr.io/cbl-mariner/busybox:2.0", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) - secretName = config.Config.ACRSecretName - } - return &corev1.Pod{ + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("%s-test-pod", s.Runtime.KubeNodeName), Namespace: "default", @@ -469,7 +461,7 @@ func podHTTPServerLinux(s *Scenario) *corev1.Pod { Containers: []corev1.Container{ { Name: "mariner", - Image: image, + Image: "mcr.microsoft.com/cbl-mariner/busybox:2.0", Ports: []corev1.ContainerPort{ { ContainerPort: 80, @@ -501,13 +493,13 @@ func podHTTPServerLinux(s *Scenario) *corev1.Pod { NodeSelector: map[string]string{ "kubernetes.io/hostname": s.Runtime.KubeNodeName, }, - ImagePullSecrets: []corev1.LocalObjectReference{ - { - Name: secretName, - }, - }, }, } + if s.Tags.Airgap { + pod.Spec.Containers[0].Image = fmt.Sprintf("%s.azurecr.io/cbl-mariner/busybox:2.0", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) + pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: config.Config.ACRSecretName}} + } + return pod } func podHTTPServerWindows(s *Scenario) *corev1.Pod { @@ -667,3 +659,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet { }, } } + +func podEnableAMDGPUResource(s *Scenario) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName), + Namespace: defaultNamespace, + }, + Spec: corev1.PodSpec{ + PriorityClassName: "system-node-critical", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": s.Runtime.KubeNodeName, + }, + Containers: []corev1.Container{ + { + Name: "amdgpu-device-plugin-container", + Image: "rocm/k8s-device-plugin", + VolumeMounts: []corev1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + { + Name: "sys", + MountPath: "/sys", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "device-plugin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + { + Name: "sys", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys", + }, + }, + }, + }, + }, + } +} diff --git a/e2e/node_config.go b/e2e/node_config.go index 32b95655f46..01b12d9b7d7 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -60,7 +60,7 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod Version: "v0", DisableCustomData: false, LinuxAdminUsername: "azureuser", - VmSize: "Standard_D2ds_v5", + VmSize: config.Config.DefaultVMSKU, ClusterConfig: &aksnodeconfigv1.ClusterConfig{ Location: nbc.ContainerService.Location, ResourceGroup: nbc.ResourceGroupName, @@ -347,7 +347,7 @@ func baseTemplateLinux(t *testing.T, location string, k8sVersion string, arch st }, AgentPoolProfile: &datamodel.AgentPoolProfile{ Name: "nodepool2", - VMSize: "Standard_D2ds_v5", + VMSize: config.Config.DefaultVMSKU, KubeletDiskType: "", WorkloadRuntime: "", DNSPrefix: "", diff --git a/e2e/scenario_helpers_test.go b/e2e/scenario_helpers_test.go index bf777a53036..acdc6876851 100644 --- a/e2e/scenario_helpers_test.go +++ 
b/e2e/scenario_helpers_test.go @@ -98,8 +98,8 @@ func RunScenario(t *testing.T, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutVMSS) defer cancel() prepareAKSNode(ctx, s) - t.Logf("Choosing the private ACR %q for the vm validation", config.GetPrivateACRName(s.Tags.NonAnonymousACR)) + validateVM(ctx, s) } @@ -146,10 +146,6 @@ func prepareAKSNode(ctx context.Context, s *Scenario) { s.T.Logf("vmss %s creation succeeded", s.Runtime.VMSSName) s.Runtime.KubeNodeName = s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName) - s.T.Logf("node %s is ready", s.Runtime.VMSSName) - - s.Runtime.VMPrivateIP, err = getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err, "failed to get VM private IP address") } func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { @@ -177,7 +173,7 @@ func maybeSkipScenario(ctx context.Context, t *testing.T, s *Scenario) { } } - vhd, err := s.VHD.VHDResourceID(ctx, t) + _, err := s.VHD.VHDResourceID(ctx, t) if err != nil { if config.Config.IgnoreScenariosWithMissingVHD && errors.Is(err, config.ErrNotFound) { t.Skipf("skipping scenario %q: could not find image", t.Name()) } t.Fatalf("could not find image for %q: %s", t.Name(), err) } } - t.Logf("VHD: %q, TAGS %+v", vhd, s.Tags) } func validateVM(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 167c44394fa..86f0cd76071 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1661,3 +1661,98 @@ func Test_Ubuntu2404ARM(t *testing.T) { }, }) } + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) { + t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet") + //E2E_LOCATION=eastus2euap + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that an AMD GPU-enabled node using the Ubuntu 2204 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5" + nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5") + // rocm images are huge, leave some space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} + +func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) { + // the SKU isn't available in the subscription/region we run tests in + t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + //E2E_LOCATION=southcentralus + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that an AMD GPU-enabled node using the Ubuntu 2204 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5" + nbc.AgentPoolProfile.VMSize = 
"Standard_NV4ads_V710_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") + // rocm images are huge, need space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} + +func Test_Ubuntu2404Gen2_AMDGPU_V710(t *testing.T) { + // the SKU isn't available in subscriptrion/region we run tests + // TODO: enable once the SKU is available + t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet") + //E2E_LOCATION=southcentralus + //SUBSCRIPTION_ID=4f3dc0e4-0c77-40ff-bf9a-6ade1e3048ef + RunScenario(t, &Scenario{ + Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5" + nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5" + nbc.EnableAMDGPU = true + nbc.ConfigGPUDriverIfNeeded = true + + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5") + // rocm images are huge, need space for manual testing + vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAMDGPU(ctx, s) + }, + }, + }) +} diff --git a/e2e/validation.go b/e2e/validation.go index 25d595845be..3b3287f9847 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -38,11 +38,6 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { stdout := execResult.stdout.String() require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s") - // the instructions belows expects the SSH key to be uploaded to the user pool VM. - // which happens as a side-effect of execCommandOnVMForScenario, it's ugly but works. - // maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs. 
- logSSHInstructions(s) - ValidateSysctlConfig(ctx, s, map[string]string{ "net.ipv4.tcp_retries2": "8", "net.core.message_burst": "80", diff --git a/e2e/validators.go b/e2e/validators.go index dcc5dd63efe..193ef06f989 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -174,7 +174,7 @@ func execScriptOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *po script.interpreter = Bash } - result, err := execScriptOnVm(ctx, s, s.Runtime.VMPrivateIP, s.Runtime.Cluster.DebugPod.Name, string(s.Runtime.SSHKeyPrivate), script) + result, err := execScriptOnVm(ctx, s, script) require.NoError(s.T, err, "failed to execute command on VM") return result } @@ -334,7 +334,7 @@ func waitUntilResourceAvailable(ctx context.Context, s *Scenario, resourceName s nodeName := s.Runtime.KubeNodeName ticker := time.NewTicker(time.Second) defer ticker.Stop() - + s.T.Logf("waiting for resource %q to be available on node %q", resourceName, nodeName) for { select { case <-ctx.Done(): @@ -442,3 +442,16 @@ func ValidateTaints(ctx context.Context, s *Scenario, expectedTaints string) { } require.Equal(s.T, expectedTaints, actualTaints, "expected node %q to have taint %q, but got %q", s.Runtime.KubeNodeName, expectedTaints, actualTaints) } + +func ValidateAMDGPU(ctx context.Context, s *Scenario) { + s.T.Logf("validating pod using AMD GPU") + + execResult := execScriptOnVMForScenario(ctx, s, "lspci -k") + require.Equal(s.T, "0", execResult.exitCode, "expected to find lspci command, but did not") + assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not") + + ensurePod(ctx, s, podEnableAMDGPUResource(s)) + s.T.Logf("waiting for AMD GPU to be available") + waitUntilResourceAvailable(ctx, s, "amd.com/gpu") + //ensureJob(ctx, s, jobAMDGPUWorkload(s)) +} diff --git a/e2e/vmss.go b/e2e/vmss.go index 31d6fe17c7a..5a1f4324537 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "strings" + "sync" "testing" "time" @@ -25,6 +26,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/crypto/ssh" ) @@ -35,8 +37,8 @@ const ( ) func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScaleSet { + cluster := s.Runtime.Cluster - s.T.Logf("creating VMSS %q in resource group %q", s.Runtime.VMSSName, *cluster.Model.Properties.NodeResourceGroup) var nodeBootstrapping *datamodel.NodeBootstrapping ab, err := agent.NewAgentBaker() require.NoError(s.T, err) @@ -73,11 +75,15 @@ func createVMSS(ctx context.Context, s *Scenario) *armcompute.VirtualMachineScal } s.PrepareVMSSModel(ctx, s.T, &model) - vmss, err := config.Azure.CreateVMSSWithRetry(ctx, s.T, *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, model) s.T.Cleanup(func() { cleanupVMSS(ctx, s) }) + var ipErr error + s.Runtime.VMPrivateIP, ipErr = getVMPrivateIPAddress(ctx, s) + assert.NoError(s.T, ipErr, "failed to get VM private IP address") + uploadSSHKey(ctx, s) + logSSHInstructions(s) skipTestIfSKUNotAvailableErr(s.T, err) // fail test, but continue to extract debug information require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T)) @@ -112,9 +118,6 @@ func extractLogsFromVM(ctx context.Context, s *Scenario) { } func extractLogsFromVMLinux(ctx context.Context, 
s *Scenario) { - privateIP, err := getVMPrivateIPAddress(ctx, s) - require.NoError(s.T, err) - commandList := map[string]string{ "cluster-provision.log": "sudo cat /var/log/azure/cluster-provision.log", "kubelet.log": "sudo journalctl -u kubelet", @@ -123,30 +126,34 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario) { "aks-node-controller.log": "sudo cat /var/log/azure/aks-node-controller.log", } - pod, err := s.Runtime.Cluster.Kube.GetHostNetworkDebugPod(ctx, s.T) - if err != nil { - require.NoError(s.T, err) - } - var logFiles = map[string]string{} + wg := sync.WaitGroup{} + lock := sync.Mutex{} for file, sourceCmd := range commandList { - execResult, err := execBashCommandOnVM(ctx, s, privateIP, pod.Name, string(s.Runtime.SSHKeyPrivate), sourceCmd) - if err != nil { - s.T.Logf("error executing %s: %s", sourceCmd, err) - continue - } - logFiles[file] = execResult.String() + wg.Add(1) + go func(file, sourceCmd string) { + defer wg.Done() + execResult, err := execBashCommandOnVM(ctx, s, sourceCmd) + if err != nil { + s.T.Logf("error executing %s: %s", sourceCmd, err) + return + } + lock.Lock() + logFiles[file] = execResult.String() + lock.Unlock() + }(file, sourceCmd) } - err = dumpFileMapToDir(s.T, logFiles) + wg.Wait() + err := dumpFileMapToDir(s.T, logFiles) require.NoError(s.T, err) } -func execBashCommandOnVM(ctx context.Context, s *Scenario, vmPrivateIP, jumpboxPodName, sshPrivateKey, command string) (*podExecResult, error) { +func execBashCommandOnVM(ctx context.Context, s *Scenario, command string) (*podExecResult, error) { script := Script{ interpreter: Bash, script: command, } - return execScriptOnVm(ctx, s, vmPrivateIP, jumpboxPodName, sshPrivateKey, script) + return execScriptOnVm(ctx, s, script) } const uploadLogsPowershellScript = ` @@ -241,7 +248,6 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { return } s.T.Logf("run command executed successfully: %v", runCommandResp) - s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { @@ -427,7 +433,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual model := armcompute.VirtualMachineScaleSet{ Location: to.Ptr(config.Config.Location), SKU: &armcompute.SKU{ - Name: to.Ptr("Standard_D2ds_v5"), + Name: to.Ptr(config.Config.DefaultVMSKU), Capacity: to.Ptr[int64](1), }, Properties: &armcompute.VirtualMachineScaleSetProperties{ @@ -506,7 +512,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual Properties: &armcompute.VirtualMachineScaleSetExtensionProperties{ Publisher: to.Ptr("Microsoft.Azure.Extensions"), Type: to.Ptr("CustomScript"), - TypeHandlerVersion: to.Ptr("2.0"), + TypeHandlerVersion: to.Ptr("2.1"), AutoUpgradeMinorVersion: to.Ptr(true), Settings: map[string]interface{}{}, ProtectedSettings: map[string]interface{}{ diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 485ee8572b1..82f7a94e71c 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -60,6 +60,7 @@ USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}} API_SERVER_NAME={{GetKubernetesEndpoint}} IS_VHD={{GetVariable "isVHD"}} GPU_NODE={{GetVariable "gpuNode"}} +AMD_GPU_NODE={{GetVariable "amdGpuNode"}} SGX_NODE={{GetVariable "sgxNode"}} MIG_NODE={{GetVariable "migNode"}} CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh 
b/parts/linux/cloud-init/artifacts/cse_config.sh index 8195ef38dac..7ee66858b5e 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -844,6 +844,29 @@ ensureGPUDrivers() { fi } +ensureAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + ensureAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. skipping ensureAMDGPUDrivers" + return + fi +} + +ensureAMDGPUDriversUbuntu() { + # the amdgpu module ends up in this blacklist (reason unknown) and won't be loaded unless the entry is removed + sudo sed -i '/blacklist amdgpu/d' /etc/modprobe.d/blacklist-radeon-instinct.conf + # Note: the next command may crash non-AMD-GPU machines, + # caused by the installation of the amdgpu-dkms module inside the installer. + # Takes about 6 minutes to finish. + sudo /root/rocm-offline-install.run + echo "AMD GPU drivers installed" +} + +cleanAMDGPUDrivers() { + sudo rm -f /root/rocm-offline-install.run +} + disableSSH() { systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH } diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index 61452347195..1ff5333075e 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -523,7 +523,7 @@ logs_to_events() { fi } -should_skip_nvidia_drivers() { +should_skip_gpu_drivers() { set -x body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") ret=$? diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index ee3761cb8ce..0657d721441 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -100,15 +100,15 @@ if [[ -n ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then logs_to_events "AKS.CSE.orasLogin.oras_login_with_kubelet_identity" oras_login_with_kubelet_identity "${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER%/}" $USER_ASSIGNED_IDENTITY_ID $TENANT_ID || exit $? fi -export -f should_skip_nvidia_drivers -skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers) +export -f should_skip_gpu_drivers +skip_gpu_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_gpu_drivers) ret=$?
if [[ "$ret" != "0" ]]; then echo "Failed to determine if nvidia driver install should be skipped" exit $ERR_NVIDIA_DRIVER_INSTALL fi -if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then +if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_gpu_driver_install}" == "true" ]]; then logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers fi @@ -160,7 +160,7 @@ fi REBOOTREQUIRED=false echo $(date),$(hostname), "Start configuring GPU drivers" -if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then +if [[ "${GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then @@ -209,6 +209,11 @@ EOF fi fi +if [[ "${AMD_GPU_NODE}" = true ]] && [[ "${skip_gpu_driver_install}" != "true" ]]; then + logs_to_events "AKS.CSE.ensureAMDGPUDrivers" ensureAMDGPUDrivers +fi +cleanAMDGPUDrivers + echo $(date),$(hostname), "End configuring GPU drivers" if [ "${NEEDS_DOCKER_LOGIN}" == "true" ]; then diff --git a/pkg/agent/variables.go b/pkg/agent/variables.go index 80eedb389f5..f08a09922b6 100644 --- a/pkg/agent/variables.go +++ b/pkg/agent/variables.go @@ -113,6 +113,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa "userAssignedIdentityID": config.UserAssignedIdentityClientID, "isVHD": isVHD(profile), "gpuNode": strconv.FormatBool(config.EnableNvidia), + "amdGpuNode": strconv.FormatBool(config.EnableAMDGPU), "sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)), "configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded, "enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded, diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index a0252898781..055c6e2afe2 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -619,3 +619,63 @@ rm -f ./azcopy # cleanup immediately after usage will return in two downloads echo "install-dependencies step completed successfully" capture_benchmark "${SCRIPT_NAME}_overall" true process_benchmarks + + +downloadAMDGPUDriversUbuntu() { + echo "Downloading AMD GPU drivers for Ubuntu ${UBUNTU_RELEASE}" + # Determine the appropriate Ubuntu release + if [ "${UBUNTU_RELEASE}" == "22.04" ]; then + wget https://repo.radeon.com/rocm/installer/rocm-linux-install-offline/rocm-rel-6.3.3/ubuntu/22.04/rocm-offline-creator_1.0.7.60303-1~22.04.run -O rocm-offline-creator.run + elif [ "${UBUNTU_RELEASE}" == "24.04" ]; then + wget https://repo.radeon.com/rocm/installer/rocm-linux-install-offline/rocm-rel-6.3.3/ubuntu/24.04/rocm-offline-creator_1.0.7.60303-1~24.04.run -O rocm-offline-creator.run + else + echo "Skipping AMD GPU driver setup: Unsupported Ubuntu release (${UBUNTU_RELEASE})" + return 1 + fi + cat << EOL | sudo tee /etc/amdgpu.config +# Creator/Build Options +############################### +INSTALL_PACKAGE_TYPE=0 +INSTALL_PACKAGE_NAME="rocm-offline-install.run" +INSTALL_PACKAGE_DIR=/root + +INSTALL_PACKAGE_REPO=0 + +DOWNLOAD_PKG_CONFIG_NUM=0 + +# ROCm Options +############################### +ROCM_USECASES=dkms +ROCM_VERSIONS=6.3.3 + +# Driver/amdgpu Options +############################### +AMDGPU_INSTALL_DRIVER=yes +AMDGPU_POST_INSTALL_BLACKLIST=no +AMDGPU_POST_INSTALL_START=yes + +# Post-Install Options 
+############################### +AMDGPU_POST_GPU_ACCESS_CURRENT_USER=no +AMDGPU_POST_GPU_ACCESS_ALL_USERS=no + +# Extra Package Options +############################### +EXTRA_PACKAGES_ONLY=no +EXTRA_PACKAGES="" +EOL + # This takes time and could potentially be built and cached outside of the VHD build to improve performance + # The output file /root/rocm-offline-install.run is about 200 MB + sudo bash ./rocm-offline-creator.run config=/etc/amdgpu.config +} + +downloadAMDGPUDrivers() { + if [[ $OS == $UBUNTU_OS_NAME ]]; then + downloadAMDGPUDriversUbuntu + else + echo "os $OS not supported at this time. skipping downloadAMDGPUDrivers" + return + fi +} + +downloadAMDGPUDrivers \ No newline at end of file diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 9189a712367..95a0823c582 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -144,4 +144,4 @@ fi capture_benchmark "${SCRIPT_NAME}_handle_azureLinux_and_cgroupV2" echo "pre-install-dependencies step finished successfully" capture_benchmark "${SCRIPT_NAME}_overall" true -process_benchmarks \ No newline at end of file +process_benchmarks
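
Reviewer note: ValidateAMDGPU in e2e/validators.go leaves ensureJob(ctx, s, jobAMDGPUWorkload(s)) commented out as a TODO. A minimal, hypothetical sketch of what such a helper could look like is below, following the builder style of podEnableAMDGPUResource in e2e/kube.go; the rocm/rocm-terminal image and the rocminfo command are illustrative placeholders, not part of this change. The essential piece is the amd.com/gpu resource limit, which only schedules once the device plugin has advertised the resource.

package e2e

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// jobAMDGPUWorkload is a sketch of a workload that would only succeed if the
// AMD device plugin exposes at least one amd.com/gpu on the scenario's node.
func jobAMDGPUWorkload(s *Scenario) *batchv1.Job {
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-amdgpu-workload", s.Runtime.KubeNodeName),
			Namespace: defaultNamespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": s.Runtime.KubeNodeName,
					},
					Containers: []corev1.Container{
						{
							Name:    "amdgpu-workload",
							Image:   "rocm/rocm-terminal",       // placeholder image, not pinned by this PR
							Command: []string{"rocminfo"},       // exits non-zero if no GPU is visible
							Resources: corev1.ResourceRequirements{
								Limits: corev1.ResourceList{
									// forces scheduling onto a node with an advertised AMD GPU
									"amd.com/gpu": resource.MustParse("1"),
								},
							},
						},
					},
				},
			},
		},
	}
}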