Merge pull request #5417 from Patryk-Stefanski/HOSTEDCP-2169
HOSTEDCP-2169: Add aro scheduler
openshift-merge-bot[bot] authored Jan 29, 2025
2 parents b1062d6 + f343ade commit 55c2de5
Showing 15 changed files with 1,154 additions and 228 deletions.
174 changes: 174 additions & 0 deletions docs/content/how-to/azure/scheduler.md
@@ -0,0 +1,174 @@
# Azure Scheduler

The Azure Scheduler works with the default `ClusterSizingConfiguration` resource and the `HostedClusterSizing` controller.

## ClusterSizingConfiguration

The `ClusterSizingConfiguration` is an API used for setting tshirt sizes based on the number of nodes a `HostedCluster` has. Each tshirt size can configure different effects that control various aspects of the control plane, such as the Kube API Server (KAS) and etcd. Additionally, it controls how frequently transitions between cluster sizes may occur.

### Effects

- `kasGoMemLimit`: Specifies the memory limit for the Kube API Server.
- `controlPlanePriorityClassName`: The priority class for most control plane pods.
- `etcdPriorityClassName`: The priority class for etcd pods.
- `apiCriticalPriorityClassName`: The priority class for pods in the API request serving path, including Kube API Server and OpenShift APIServer.
- `resourceRequests`: Allows specifying resource requests for control plane pods.
- `machineHealthCheckTimeout`: Specifies an optional timeout for machine health checks created for `HostedClusters` with this specific size.
- `maximumRequestsInFlight`: Specifies the maximum number of requests in flight for the Kube API Server.
- `maximumMutatingRequestsInflight`: Specifies the maximum number of mutating requests in flight for the Kube API Server.
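
As a rough sketch, a single entry under `sizes` in the `ClusterSizingConfiguration` spec could combine several of these effects. The field names below follow the list above, but the values and priority class names are illustrative only, and the exact schema should be verified against the installed CRD:

```yaml
sizes:
- name: medium
  criteria:
    from: 3
    to: 4
  effects:
    kasGoMemLimit: 8Gi                                       # illustrative value
    controlPlanePriorityClassName: hypershift-control-plane  # example class name
    etcdPriorityClassName: hypershift-etcd                   # example class name
    apiCriticalPriorityClassName: hypershift-api-critical    # example class name
    machineHealthCheckTimeout: 10m
    resourceRequests:
    - deploymentName: kube-apiserver
      containerName: kube-apiserver
      cpu: 3
      memory: 8Gi
```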

### ConcurrencyConfiguration

The `ConcurrencyConfiguration` defines the bounds of allowed behavior for clusters transitioning between sizes. It includes:

- `SlidingWindow`: The window over which the concurrency bound is enforced. This is a duration (e.g., `10m` for 10 minutes) that specifies the time frame within which the concurrency limit is applied.
- `Limit`: The maximum allowed number of cluster size transitions during the sliding window. This is an integer (e.g., `5`) that specifies how many transitions can occur within the sliding window.
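
For example, to allow at most 5 cluster size transitions within any 20-minute window, the concurrency section of the spec would look roughly like this (values are illustrative):

```yaml
spec:
  concurrency:
    slidingWindow: 20m
    limit: 5
```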

### TransitionDelayConfiguration

The `TransitionDelayConfiguration` defines the delay between a cluster's size changing and the newly assigned tshirt size being applied. It includes:

- `Increase`: The minimum period of time to wait between a cluster's size increasing and the tshirt size assigned to it being updated to reflect the new size. This is a duration (e.g., `30s` for 30 seconds).
- `Decrease`: The minimum period of time to wait between a cluster's size decreasing and the tshirt size assigned to it being updated to reflect the new size. This is a duration (e.g., `10m` for 10 minutes).
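
For example, applying size increases after 30 seconds while delaying size decreases by 10 minutes (the example durations above) would look roughly like this:

```yaml
spec:
  transitionDelay:
    increase: 30s
    decrease: 10m
```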

## HostedClusterSizing Controller

The `HostedClusterSizing` controller determines the number of nodes associated with a `HostedCluster` either from the `HostedControlPlane.Status` or by iterating through the NodePools associated with the `HostedCluster` and summing their node counts. It then compares that number against the minimum and maximum node counts set for each tshirt size in the `ClusterSizingConfiguration` and applies a label to the `HostedCluster` with the appropriate tshirt size. Depending on the settings in the `ClusterSizingConfiguration`, it can wait a specified amount of time before transitioning between tshirt sizes and uses a sliding window to ensure that only a limited number of transitions (e.g., 5 transitions) can occur within a specified time frame (e.g., 20 minutes).

The controller also updates the status of the `HostedCluster`, reporting the computed cluster size, indicating if a tshirt size transition is pending, and specifying if the cluster requires a transition to a different size.
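
The result of this reconciliation is visible on the `HostedCluster` itself. The sketch below shows roughly what to expect; the label key and condition type names here are assumptions for illustration and should be verified against the HyperShift API:

```yaml
apiVersion: hypershift.openshift.io/v1beta1
kind: HostedCluster
metadata:
  name: pstefans-3
  namespace: clusters
  labels:
    # Assumed label key written by the HostedClusterSizing controller
    hypershift.openshift.io/hosted-cluster-size: medium
status:
  conditions:
  # Assumed condition types reporting the computed size and transition state
  - type: ClusterSizeComputed
    status: "True"
  - type: ClusterSizeTransitionPending
    status: "False"
  - type: ClusterSizeTransitionRequired
    status: "False"
```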

## Azure Scheduler Controller

The Azure scheduler controller is straightforward. It checks the label set by the `HostedClusterSizing` controller and retrieves the cluster sizing configuration associated with the tshirt size. Based on the configuration, it can modify the `HostedCluster` with annotations for the specified fields. These annotations are then used by different controllers to propagate the required changes to the appropriate pods and containers.
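
For example, with the `resourceRequests` used later in this guide, the scheduler would annotate the `HostedCluster` roughly as follows; the annotation key format matches the output shown in the steps below:

```yaml
metadata:
  annotations:
    # One override annotation per deploymentName/containerName pair in the size's resourceRequests
    resource-request-override.hypershift.openshift.io/kube-apiserver.kube-apiserver: cpu=3
    resource-request-override.hypershift.openshift.io/control-plane-operator.control-plane-operator: cpu=3
```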

## How to Use

### Prerequisites

- An AKS cluster with the cluster autoscaler enabled, using `Standard_D4s_v4` VMs for this example (pass `--enable-cluster-autoscaler` with `--min-count 2 --max-count 6` when creating the AKS cluster).
- The HyperShift operator installed with size tagging enabled (pass `--enable-size-tagging` when installing the HyperShift operator).
- A `ClusterSizingConfiguration` resource (a default `ClusterSizingConfiguration` resource is created by the HyperShift operator).
- A `HostedCluster` in the `Completed` state.
- A `NodePool` with 2 nodes associated with the `HostedCluster`.


### Steps

In the example below we will use a `HostedCluster` named `pstefans-3` and a `NodePool` named `pstefans-3`, both in the `clusters` namespace.

1. The AKS cluster should have only 2 nodes at this point.

```shell
oc get nodes
NAME                                STATUS   ROLES    AGE     VERSION
aks-nodepool1-11371333-vmss000000   Ready    <none>   3h43m   v1.31.1
aks-nodepool1-11371333-vmss000002   Ready    <none>   3h43m   v1.31.1
```

2. Edit the `ClusterSizingConfiguration` resource with the following spec:

```shell
oc edit clustersizingconfiguration cluster
```

```yaml
spec:
  concurrency:
    limit: 5
    slidingWindow: 0s
  sizes:
  - criteria:
      from: 0
      to: 2
    name: small
  - criteria:
      from: 3
      to: 4
    effects:
      resourceRequests:
      - containerName: kube-apiserver
        cpu: 3
        deploymentName: kube-apiserver
      - containerName: control-plane-operator
        cpu: 3
        deploymentName: control-plane-operator
    name: medium
  - criteria:
      from: 5
    name: large
  transitionDelay:
    decrease: 0s
    increase: 0s
```

3. Scale the `NodePool` up to 3 nodes:

```shell
oc scale nodepool pstefans-3 \
--namespace clusters \
--replicas 3
```

4. Once the `NodePool` scales successfully, the `HostedCluster` is updated with the new tshirt size label, and the resource request override annotations are applied to it. The relevant controllers then pick these up and set the requests on the specified pods and containers.

```shell
oc get deployment kube-apiserver -n clusters-pstefans-3 -o json | jq '.spec.template.spec.containers[] | select(.name == "kube-apiserver") | .resources'
```

```json
{
  "requests": {
    "cpu": "3",
    "memory": "2Gi"
  }
}
```

```shell
oc get deployment control-plane-operator -n clusters-pstefans-3 -o json | jq '.spec.template.spec.containers[] | select(.name == "control-plane-operator") | .resources'
```

```json
{
  "requests": {
    "cpu": "3",
    "memory": "80Mi"
  }
}
```

```shell
oc get hc pstefans-3 -n clusters -o yaml | grep resource-request-override.hypershift.openshift.io
resource-request-override.hypershift.openshift.io/control-plane-operator.control-plane-operator: cpu=3
resource-request-override.hypershift.openshift.io/kube-apiserver.kube-apiserver: cpu=3
```

5. You should now see that the autoscaler has scaled the AKS cluster up to 3 nodes: we requested 3 CPU cores each for kube-apiserver and control-plane-operator, and each `Standard_D4s_v4` node has only 4 cores, so each deployment requests nearly a full node to itself.

```shell
oc get nodes
NAME                                STATUS   ROLES    AGE     VERSION
aks-nodepool1-11371333-vmss000000   Ready    <none>   4h8m    v1.31.1
aks-nodepool1-11371333-vmss000002   Ready    <none>   4h8m    v1.31.1
aks-nodepool1-11371333-vmss000003   Ready    <none>   9m31s   v1.31.1
```

6. You should now see that each of the deployments whose resource requests we changed is running on a different node with sufficient compute.

```shell
kubectl get pods --all-namespaces --field-selector spec.nodeName=aks-nodepool1-11371333-vmss000003
```

```shell
NAMESPACE             NAME                              READY   STATUS    RESTARTS   AGE
clusters-pstefans-3   kube-apiserver-549c75cb99-jj964   4/4     Running   0          12m
```

```shell
kubectl get pods --all-namespaces --field-selector spec.nodeName=aks-nodepool1-11371333-vmss000002
```

```shell
NAMESPACE             NAME                                      READY   STATUS    RESTARTS   AGE
clusters-pstefans-3   control-plane-operator-69b894d9dd-cxv2z   1/1     Running   0          14m
```
2 changes: 1 addition & 1 deletion hypershift-operator/controllers/hostedcluster/karpenter.go
@@ -101,7 +101,7 @@ spec:
Subnet: hyperv1.AWSResourceReference{
// TODO(alberto): this is just to pass cel.
// Setting an ID instead of filter would break publicAndPrivate topology because the AWSEndpointService won't find the subnet.
// We'll move to generate the userdata for karpenter programatically.
// We'll move to generate the userdata for karpenter programmatically.
Filters: []hyperv1.Filter{
{
Name: "subnet-none",
@@ -9,12 +9,12 @@ import (
hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
schedulingv1alpha1 "github.com/openshift/hypershift/api/scheduling/v1alpha1"
"github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster"
schedulerutil "github.com/openshift/hypershift/hypershift-operator/controllers/scheduler/util"
"github.com/openshift/hypershift/support/upsert"
"github.com/openshift/hypershift/support/util"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -43,8 +43,6 @@ const (
OSDFleetManagerPairedNodesLabel = "osd-fleet-manager.openshift.io/paired-nodes"
HostedClusterNameLabel = "hypershift.openshift.io/cluster-name"
HostedClusterNamespaceLabel = "hypershift.openshift.io/cluster-namespace"
goMemLimitLabel = "hypershift.openshift.io/request-serving-gomemlimit"
lbSubnetsLabel = "hypershift.openshift.io/request-serving-subnets"

// PlaceholderLabel is used as a label on Deployments that are used to keep nodes warm.
PlaceholderLabel = "hypershift.openshift.io/placeholder"
@@ -256,11 +254,11 @@ func (r *DedicatedServingComponentScheduler) Reconcile(ctx context.Context, req
for _, node := range nodesToUse {
originalNode := node.DeepCopy()

if node.Labels[goMemLimitLabel] != "" && nodeGoMemLimit == "" {
nodeGoMemLimit = node.Labels[goMemLimitLabel]
if node.Labels[schedulerutil.GoMemLimitLabel] != "" && nodeGoMemLimit == "" {
nodeGoMemLimit = node.Labels[schedulerutil.GoMemLimitLabel]
}
if node.Labels[lbSubnetsLabel] != "" && lbSubnets == "" {
lbSubnets = node.Labels[lbSubnetsLabel]
if node.Labels[schedulerutil.LBSubnetsLabel] != "" && lbSubnets == "" {
lbSubnets = node.Labels[schedulerutil.LBSubnetsLabel]
// If subnets are separated by periods, replace them with commas
lbSubnets = strings.ReplaceAll(lbSubnets, ".", ",")
}
@@ -544,7 +542,7 @@ func (r *DedicatedServingComponentSchedulerAndSizer) Reconcile(ctx context.Conte
}
} else {
// If there isn't a current pair label, then we can select from available nodes selected by placeholders.
sizeConfig := sizeConfiguration(&config, desiredSize)
sizeConfig := schedulerutil.SizeConfiguration(&config, desiredSize)
if sizeConfig == nil {
return ctrl.Result{}, fmt.Errorf("could not find size configuration for size %s", desiredSize)
}
@@ -589,7 +587,7 @@ func (r *DedicatedServingComponentSchedulerAndSizer) Reconcile(ctx context.Conte
if len(nodesByZone) > 1 {
log.Info("sufficient nodes exist for placement")
// If we have enough nodes, update the hosted cluster.
if err := r.updateHostedCluster(ctx, hc, desiredSize, &config, goalNodes); err != nil {
if err := schedulerutil.UpdateHostedCluster(ctx, r.Client, hc, desiredSize, &config, goalNodes); err != nil {
return ctrl.Result{}, err
}
// Ensure we don't have a placeholder deployment, since we have nodes
@@ -755,84 +753,6 @@ func (r *DedicatedServingComponentSchedulerAndSizer) ensureHostedClusterLabelAnd
return nil
}

func (r *DedicatedServingComponentSchedulerAndSizer) updateHostedCluster(ctx context.Context, hc *hyperv1.HostedCluster, size string, config *schedulingv1alpha1.ClusterSizingConfiguration, nodes []corev1.Node) error {
original := hc.DeepCopy()
hc.Annotations[hyperv1.HostedClusterScheduledAnnotation] = "true"
sizeConfig := sizeConfiguration(config, size)
if sizeConfig == nil {
return fmt.Errorf("could not find size configuration for size %s", size)
}

goMemLimit := ""
if sizeConfig.Effects != nil && sizeConfig.Effects.KASGoMemLimit != nil {
goMemLimit = sizeConfig.Effects.KASGoMemLimit.String()
}
for _, node := range nodes {
if node.Labels[goMemLimitLabel] != "" {
goMemLimit = node.Labels[goMemLimitLabel]
break
}
}
if goMemLimit != "" {
hc.Annotations[hyperv1.KubeAPIServerGOMemoryLimitAnnotation] = goMemLimit
}

if sizeConfig.Effects != nil && sizeConfig.Effects.ControlPlanePriorityClassName != nil {
hc.Annotations[hyperv1.ControlPlanePriorityClass] = *sizeConfig.Effects.ControlPlanePriorityClassName
}
if sizeConfig.Effects != nil && sizeConfig.Effects.EtcdPriorityClassName != nil {
hc.Annotations[hyperv1.EtcdPriorityClass] = *sizeConfig.Effects.EtcdPriorityClassName
}
if sizeConfig.Effects != nil && sizeConfig.Effects.APICriticalPriorityClassName != nil {
hc.Annotations[hyperv1.APICriticalPriorityClass] = *sizeConfig.Effects.APICriticalPriorityClassName
}
if sizeConfig.Effects != nil && sizeConfig.Effects.MachineHealthCheckTimeout != nil {
hc.Annotations[hyperv1.MachineHealthCheckTimeoutAnnotation] = sizeConfig.Effects.MachineHealthCheckTimeout.Duration.String()
} else {
// If mhc timeout is configured for any size in the config, remove the annotation
// to fallback to the default
if configHasMHCTimeout(config) {
delete(hc.Annotations, hyperv1.MachineHealthCheckTimeoutAnnotation)
}
}
if sizeConfig.Effects != nil && sizeConfig.Effects.MaximumRequestsInflight != nil {
hc.Annotations[hyperv1.KubeAPIServerMaximumRequestsInFlight] = fmt.Sprint(*sizeConfig.Effects.MaximumRequestsInflight)
}
if sizeConfig.Effects != nil && sizeConfig.Effects.MaximumMutatingRequestsInflight != nil {
hc.Annotations[hyperv1.KubeAPIServerMaximumMutatingRequestsInFlight] = fmt.Sprint(*sizeConfig.Effects.MaximumMutatingRequestsInflight)
}

var resourceRequestAnnotations map[string]string
if sizeConfig.Effects != nil {
resourceRequestAnnotations = resourceRequestsToOverrideAnnotations(sizeConfig.Effects.ResourceRequests)
}
for k, v := range resourceRequestAnnotations {
hc.Annotations[k] = v
}

lbSubnets := ""
for _, node := range nodes {
if node.Labels[lbSubnetsLabel] != "" {
lbSubnets = node.Labels[lbSubnetsLabel]
break
}
}
if lbSubnets != "" {
// If subnets are separated by periods, replace them with commas
lbSubnets = strings.ReplaceAll(lbSubnets, ".", ",")
hc.Annotations[hyperv1.AWSLoadBalancerSubnetsAnnotation] = lbSubnets
}

hc.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] = fmt.Sprintf("%s=%s", hyperv1.NodeSizeLabel, size)

if !equality.Semantic.DeepEqual(hc, original) {
if err := r.Patch(ctx, hc, client.MergeFrom(original)); err != nil {
return fmt.Errorf("failed to update hostedcluster: %w", err)
}
}
return nil
}

func (r *DedicatedServingComponentSchedulerAndSizer) deletePlaceholderDeployment(ctx context.Context, hc *hyperv1.HostedCluster) error {
deployment := placeholderDeployment(hc)
_, err := util.DeleteIfNeeded(ctx, r, deployment)
@@ -1007,40 +927,3 @@ func placeholderDeployment(hc *hyperv1.HostedCluster) *appsv1.Deployment {
func clusterKey(hc *hyperv1.HostedCluster) string {
return fmt.Sprintf("%s-%s", hc.Namespace, hc.Name)
}

func sizeConfiguration(config *schedulingv1alpha1.ClusterSizingConfiguration, size string) *schedulingv1alpha1.SizeConfiguration {
for i := range config.Spec.Sizes {
if config.Spec.Sizes[i].Name == size {
return &config.Spec.Sizes[i]
}
}
return nil
}

func resourceRequestsToOverrideAnnotations(requests []schedulingv1alpha1.ResourceRequest) map[string]string {
annotations := map[string]string{}
for _, request := range requests {
key := fmt.Sprintf("%s/%s.%s", hyperv1.ResourceRequestOverrideAnnotationPrefix, request.DeploymentName, request.ContainerName)
var value string
if request.Memory != nil {
value = fmt.Sprintf("memory=%s", request.Memory.String())
}
if request.CPU != nil {
if value != "" {
value += ","
}
value += fmt.Sprintf("cpu=%s", request.CPU.String())
}
annotations[key] = value
}
return annotations
}

func configHasMHCTimeout(config *schedulingv1alpha1.ClusterSizingConfiguration) bool {
for _, size := range config.Spec.Sizes {
if size.Effects != nil && size.Effects.MachineHealthCheckTimeout != nil {
return true
}
}
return false
}