diff --git a/go.mod b/go.mod index b1816ddd7e92..6af9dc063c0a 100644 --- a/go.mod +++ b/go.mod @@ -165,7 +165,7 @@ require ( github.com/DataDog/datadog-agent/pkg/util/winutil v0.77.0-devel.0.20260213154712-e02b9359151a github.com/DataDog/datadog-agent/pkg/version v0.78.0 github.com/DataDog/datadog-go/v5 v5.8.3 - github.com/DataDog/datadog-operator/api v0.0.0-20260323152500-0887e50ccf73 + github.com/DataDog/datadog-operator/api v0.0.0-20260503193602-adf766128732 github.com/DataDog/datadog-traceroute v1.0.15 github.com/DataDog/dd-trace-go/v2 v2.7.4 github.com/DataDog/ebpf-manager v0.7.18 diff --git a/go.sum b/go.sum index 9b506699c145..060279582060 100644 --- a/go.sum +++ b/go.sum @@ -2558,6 +2558,12 @@ github.com/DataDog/datadog-go/v5 v5.8.3 h1:s58CUJ9s8lezjhTNJO/SxkPBv2qZjS3ktpRSq github.com/DataDog/datadog-go/v5 v5.8.3/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= github.com/DataDog/datadog-operator/api v0.0.0-20260323152500-0887e50ccf73 h1:c9uNLuEey0R0xP1ETZ620UhE67zp+WsGoSYJmhC5A8Y= github.com/DataDog/datadog-operator/api v0.0.0-20260323152500-0887e50ccf73/go.mod h1:dMnKkWiv5j5UuRfaRUPXVDHG/79JG4S3llffsX4J+ko= +github.com/DataDog/datadog-operator/api v0.0.0-20260414104914-c59fc90bbc2c h1:a1JCU63rjbvKVtDy/tUH/ZQwr9QGcPfcFyz2YMP/dd8= +github.com/DataDog/datadog-operator/api v0.0.0-20260414104914-c59fc90bbc2c/go.mod h1:ptousqBWjpi/Rioc5Apy7ljh1k7II0XMn5s9XJV61uw= +github.com/DataDog/datadog-operator/api v0.0.0-20260503185032-d63253638e40 h1:Kj0EwAj5XK33Ewql8VpwW8qYTBRRRSfCQ8K7qM9TDrc= +github.com/DataDog/datadog-operator/api v0.0.0-20260503185032-d63253638e40/go.mod h1:ptousqBWjpi/Rioc5Apy7ljh1k7II0XMn5s9XJV61uw= +github.com/DataDog/datadog-operator/api v0.0.0-20260503193602-adf766128732 h1:5wnz56qGCb10Z5HMpXTpwzqP0/N54jeF0lHt/qhCfmY= +github.com/DataDog/datadog-operator/api v0.0.0-20260503193602-adf766128732/go.mod h1:ptousqBWjpi/Rioc5Apy7ljh1k7II0XMn5s9XJV61uw= github.com/DataDog/datadog-traceroute v1.0.15 h1:CTki7WWwx/tLZ9tcFvQAFBZYENi8oJJjAHsNqAY1pNU= github.com/DataDog/datadog-traceroute v1.0.15/go.mod h1:4KQA88cpktnEFlgm1ULZ1JZlLjZz71lquagOz2sVOH8= github.com/DataDog/dd-trace-go/v2 v2.7.4 h1:a9gWlz6276LjFvGsp9vTC3yR5P532QLxOPosG5Xezy8= diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever.go b/pkg/clusteragent/autoscaling/workload/config_retriever.go index 1202a802b867..71bf4c6875d7 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever.go @@ -14,6 +14,7 @@ import ( "k8s.io/utils/clock" "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/config/remote/data" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -47,12 +48,12 @@ type autoscalingProcessor interface { } // NewConfigRetriever creates a new ConfigRetriever -func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient) (*ConfigRetriever, error) { +func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient, builder *model.PodAutoscalerInternalBuilder) (*ConfigRetriever, error) { cr := &ConfigRetriever{ isLeader: isLeader, clock: clock, - settingsProcessor: newAutoscalingSettingsProcessor(store), + settingsProcessor: newAutoscalingSettingsProcessor(store, builder), valuesProcessor: newAutoscalingValuesProcessor(store), } 
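For context, a minimal sketch (not part of the patch) of how the new builder parameter is threaded through at startup. It mirrors the pkg/clusteragent/autoscaling/workload/provider/provider.go change further down in this diff; the helper name startConfigRetriever and its placement in the workload package are assumptions, while the config key and constructor signatures are the ones shown in the diff.

```go
// Sketch only: hypothetical startup wiring that mirrors the provider.go change below.
package workload

import (
	"context"

	"k8s.io/utils/clock"

	"github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model"
	pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup"
)

func startConfigRetriever(ctx context.Context, store *store, isLeaderFunc func() bool, rcClient RcClient) error {
	// Read the cluster-level burstable default once and bake it into the builder so every
	// PodAutoscalerInternal created afterwards inherits it without per-reconcile config lookups.
	builder := model.NewPodAutoscalerInternalBuilder(
		pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable"),
	)
	_, err := NewConfigRetriever(ctx, clock.RealClock{}, store, isLeaderFunc, rcClient, builder)
	return err
}
```

Tests in this diff construct the equivalent with model.NewPodAutoscalerInternalBuilder(false), as seen in config_retriever_test.go below.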
diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go index 8b747f91ad53..8dc8bdb782e6 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go @@ -40,7 +40,8 @@ type settingsItem struct { } type autoscalingSettingsProcessor struct { - store *store + store *store + builder *model.PodAutoscalerInternalBuilder // State is kept nil until the first full config is processed state map[string]settingsItem // We are guaranteed to be called in a single thread for pre/process/post @@ -51,9 +52,10 @@ type autoscalingSettingsProcessor struct { lastProcessingError bool } -func newAutoscalingSettingsProcessor(store *store) autoscalingSettingsProcessor { +func newAutoscalingSettingsProcessor(store *store, builder *model.PodAutoscalerInternalBuilder) autoscalingSettingsProcessor { return autoscalingSettingsProcessor{ - store: store, + store: store, + builder: builder, } } @@ -153,7 +155,7 @@ func (p *autoscalingSettingsProcessor) reconcile(isLeader bool) { if podAutoscalerFound { podAutoscaler.UpdateFromSettings(item.spec, item.receivedTimestamp) } else { - podAutoscaler = model.NewPodAutoscalerFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) + podAutoscaler = p.builder.NewFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) } p.store.UnlockSet(paID, podAutoscaler, configRetrieverStoreID) } diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go index 351627c4cdfd..d02f0348fa08 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" "k8s.io/utils/clock" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" ) @@ -43,7 +44,7 @@ func newMockConfigRetriever(t *testing.T, isLeader func() bool, store *store, cl mockRCClient := &mockRCClient{} - cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient) + cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient, model.NewPodAutoscalerInternalBuilder(false)) assert.NoError(t, err) return cr, mockRCClient diff --git a/pkg/clusteragent/autoscaling/workload/controller.go b/pkg/clusteragent/autoscaling/workload/controller.go index 7a7eb83c541f..dd1307c17d7f 100644 --- a/pkg/clusteragent/autoscaling/workload/controller.go +++ b/pkg/clusteragent/autoscaling/workload/controller.go @@ -72,6 +72,7 @@ type Controller struct { eventRecorder record.EventRecorder store *store + builder *model.PodAutoscalerInternalBuilder limitHeap *limitHeap @@ -103,12 +104,14 @@ func NewController( localSender sender.Sender, limitHeap *limitHeap, globalTagsFunc func() []string, + builder *model.PodAutoscalerInternalBuilder, ) (*Controller, error) { c := &Controller{ clusterID: clusterID, clock: clock, eventRecorder: eventRecorder, localSender: localSender, + builder: builder, isFallbackEnabled: false, // keep fallback disabled by default } @@ -206,7 +209,7 @@ func (c *Controller) processPodAutoscaler(ctx context.Context, key, ns, name str // If the object is present in Kubernetes, we will update our local version // Otherwise, we clear it from our local store if podAutoscaler != 
nil { - c.store.Set(key, model.NewPodAutoscalerInternal(podAutoscaler), c.ID) + c.store.Set(key, c.builder.NewFromKubernetes(podAutoscaler), c.ID) } else { c.store.Delete(key, c.ID) } @@ -225,7 +228,7 @@ func (c *Controller) syncPodAutoscaler(ctx context.Context, key, ns, name string if podAutoscaler != nil { // If we don't have an instance locally, we create it. Deletion is handled through setting the `Deleted` flag log.Debugf("Creating internal PodAutoscaler: %s from Kubernetes object", key) - pai := model.NewPodAutoscalerInternal(podAutoscaler) + pai := c.builder.NewFromKubernetes(podAutoscaler) c.store.UnlockSet(key, pai, c.ID) } else { // If podAutoscaler == nil, both objects are nil, nothing to do diff --git a/pkg/clusteragent/autoscaling/workload/controller_test.go b/pkg/clusteragent/autoscaling/workload/controller_test.go index 5fd7759a5961..5f536ef8f3d3 100755 --- a/pkg/clusteragent/autoscaling/workload/controller_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_test.go @@ -66,7 +66,7 @@ func newFixture(t *testing.T, testTime time.Time) *fixture { ControllerFixture: autoscaling.NewFixture( t, podAutoscalerGVR, func(fakeClient *fake.FakeDynamicClient, informer dynamicinformer.DynamicSharedInformerFactory, isLeader func() bool) (*autoscaling.Controller, error) { - c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil) + c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil, model.NewPodAutoscalerInternalBuilder(false)) if err != nil { return nil, err } @@ -1520,6 +1520,9 @@ func TestProfileManagedDPA(t *testing.T) { condition(datadoghqcommon.DatadogPodAutoscalerHorizontalAbleToScaleCondition, corev1.ConditionUnknown, "", "", testTime), condition(datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, corev1.ConditionUnknown, "", "", testTime), }, + Options: &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), + }, }, } f.ExpectCreateAction(mustUnstructured(t, expectedDPA)) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical.go b/pkg/clusteragent/autoscaling/workload/controller_vertical.go index 32abd600e236..0232eaae0a16 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical.go @@ -87,6 +87,11 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. return autoscaling.NoRequeue, nil } + // Fetch pods early: the QOS class of existing pods determines the effective burstable mode + // when burstable is not explicitly configured (Guaranteed QOS defaults to non-burstable). + pods := u.podWatcher.GetPodsForOwner(target) + autoscalerInternal.SetPodsGuaranteedQOS(isPodsGuaranteedQOS(pods)) + // Deep-copy to avoid mutating the original recommendation stored in mainScalingValues/fallbackScalingValues. // Without this, clamped values would persist and the VerticalScalingLimited condition would be // cleared on the next sync since constraints re-applied to already-clamped values are no-ops. @@ -104,8 +109,6 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. // differs from non-burstable — no extra suffix required. 
recommendationID := constrainedVertical.ResourcesHash - // Get the pods for the pod owner - pods := u.podWatcher.GetPodsForOwner(target) if len(pods) == 0 { // If we found nothing, we'll wait just until the next sync log.Debugf("No pods found for autoscaler: %s, gvk: %s, name: %s", autoscalerInternal.ID(), targetGVK.String(), autoscalerInternal.Spec().TargetRef.Name) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index b4497cd370f4..d11a433fab4f 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -42,10 +42,11 @@ const ( const inPlaceResizeSupportedCacheTTL = 15 * time.Minute -// removeLimitSentinel is stored in a container Limits map by applyVerticalConstraints (burstable -// mode) to signal "delete this limit from the pod instead of setting it to this value". -// Negative quantities are never valid as real Kubernetes resource values, making the intent -// unambiguous and easy to identify with quantity.Sign() < 0. +// removeLimitSentinel is placed in ContainerResources.Limits by applyVerticalConstraints +// to signal that an existing limit must be actively deleted from the live pod, rather +// than set to a new value. Negative quantities are never valid Kubernetes resource values, +// making this unambiguous. The sentinel must be inserted AFTER the limits >= requests +// invariant check to prevent it from being overwritten. var removeLimitSentinel = resource.MustParse("-1") // isInPlaceResizeSupported checks whether the API server exposes the pods/resize @@ -422,6 +423,18 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 1 of 2): + // Remove CPU from limits before clamping and the invariant check so neither + // touches the CPU limit. The sentinel is inserted in phase 2, after the invariant. + isCPURemoveLimits := constraint.ControlledValues != nil && + *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits + if isCPURemoveLimits { + if _, hasCPULimit := cr.Limits[corev1.ResourceCPU]; hasCPULimit { + delete(cr.Limits, corev1.ResourceCPU) + modified = true + } + } + // Resolve min/max bounds for clamping. // New top-level MinAllowed/MaxAllowed apply to both requests and limits. // Deprecated Requests.MinAllowed/MaxAllowed apply to requests only. @@ -444,6 +457,18 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 2 of 2): + // Insert sentinel AFTER the invariant check to prevent it from being overwritten. + // The sentinel signals to the pod patcher that any pre-existing CPU limit must be + // actively deleted from the live pod, even if the backend never included a CPU limit. + if isCPURemoveLimits { + if cr.Limits == nil { + cr.Limits = corev1.ResourceList{} + } + cr.Limits[corev1.ResourceCPU] = removeLimitSentinel + modified = true + } + kept = append(kept, cr) } @@ -613,6 +638,20 @@ func getPodResizeStatus(pod *workloadmeta.KubernetesPod, recommendationID string return PodResizeStatusCompleted, time.Time{} } +// isPodsGuaranteedQOS returns true if all pods have Guaranteed QOS class. +// Returns false when the slice is empty (unknown QOS → no override applied). 
+func isPodsGuaranteedQOS(pods []*workloadmeta.KubernetesPod) bool { + if len(pods) == 0 { + return false + } + for _, pod := range pods { + if pod.QOSClass != string(corev1.PodQOSGuaranteed) { + return false + } + } + return true +} + func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutoscalerInternal, pod *workloadmeta.KubernetesPod) []workloadpatcher.ContainerResourcePatch { containersResources := autoscalerInternal.ScalingValues().Vertical.ContainerResources @@ -622,6 +661,8 @@ func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutos recoByName[cr.Name] = cr } + // IsBurstable() is safe to call here: SetPodsGuaranteedQOS was called at the top of sync(), + // so the Guaranteed-QOS override is already reflected in the returned value. burstable := autoscalerInternal.IsBurstable() // Build the list of patches ordered to API server pod container order. diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 6551116d67f9..dc5e92da2430 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -691,6 +691,50 @@ func TestApplyVerticalConstraints_AllFeatures(t *testing.T) { assert.Equal(t, expectedHash, vertical.ResourcesHash) } +func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) { + vertical := &model.VerticalScalingValues{ + ResourcesHash: "original-hash", + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{ + { + Name: "app", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": resource.MustParse("600m"), "memory": resource.MustParse("512Mi")}, + }, + }, + } + constraints := &datadoghqcommon.DatadogPodAutoscalerConstraints{ + Containers: []datadoghqcommon.DatadogPodAutoscalerContainerConstraints{ + { + Name: "app", + ControlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + }, + }, + } + + limitErr, err := applyVerticalConstraints(vertical, constraints, false) + require.NoError(t, err) + assert.Nil(t, limitErr) + + require.Len(t, vertical.ContainerResources, 1) + app := vertical.ContainerResources[0] + + // CPU limit must carry the sentinel value so patchContainerResources removes it from the pod + cpuLimit, exists := app.Limits[corev1.ResourceCPU] + require.True(t, exists, "CPU key must be present in limits (sentinel)") + assert.Equal(t, 0, cpuLimit.Cmp(removeLimitSentinel), "CPU limit must be the remove-limit sentinel value") + // Memory limit must be preserved + assert.Equal(t, resource.MustParse("512Mi"), app.Limits[corev1.ResourceMemory], "memory limit must be preserved") + // CPU and memory requests must be preserved + assert.Equal(t, resource.MustParse("300m"), app.Requests[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("256Mi"), app.Requests[corev1.ResourceMemory]) + + // Hash must be recomputed + assert.NotEqual(t, "original-hash", vertical.ResourcesHash) + expectedHash, err := autoscaling.ObjectHash(vertical.ContainerResources) + require.NoError(t, err) + assert.Equal(t, expectedHash, vertical.ResourcesHash) +} + func TestApplyVerticalConstraints_ValidationErrors(t *testing.T) { vertical := &model.VerticalScalingValues{ ResourcesHash: "original-hash", @@ -810,4 +854,54 @@ func 
TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be set when not burstable") assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty when not burstable") }) + + t.Run("Guaranteed QOS suppresses cluster-default burstable: cpu limit preserved", func(t *testing.T) { + ai := (&model.FakePodAutoscalerInternal{ + Namespace: "default", + Name: "ai", + ScalingValues: model.ScalingValues{Vertical: sv}, + ClusterBurstableDefault: true, + PodsGuaranteedQOS: true, + }).Build() + + patches := fromAutoscalerToContainerResourcePatches(&ai, pod) + + require.Len(t, patches, 1) + p := patches[0] + assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be preserved for Guaranteed QOS pods") + assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty for Guaranteed QOS pods") + }) +} + +func TestIsPodsGuaranteedQOS(t *testing.T) { + guaranteed := string(corev1.PodQOSGuaranteed) + burstable := string(corev1.PodQOSBurstable) + + t.Run("empty slice returns false", func(t *testing.T) { + assert.False(t, isPodsGuaranteedQOS(nil)) + assert.False(t, isPodsGuaranteedQOS([]*workloadmeta.KubernetesPod{})) + }) + + t.Run("all Guaranteed returns true", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, + {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: guaranteed}, + } + assert.True(t, isPodsGuaranteedQOS(pods)) + }) + + t.Run("mixed QOS returns false", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, + {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: burstable}, + } + assert.False(t, isPodsGuaranteedQOS(pods)) + }) + + t.Run("single Burstable pod returns false", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: burstable}, + } + assert.False(t, isPodsGuaranteedQOS(pods)) + }) } diff --git a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go index 601ef7b27f4d..943d4cd42050 100644 --- a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go +++ b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go @@ -1660,7 +1660,7 @@ func TestCalculateHorizontalRecommendations(t *testing.T) { Conditions: []datadoghqcommon.DatadogPodAutoscalerCondition{}, }, } - dpai := model.NewPodAutoscalerInternal(dpa) + dpai := model.NewPodAutoscalerInternalBuilder(false).NewFromKubernetes(dpa) r := newReplicaCalculator(clock.RealClock{}, pw) res, err := r.calculateHorizontalRecommendations(dpai, lStore) diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index f6459057adcb..79b1af399ad0 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -83,6 +83,17 @@ type PodAutoscalerInternal struct { // previewOptions holds the parsed preview feature flags from the DPA annotations previewOptions previewOptions + // clusterBurstableDefault is the cluster-level default for burstable mode, read once + // from config at controller startup and injected via PodAutoscalerInternalBuilder. + // It is the lowest-priority fallback: spec.options.burstable > preview annotation > this. 
+ clusterBurstableDefault bool + + // podsGuaranteedQOS is true when all observed pods are in Guaranteed QOS class. + // Set by the vertical controller each sync via SetPodsGuaranteedQOS. + // When true and burstable is not explicitly configured, IsBurstable returns false to avoid + // silently changing the pod's QOS class by removing CPU limits. + podsGuaranteedQOS bool + // scalingValues represents the active scaling values that should be used scalingValues ScalingValues @@ -186,31 +197,44 @@ type PodAutoscalerInternal struct { customRecommenderConfiguration *RecommenderConfiguration } -// NewPodAutoscalerInternal creates a new PodAutoscalerInternal from a Kubernetes CR -func NewPodAutoscalerInternal(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { +// PodAutoscalerInternalBuilder creates PodAutoscalerInternal instances with a cluster-level +// burstable default read once at controller startup. Create one instance per process via +// NewPodAutoscalerInternalBuilder and reuse it for all autoscaler construction calls. +type PodAutoscalerInternalBuilder struct { + clusterBurstableDefault bool +} + +// NewPodAutoscalerInternalBuilder creates a builder. clusterBurstableDefault is the +// lowest-priority fallback for IsBurstable() and should be read from config once at startup. +func NewPodAutoscalerInternalBuilder(clusterBurstableDefault bool) *PodAutoscalerInternalBuilder { + return &PodAutoscalerInternalBuilder{clusterBurstableDefault: clusterBurstableDefault} +} + +// NewFromKubernetes creates a PodAutoscalerInternal from a Kubernetes CR. +func (b *PodAutoscalerInternalBuilder) NewFromKubernetes(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: podAutoscaler.Namespace, - name: podAutoscaler.Name, + namespace: podAutoscaler.Namespace, + name: podAutoscaler.Name, + clusterBurstableDefault: b.clusterBurstableDefault, } pai.UpdateFromPodAutoscaler(podAutoscaler) pai.UpdateFromStatus(&podAutoscaler.Status) - return pai } -// NewPodAutoscalerFromSettings creates a new PodAutoscalerInternal from settings received through remote configuration -func NewPodAutoscalerFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { +// NewFromSettings creates a PodAutoscalerInternal from settings received through remote configuration. +func (b *PodAutoscalerInternalBuilder) NewFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { pda := PodAutoscalerInternal{ - namespace: ns, - name: name, + namespace: ns, + name: name, + clusterBurstableDefault: b.clusterBurstableDefault, } pda.UpdateFromSettings(podAutoscalerSpec, settingsTimestamp) - return pda } -// NewPodAutoscalerFromProfile creates a PodAutoscalerInternal for a profile-managed workload. -func NewPodAutoscalerFromProfile( +// NewFromProfile creates a PodAutoscalerInternal for a profile-managed workload. 
+func (b *PodAutoscalerInternalBuilder) NewFromProfile( ns, name, profileName string, template *datadoghq.DatadogPodAutoscalerTemplate, targetRef autoscalingv2.CrossVersionObjectReference, @@ -218,8 +242,9 @@ func NewPodAutoscalerFromProfile( previewAnnotation string, ) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: ns, - name: name, + namespace: ns, + name: name, + clusterBurstableDefault: b.clusterBurstableDefault, upstreamCR: &datadoghq.DatadogPodAutoscaler{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns, @@ -228,7 +253,6 @@ func NewPodAutoscalerFromProfile( }, } pai.UpdateFromProfile(profileName, template, targetRef, templateHash, previewAnnotation) - return pai } @@ -672,12 +696,31 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool { return !(scaleUpDisabled && scaleDownDisabled) } -// IsBurstable returns true if the burstable preview option is enabled for this autoscaler. -// The value is read directly from the cached previewOptions struct — no JSON decode per call. -// For profile-managed DPAs previewOptions is populated by UpdateFromProfile; for standalone -// DPAs it is populated by UpdateFromPodAutoscaler. +// SetPodsGuaranteedQOS records whether all currently observed pods are in Guaranteed QOS class. +// Must be called by the vertical controller before IsBurstable() is consulted in each sync cycle. +func (p *PodAutoscalerInternal) SetPodsGuaranteedQOS(guaranteed bool) { + p.podsGuaranteedQOS = guaranteed +} + +// IsBurstable returns true if burstable mode is enabled for this autoscaler. +// +// Priority: +// 1. spec.options.burstable (explicit, highest) +// 2. preview annotation (explicit via profile) +// 3. podsGuaranteedQOS=true → false (avoid silently changing pod QOS class by removing CPU limits) +// 4. cluster-level default (from builder) func (p *PodAutoscalerInternal) IsBurstable() bool { - return p.previewOptions.Burstable + spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + return *spec.Options.Burstable + } + if p.previewOptions.Burstable { + return true + } + if p.podsGuaranteedQOS { + return false + } + return p.clusterBurstableDefault } // PreviewAnnotation returns the JSON-encoded preview annotation forwarded from the cluster @@ -991,7 +1034,7 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat Source: p.scalingValues.Vertical.Source, GeneratedAt: metav1.NewTime(p.scalingValues.Vertical.Timestamp), Version: p.scalingValues.Vertical.ResourcesHash, - DesiredResources: containerResourcesForStatus(p.scalingValues.Vertical.ContainerResources), + DesiredResources: p.scalingValues.Vertical.ContainerResourcesForStatus(), Scaled: p.scaledReplicas, Evicted: p.evictedReplicas, PodCPURequest: cpuReqSum, @@ -1088,6 +1131,20 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat } status.Conditions = append(status.Conditions, newCondition(rolloutStatus, verticalReason, verticalMessage, currentTime, datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, existingConditions)) + // Populate effective options status. + // spec.options.burstable is reported when explicitly set; the annotation fallback is + // reported only when true (it has no explicit false — absence means default). 
+ spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(*spec.Options.Burstable), + } + } else if p.previewOptions.Burstable { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), + } + } + return status } diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index 3785c44aa6a6..a0aed02211d1 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -515,7 +515,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { } t.Run("NewPodAutoscalerFromProfile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.Equal(t, "prod", pai.Namespace()) assert.Equal(t, "web-app-a1b2c3d4", pai.Name()) @@ -531,7 +531,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile same profile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") // Simulate some scaling state pai.SetCurrentReplicas(5) @@ -556,7 +556,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile different profile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") newTemplate := &datadoghq.DatadogPodAutoscalerTemplate{ Objectives: []datadoghqcommon.DatadogPodAutoscalerObjective{ @@ -572,7 +572,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { t.Run("IsProfileManaged true/false", func(t *testing.T) { // Profile-managed - pai := NewPodAutoscalerFromProfile("ns", "name", "prof", template, targetRef, "", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("ns", "name", "prof", template, targetRef, "", "") assert.True(t, pai.IsProfileManaged()) assert.Equal(t, "prof", pai.ProfileName()) @@ -621,17 +621,17 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("NewPodAutoscalerFromProfile with burstable annotation sets IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) assert.True(t, pai.IsBurstable()) }) t.Run("NewPodAutoscalerFromProfile without burstable annotation does not set IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) }) t.Run("UpdateFromProfile burstable annotation toggling", func(t *testing.T) { - 
pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) // Enable burstable via annotation @@ -642,6 +642,175 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { pai.UpdateFromProfile("high-cpu", template, targetRef, "hash1-v3", "") assert.False(t, pai.IsBurstable()) }) + + t.Run("IsBurstable spec.options.burstable=true with no annotation", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec.options.burstable=false overrides annotation", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.False(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec.options.burstable=nil falls back to annotation", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{}, + }, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable uses ClusterBurstableDefault when no spec or annotation", func(t *testing.T) { + paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() + assert.False(t, paiOff.IsBurstable()) + paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() + assert.True(t, paiOn.IsBurstable()) + }) + + t.Run("IsBurstable annotation overrides ClusterBurstableDefault", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + ClusterBurstableDefault: false, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec overrides ClusterBurstableDefault and annotation", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + ClusterBurstableDefault: true, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.False(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable: Guaranteed QOS overrides cluster default", func(t *testing.T) { + pai := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: true}.Build() + assert.False(t, pai.IsBurstable(), "Guaranteed QOS must suppress cluster default burstable=true") + + paiNonGuaranteed := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: false}.Build() + assert.True(t, paiNonGuaranteed.IsBurstable(), "non-Guaranteed pods must fall back to cluster default") + }) + + t.Run("IsBurstable: spec.options.burstable=true wins over Guaranteed QOS", func(t *testing.T) { + burstable := true + pai := 
FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PodsGuaranteedQOS: true, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.True(t, pai.IsBurstable(), "explicit spec.options.burstable=true must win even for Guaranteed pods") + }) + + t.Run("IsBurstable: preview annotation wins over Guaranteed QOS", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + PodsGuaranteedQOS: true, + }.Build() + assert.True(t, pai.IsBurstable(), "explicit preview annotation must win even for Guaranteed pods") + }) + + t.Run("IsBurstable: no explicit config and non-Guaranteed pods use cluster default", func(t *testing.T) { + paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() + assert.False(t, paiOff.IsBurstable()) + paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() + assert.True(t, paiOn.IsBurstable()) + }) + + t.Run("BuildStatus options.burstable from spec", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable=false from spec is reported", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.False(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable from annotation", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options nil when burstable not set", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + assert.Nil(t, status.Options) + }) } func TestContainerResourcesForStatus(t *testing.T) { diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go index c5f6a24ca9ab..9f4fc811ccf6 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go @@ -71,6 +71,8 @@ type FakePodAutoscalerInternal struct { AppliedProfileHash string TargetGVK schema.GroupVersionKind CustomRecommenderConfiguration *RecommenderConfiguration + ClusterBurstableDefault bool + PodsGuaranteedQOS bool } // Build creates a PodAutoscalerInternal object from the FakePodAutoscalerInternal. 
@@ -107,6 +109,8 @@ func (f FakePodAutoscalerInternal) Build() PodAutoscalerInternal { namespace: f.Namespace, name: f.Name, generation: f.Generation, + clusterBurstableDefault: f.ClusterBurstableDefault, + podsGuaranteedQOS: f.PodsGuaranteedQOS, upstreamCR: upstreamCR, settingsTimestamp: f.SettingsTimestamp, creationTimestamp: f.CreationTimestamp, diff --git a/pkg/clusteragent/autoscaling/workload/model/recommendations.go b/pkg/clusteragent/autoscaling/workload/model/recommendations.go index edbc9df005c1..fdd5acf29c5e 100644 --- a/pkg/clusteragent/autoscaling/workload/model/recommendations.go +++ b/pkg/clusteragent/autoscaling/workload/model/recommendations.go @@ -104,6 +104,36 @@ func (v *VerticalScalingValues) DeepCopy() *VerticalScalingValues { return out } +// ContainerResourcesForStatus returns a copy of ContainerResources safe to write to the DPA status. +// Any limit entry carrying a negative quantity is removed: negative quantities are never valid +// Kubernetes resource values and are used internally as sentinels (e.g. to signal that a limit +// must be actively deleted from a live pod). Exposing them in the status would be confusing. +func (v *VerticalScalingValues) ContainerResourcesForStatus() []datadoghqcommon.DatadogPodAutoscalerContainerResources { + if v.ContainerResources == nil { + return nil + } + result := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(v.ContainerResources)) + for i, cr := range v.ContainerResources { + cp := datadoghqcommon.DatadogPodAutoscalerContainerResources{Name: cr.Name} + if cr.Requests != nil { + cp.Requests = make(corev1.ResourceList, len(cr.Requests)) + for k, q := range cr.Requests { + cp.Requests[k] = q.DeepCopy() + } + } + for res, qty := range cr.Limits { + if qty.Sign() >= 0 { + if cp.Limits == nil { + cp.Limits = make(corev1.ResourceList) + } + cp.Limits[res] = qty.DeepCopy() + } + } + result[i] = cp + } + return result +} + // SumCPUMemoryRequests sums the CPU and memory requests of all containers func (v *VerticalScalingValues) SumCPUMemoryRequests() (cpu, memory resource.Quantity) { for _, container := range v.ContainerResources { diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher.go b/pkg/clusteragent/autoscaling/workload/pod_patcher.go index 9a3fc5f22086..a59dbb3fd296 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher.go @@ -237,22 +237,22 @@ func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerR if cont.Resources.Requests == nil { cont.Resources.Requests = corev1.ResourceList{} } - for resourceName, limit := range reco.Limits { - if limit.Sign() < 0 { - // Negative value (removeLimitSentinel) is set by applyVerticalConstraints in burstable - // mode, meaning "remove this limit from the container". - if _, hasCurrent := cont.Resources.Limits[resourceName]; hasCurrent { - delete(cont.Resources.Limits, resourceName) + for res, limit := range reco.Limits { + if limit.Cmp(removeLimitSentinel) == 0 { + // Sentinel: applyVerticalConstraints signalled that this limit must be actively + // removed from the pod (e.g. CPURequestsRemoveLimitsMemoryRequestsAndLimits). 
+ if _, exists := cont.Resources.Limits[res]; exists { + delete(cont.Resources.Limits, res) patched = true } - } else if cont.Resources.Limits[resourceName] != limit { - cont.Resources.Limits[resourceName] = limit + } else if limit.Cmp(cont.Resources.Limits[res]) != 0 { + cont.Resources.Limits[res] = limit patched = true } } - for resourceName, request := range reco.Requests { - if cont.Resources.Requests[resourceName] != request { - cont.Resources.Requests[resourceName] = request + for res, request := range reco.Requests { + if request.Cmp(cont.Resources.Requests[res]) != 0 { + cont.Resources.Requests[res] = request patched = true } } diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go index 9e3663e1c59b..3b1890db3b36 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go @@ -1133,6 +1133,42 @@ func TestPatchContainerResources(t *testing.T) { expectedLimits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m")}, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes existing CPU limit", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits is idempotent when CPU limit already absent", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + }, + expectedPatched: false, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { @@ -1275,6 +1311,30 @@ func TestPatchPod(t *testing.T) { }, expectedPatched: false, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes CPU limit from pod container", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "app-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, 
"memory": resource.MustParse("512Mi")}, + }, + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "app-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + }, + }, + }, + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go index 450c0c30fb1b..9ae1cac31c79 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go @@ -39,6 +39,7 @@ type AutoscalerSyncer struct { profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal] dpaStore *autoscaling.Store[model.PodAutoscalerInternal] isLeader func() bool + builder *model.PodAutoscalerInternalBuilder mu sync.Mutex dpaOwnership map[string]string // dpa store key → profile name @@ -59,12 +60,14 @@ func NewAutoscalerSyncer( profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal], dpaStore *autoscaling.Store[model.PodAutoscalerInternal], isLeader func() bool, + builder *model.PodAutoscalerInternalBuilder, readyDeps ...cache.InformerSynced, ) *AutoscalerSyncer { s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: isLeader, + builder: builder, dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), readyDeps: readyDeps, @@ -275,7 +278,7 @@ func (s *AutoscalerSyncer) ensureDPA(dpaKey string, d desiredDPA) { if !found { _, name, _ := cache.SplitMetaNamespaceKey(dpaKey) log.Infof("Creating DPA %s for profile %s", dpaKey, d.profileName) - pai = model.NewPodAutoscalerFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) + pai = s.builder.NewFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) s.dpaStore.UnlockSet(dpaKey, pai, syncerStoreID) return } diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go index 7faf912e3f5b..ff7e78a869ae 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go @@ -36,6 +36,7 @@ func newTestSyncer() (*AutoscalerSyncer, *autoscaling.Store[model.PodAutoscalerP profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -413,7 +414,7 @@ func TestAutoscalerSyncerWaitsForReadyDeps(t *testing.T) { profileStore.Set("high-cpu", prof, "test") var ready atomic.Bool - s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, ready.Load) + s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, model.NewPodAutoscalerInternalBuilder(false), ready.Load) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -461,13 +462,14 @@ func 
TestAutoscalerSyncerRebuildOwnershipCleansOrphans(t *testing.T) { targetRef := autoscalingv2.CrossVersionObjectReference{ Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } - orphanDPA := model.NewPodAutoscalerFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + orphanDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set("prod/web-app-9526aeb3", orphanDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -503,13 +505,14 @@ func TestAutoscalerSyncerRebuildOwnershipKeepsActiveDPAs(t *testing.T) { Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } _, dpaName, _ := cache.SplitMetaNamespaceKey(dpaKey) - existingDPA := model.NewPodAutoscalerFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + existingDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set(dpaKey, existingDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } diff --git a/pkg/clusteragent/autoscaling/workload/provider/provider.go b/pkg/clusteragent/autoscaling/workload/provider/provider.go index 48e59ecb63c3..17b5a69fda88 100644 --- a/pkg/clusteragent/autoscaling/workload/provider/provider.go +++ b/pkg/clusteragent/autoscaling/workload/provider/provider.go @@ -66,8 +66,15 @@ func StartWorkloadAutoscaling( podPatcher := workload.NewPodPatcher(store, patcher, eventRecorder) podWatcher := workload.NewPodWatcher(wlm, podPatcher) + // Read cluster-level config once at startup and bake it into the builder. + // All PodAutoscalerInternal instances created via the builder inherit these defaults + // without any per-reconcile config lookups. 
+ builder := model.NewPodAutoscalerInternalBuilder( + pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable"), + ) + clock := clock.RealClock{} - _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient) + _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient, builder) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling config retriever: %w", err) } @@ -91,7 +98,7 @@ func StartWorkloadAutoscaling( maxDatadogPodAutoscalerObjects := pkgconfigsetup.Datadog().GetInt("autoscaling.workload.limit") limitHeap := autoscaling.NewHashHeap(maxDatadogPodAutoscalerObjects, store, (*model.PodAutoscalerInternal).CreationTimestamp) - controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc) + controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc, builder) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling controller: %w", err) } @@ -128,7 +135,7 @@ func StartWorkloadAutoscaling( profileController.InitialSyncDone, ) - autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, profileController.InitialSyncDone, workloadWatcher.HasSynced) + autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, builder, profileController.InitialSyncDone, workloadWatcher.HasSynced) builtinManager := profile.NewBuiltinProfileManager(apiCl.DynamicInformerCl, isLeaderFunc) // Start informers & controllers (informers can be started multiple times) diff --git a/pkg/config/setup/common_settings.go b/pkg/config/setup/common_settings.go index 02f78ad2933f..a12d00c4783e 100644 --- a/pkg/config/setup/common_settings.go +++ b/pkg/config/setup/common_settings.go @@ -1332,6 +1332,7 @@ func autoscaling(config pkgconfigmodel.Setup) { config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.cert_file", "") config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.key_file", "") config.BindEnvAndSetDefault("autoscaling.workload.in_place_vertical_scaling.enabled", false) + config.BindEnvAndSetDefault("autoscaling.workload.options.burstable", true) config.BindEnvAndSetDefault("autoscaling.failover.metrics", []string{"container.memory.usage", "container.cpu.usage"}) // Cluster autoscaling product diff --git a/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml new file mode 100644 index 000000000000..d5cf1e21e78b --- /dev/null +++ b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add support for ``CPURequestsRemoveLimitsMemoryRequestsAndLimits`` as a container + ``controlledValues`` in ``DatadogPodAutoscaler`` and ``DatadogPodAutoscalerClusterProfile``. + When set, CPU requests are controlled and any existing CPU limits are removed, + allowing containers to burst freely. Memory requests and limits are controlled as usual.
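To make the release note concrete, here is a minimal sketch of how the new controlled-values mode flows through the code in this diff. It mirrors TestApplyVerticalConstraints_CPURequestsRemoveLimits and assumes the same package-internal context and imports as controller_vertical_helpers_test.go; nothing here is meant as an additional API.

```go
// Sketch only: assumes the imports and in-package access used by the tests above.
vertical := &model.VerticalScalingValues{
	ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{{
		Name:     "app",
		Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")},
		Limits:   corev1.ResourceList{"cpu": resource.MustParse("600m"), "memory": resource.MustParse("512Mi")},
	}},
}
constraints := &datadoghqcommon.DatadogPodAutoscalerConstraints{
	Containers: []datadoghqcommon.DatadogPodAutoscalerContainerConstraints{{
		Name:             "app",
		ControlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits),
	}},
}

// Phase 1 drops the recommended CPU limit before clamping; phase 2 stores the negative
// removeLimitSentinel after the limits >= requests invariant check.
_, err := applyVerticalConstraints(vertical, constraints, false)

// On success, vertical.ContainerResources[0].Limits["cpu"] carries the sentinel, so
// patchContainerResources deletes any pre-existing CPU limit from the live pod while the
// memory limit and both requests are preserved, and ContainerResourcesForStatus hides the
// sentinel from the DPA status.
_ = err
```

Using a negative quantity as the marker keeps the recommendation shape unchanged: no new field is needed on ContainerResources, and the pod patcher can detect the delete intent with a single comparison against removeLimitSentinel.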