diff --git a/api/scaler/v1alpha1/budaiscaler_types.go b/api/scaler/v1alpha1/budaiscaler_types.go index cabba86..a26e437 100644 --- a/api/scaler/v1alpha1/budaiscaler_types.go +++ b/api/scaler/v1alpha1/budaiscaler_types.go @@ -102,6 +102,10 @@ type BudAIScalerSpec struct { // Behavior configures the scaling behavior for scale up and scale down. // +optional Behavior *ScalingBehavior `json:"behavior,omitempty"` + + // ScaleToZeroConfig configures scale-to-zero behavior. + // +optional + ScaleToZeroConfig *ScaleToZeroConfig `json:"scaleToZeroConfig,omitempty"` } // ScalingStrategyType defines the type of scaling algorithm. @@ -488,6 +492,25 @@ type StartingPodsConfig struct { BypassGateOnPanic *bool `json:"bypassGateOnPanic,omitempty"` } +// ScaleToZeroConfig configures scale-to-zero behavior. +type ScaleToZeroConfig struct { + // Enabled turns on scale-to-zero behavior. + // +optional + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + + // ActivationScale is the number of replicas to scale to when waking from zero. + // +optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + ActivationScale *int32 `json:"activationScale,omitempty"` + + // GracePeriod is the duration to wait with zero demand before scaling to zero. + // +optional + // +kubebuilder:default="5m" + GracePeriod *metav1.Duration `json:"gracePeriod,omitempty"` +} + // ScalingRules defines rules for scaling in a particular direction. type ScalingRules struct { // StabilizationWindowSeconds is the number of seconds to look back @@ -564,6 +587,11 @@ type BudAIScalerStatus struct { // MultiClusterStatus contains federation status. // +optional MultiClusterStatus *MultiClusterStatus `json:"multiClusterStatus,omitempty"` + + // ZeroDemandSince tracks when zero demand was first detected. + // Used for scale-to-zero grace period calculation. + // +optional + ZeroDemandSince *metav1.Time `json:"zeroDemandSince,omitempty"` } // ScalingDecision records a single scaling decision. diff --git a/api/scaler/v1alpha1/zz_generated.deepcopy.go b/api/scaler/v1alpha1/zz_generated.deepcopy.go index 6928154..4d89d91 100644 --- a/api/scaler/v1alpha1/zz_generated.deepcopy.go +++ b/api/scaler/v1alpha1/zz_generated.deepcopy.go @@ -135,6 +135,11 @@ func (in *BudAIScalerSpec) DeepCopyInto(out *BudAIScalerSpec) { *out = new(ScalingBehavior) (*in).DeepCopyInto(*out) } + if in.ScaleToZeroConfig != nil { + in, out := &in.ScaleToZeroConfig, &out.ScaleToZeroConfig + *out = new(ScaleToZeroConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BudAIScalerSpec. @@ -193,6 +198,10 @@ func (in *BudAIScalerStatus) DeepCopyInto(out *BudAIScalerStatus) { *out = new(MultiClusterStatus) (*in).DeepCopyInto(*out) } + if in.ZeroDemandSince != nil { + in, out := &in.ZeroDemandSince, &out.ZeroDemandSince + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BudAIScalerStatus. @@ -463,6 +472,31 @@ func (in *PredictionStatus) DeepCopy() *PredictionStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScaleToZeroConfig) DeepCopyInto(out *ScaleToZeroConfig) { + *out = *in + if in.ActivationScale != nil { + in, out := &in.ActivationScale, &out.ActivationScale + *out = new(int32) + **out = **in + } + if in.GracePeriod != nil { + in, out := &in.GracePeriod, &out.GracePeriod + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScaleToZeroConfig. +func (in *ScaleToZeroConfig) DeepCopy() *ScaleToZeroConfig { + if in == nil { + return nil + } + out := new(ScaleToZeroConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ScalingBehavior) DeepCopyInto(out *ScalingBehavior) { *out = *in diff --git a/config/crd/scaler.bud.studio_budaiscalers.yaml b/config/crd/scaler.bud.studio_budaiscalers.yaml index f8d42d8..d78c40f 100644 --- a/config/crd/scaler.bud.studio_budaiscalers.yaml +++ b/config/crd/scaler.bud.studio_budaiscalers.yaml @@ -140,8 +140,8 @@ spec: maxStartingPodPercent: description: |- MaxStartingPodPercent is the maximum percentage of total pods that can - be in starting state before gating scale-up. - Set to 0 to disable. Default: 0 (disabled) + be in starting state before gating scale-up. Set to 0 to disable. + Default: 50 (gate scale-up if more than 50% of pods are starting) format: int32 maximum: 100 minimum: 0 @@ -149,8 +149,8 @@ spec: maxStartingPods: description: |- MaxStartingPods is the maximum number of starting pods allowed before - gating further scale-up operations. Set to 0 to disable the gate. - Default: 0 (disabled) + gating further scale-up operations. Set to 0 to disable this gate. + Default: 0 (disabled, uses MaxStartingPodPercent instead) format: int32 minimum: 0 type: integer @@ -242,8 +242,8 @@ spec: maxStartingPodPercent: description: |- MaxStartingPodPercent is the maximum percentage of total pods that can - be in starting state before gating scale-up. - Set to 0 to disable. Default: 0 (disabled) + be in starting state before gating scale-up. Set to 0 to disable. + Default: 50 (gate scale-up if more than 50% of pods are starting) format: int32 maximum: 100 minimum: 0 @@ -251,8 +251,8 @@ spec: maxStartingPods: description: |- MaxStartingPods is the maximum number of starting pods allowed before - gating further scale-up operations. Set to 0 to disable the gate. - Default: 0 (disabled) + gating further scale-up operations. Set to 0 to disable this gate. + Default: 0 (disabled, uses MaxStartingPodPercent instead) format: int32 minimum: 0 type: integer @@ -583,6 +583,26 @@ spec: type: string type: object x-kubernetes-map-type: atomic + scaleToZeroConfig: + description: ScaleToZeroConfig configures scale-to-zero behavior. + properties: + activationScale: + default: 1 + description: ActivationScale is the number of replicas to scale + to when waking from zero. + format: int32 + minimum: 1 + type: integer + enabled: + default: false + description: Enabled turns on scale-to-zero behavior. + type: boolean + gracePeriod: + default: 5m + description: GracePeriod is the duration to wait with zero demand + before scaling to zero. + type: string + type: object scalingStrategy: default: BudScaler description: ScalingStrategy defines the algorithm to use for scaling @@ -910,6 +930,12 @@ spec: type: object maxItems: 10 type: array + zeroDemandSince: + description: |- + ZeroDemandSince tracks when zero demand was first detected. + Used for scale-to-zero grace period calculation. + format: date-time + type: string type: object type: object served: true diff --git a/pkg/context/context.go b/pkg/context/context.go index b653ca5..d9a0bcc 100644 --- a/pkg/context/context.go +++ b/pkg/context/context.go @@ -38,6 +38,7 @@ const ( DefaultPanicWindow = 60 * time.Second DefaultStableWindow = 180 * time.Second DefaultActivationScale = int32(1) + DefaultScaleToZeroGrace = 5 * time.Minute // Starting pods defaults - enabled by default for LLM workloads with long cold starts DefaultStartingPodWeight = 0.5 // Count starting pods as 50% capacity @@ -79,6 +80,7 @@ type ScalingContext interface { // Scale-to-zero configuration GetScaleToZero() bool GetActivationScale() int32 + GetScaleToZeroGrace() time.Duration // Per-metric target values GetTargetValueForMetric(metricName string) (float64, bool) @@ -147,8 +149,9 @@ type baseScalingContext struct { inPanicMode bool // Scale-to-zero - scaleToZero bool - activationScale int32 + scaleToZero bool + activationScale int32 + scaleToZeroGrace time.Duration // Per-metric targets metricTargets map[string]float64 @@ -214,7 +217,7 @@ func NewBaseScalingContext() ScalingContext { startingPodWeight: DefaultStartingPodWeight, maxStartingPods: DefaultMaxStartingPods, maxStartingPodPercent: DefaultMaxStartingPodPercent, - bypassGateOnPanic: DefaultBypassGateOnPanic + bypassGateOnPanic: DefaultBypassGateOnPanic, } } @@ -322,6 +325,19 @@ func NewScalingContextFromScaler(scaler *scalerv1alpha1.BudAIScaler) ScalingCont } } + // Parse scale-to-zero configuration from CRD (takes precedence over annotations) + if scaler.Spec.ScaleToZeroConfig != nil { + if scaler.Spec.ScaleToZeroConfig.Enabled { + ctx.scaleToZero = true + } + if scaler.Spec.ScaleToZeroConfig.ActivationScale != nil { + ctx.activationScale = *scaler.Spec.ScaleToZeroConfig.ActivationScale + } + if scaler.Spec.ScaleToZeroConfig.GracePeriod != nil { + ctx.scaleToZeroGrace = scaler.Spec.ScaleToZeroConfig.GracePeriod.Duration + } + } + // Parse per-metric target values for _, ms := range scaler.Spec.MetricsSources { if targetValue, err := strconv.ParseFloat(ms.TargetValue, 64); err == nil { @@ -403,6 +419,11 @@ func (c *baseScalingContext) parseAnnotations(annotations map[string]string) { c.activationScale = int32(scale) } } + if v, ok := annotations[types.ScaleToZeroGraceAnnotation]; ok { + if dur, err := time.ParseDuration(v); err == nil { + c.scaleToZeroGrace = dur + } + } } // Replica bounds implementation @@ -437,6 +458,12 @@ func (c *baseScalingContext) SetInPanicMode(inPanic bool) { c.inPanicMode = inPa // Scale-to-zero implementation func (c *baseScalingContext) GetScaleToZero() bool { return c.scaleToZero } func (c *baseScalingContext) GetActivationScale() int32 { return c.activationScale } +func (c *baseScalingContext) GetScaleToZeroGrace() time.Duration { + if c.scaleToZeroGrace == 0 { + return DefaultScaleToZeroGrace + } + return c.scaleToZeroGrace +} // Per-metric targets implementation func (c *baseScalingContext) GetTargetValueForMetric(metricName string) (float64, bool) { diff --git a/pkg/controller/budaiscaler/algorithm/algorithm.go b/pkg/controller/budaiscaler/algorithm/algorithm.go index 63b27db..8c882f3 100644 --- a/pkg/controller/budaiscaler/algorithm/algorithm.go +++ b/pkg/controller/budaiscaler/algorithm/algorithm.go @@ -70,6 +70,10 @@ type ScalingRequest struct { // LastScaleTime is when scaling last occurred. LastScaleTime *time.Time + // ZeroDemandSince tracks when zero demand was first detected. + // Used for scale-to-zero grace period calculation. + ZeroDemandSince *time.Time + // ScalingContext provides scaling configuration. ScalingContext ScalingContextProvider } @@ -95,6 +99,10 @@ type ScalingContextProvider interface { GetMaxStartingPods() int32 GetMaxStartingPodPercent() int32 GetBypassGateOnPanic() bool + // Scale-to-zero configuration + GetScaleToZero() bool + GetActivationScale() int32 + GetScaleToZeroGrace() time.Duration } // ScalingRecommendation contains the result of a scaling decision. @@ -253,8 +261,8 @@ func ApplyScaleDownPolicies(currentReplicas, desiredReplicas int32, sctx Scaling } minReplicas := currentReplicas - maxAllowed - if minReplicas < 1 { - minReplicas = 1 + if minReplicas < 0 { + minReplicas = 0 } if desiredReplicas < minReplicas { return minReplicas diff --git a/pkg/controller/budaiscaler/algorithm/algorithm_test.go b/pkg/controller/budaiscaler/algorithm/algorithm_test.go index 011e4fe..c1af515 100644 --- a/pkg/controller/budaiscaler/algorithm/algorithm_test.go +++ b/pkg/controller/budaiscaler/algorithm/algorithm_test.go @@ -18,6 +18,7 @@ package algorithm import ( "testing" + "time" scalerv1alpha1 "github.com/BudEcosystem/scaler/api/scaler/v1alpha1" ) @@ -43,6 +44,10 @@ type mockScalingContext struct { maxStartingPods int32 maxStartingPodPercent int32 bypassGateOnPanic bool + // Scale-to-zero config + scaleToZero bool + activationScale int32 + scaleToZeroGrace time.Duration } func (m *mockScalingContext) GetMinReplicas() int32 { return m.minReplicas } @@ -73,6 +78,14 @@ func (m *mockScalingContext) GetStartingPodWeight() float64 { return m.startin func (m *mockScalingContext) GetMaxStartingPods() int32 { return m.maxStartingPods } func (m *mockScalingContext) GetMaxStartingPodPercent() int32 { return m.maxStartingPodPercent } func (m *mockScalingContext) GetBypassGateOnPanic() bool { return m.bypassGateOnPanic } +func (m *mockScalingContext) GetScaleToZero() bool { return m.scaleToZero } +func (m *mockScalingContext) GetActivationScale() int32 { return m.activationScale } +func (m *mockScalingContext) GetScaleToZeroGrace() time.Duration { + if m.scaleToZeroGrace == 0 { + return 5 * time.Minute + } + return m.scaleToZeroGrace +} func TestApplyScaleUpPolicies(t *testing.T) { tests := []struct { @@ -249,12 +262,12 @@ func TestApplyScaleDownPolicies(t *testing.T) { expected: 8, // desired is within limit }, { - name: "ensure minimum 1 replica", + name: "allow scale to 0 when policy permits", currentReplicas: 2, desiredReplicas: 0, policies: []scalerv1alpha1.ScalingPolicy{{Type: scalerv1alpha1.PodsScalingPolicy, Value: 10, PeriodSeconds: 60}}, selectPolicy: scalerv1alpha1.MaxChangePolicySelect, - expected: 1, // minimum 1 replica + expected: 0, // scale-to-zero now allowed by policy }, } diff --git a/pkg/controller/budaiscaler/algorithm/budscaler.go b/pkg/controller/budaiscaler/algorithm/budscaler.go index 31503db..92ca690 100644 --- a/pkg/controller/budaiscaler/algorithm/budscaler.go +++ b/pkg/controller/budaiscaler/algorithm/budscaler.go @@ -148,6 +148,32 @@ func (a *BudScalerAlgorithm) ComputeRecommendation(ctx context.Context, request // Step 6: Apply min/max constraints rec.DesiredReplicas = a.applyConstraints(costRec, sctx.GetMinReplicas(), sctx.GetMaxReplicas()) + // Step 6.5: Handle scale-to-zero with grace period + if sctx.GetScaleToZero() && rec.DesiredReplicas == 0 && request.CurrentReplicas > 0 { + gracePeriod := sctx.GetScaleToZeroGrace() + if request.ZeroDemandSince != nil { + elapsed := time.Since(*request.ZeroDemandSince) + if elapsed < gracePeriod { + // Not enough time at zero demand, keep at 1 + rec.DesiredReplicas = 1 + rec.Reason = fmt.Sprintf("Scale-to-zero grace period: %v remaining", gracePeriod-elapsed) + } + } else { + // First detection of zero demand, keep at 1 and start tracking + rec.DesiredReplicas = 1 + rec.Reason = "Zero demand detected, starting grace period" + } + } + + // Step 6.6: Apply activation scale when scaling from zero + if request.CurrentReplicas == 0 && rec.DesiredReplicas > 0 { + activationScale := sctx.GetActivationScale() + if activationScale > rec.DesiredReplicas { + rec.DesiredReplicas = activationScale + rec.Reason = fmt.Sprintf("Scaling from zero to activation scale %d", activationScale) + } + } + // Step 7: Apply scaling policies (limit rate of change) if rec.DesiredReplicas > request.CurrentReplicas { rec.DesiredReplicas = ApplyScaleUpPolicies(request.CurrentReplicas, rec.DesiredReplicas, sctx) @@ -328,8 +354,13 @@ func (a *BudScalerAlgorithm) calculateDesiredForMetric(currentValue, targetValue } } - if desired < 1 { - desired = 1 + // Apply minimum based on scale-to-zero configuration + minAllowed := int32(1) + if sctx.GetScaleToZero() && sctx.GetMinReplicas() == 0 { + minAllowed = 0 + } + if desired < minAllowed { + desired = minAllowed } return desired @@ -375,9 +406,13 @@ func (a *BudScalerAlgorithm) calculateGPUBasedRecommendation(request ScalingRequ desiredFloat := float64(request.CurrentReplicas) * ratio desired := int32(math.Ceil(desiredFloat)) - // Apply constraints - if desired < 1 { - desired = 1 + // Apply minimum based on scale-to-zero configuration + minAllowed := int32(1) + if sctx.GetScaleToZero() && sctx.GetMinReplicas() == 0 { + minAllowed = 0 + } + if desired < minAllowed { + desired = minAllowed } return desired @@ -461,8 +496,8 @@ func (a *BudScalerAlgorithm) applyPredictionAdjustment(desired int32, request Sc adjustedFloat := float64(desired)*(1-weight) + float64(predictedReplicas)*weight adjusted := int32(math.Round(adjustedFloat)) - if adjusted < 1 { - adjusted = 1 + if adjusted < 0 { + adjusted = 0 } klog.V(5).InfoS("Applied prediction adjustment", diff --git a/pkg/controller/budaiscaler/algorithm/kpa.go b/pkg/controller/budaiscaler/algorithm/kpa.go index a3ecc74..cbca708 100644 --- a/pkg/controller/budaiscaler/algorithm/kpa.go +++ b/pkg/controller/budaiscaler/algorithm/kpa.go @@ -130,6 +130,30 @@ func (a *KPAAlgorithm) ComputeRecommendation(ctx context.Context, request Scalin // Apply min/max constraints rec.DesiredReplicas = a.applyConstraints(maxDesired, sctx.GetMinReplicas(), sctx.GetMaxReplicas()) + // Handle scale-to-zero with grace period + if sctx.GetScaleToZero() && rec.DesiredReplicas == 0 && request.CurrentReplicas > 0 { + gracePeriod := sctx.GetScaleToZeroGrace() + if request.ZeroDemandSince != nil { + elapsed := time.Since(*request.ZeroDemandSince) + if elapsed < gracePeriod { + // Not enough time at zero demand, keep at 1 + rec.DesiredReplicas = 1 + } + } else { + // First detection of zero demand, keep at 1 and start tracking + rec.DesiredReplicas = 1 + } + } + + // Apply activation scale when scaling from zero + if request.CurrentReplicas == 0 && rec.DesiredReplicas > 0 { + activationScale := sctx.GetActivationScale() + if activationScale > rec.DesiredReplicas { + rec.DesiredReplicas = activationScale + rec.Reason = fmt.Sprintf("Scaling from zero to activation scale %d", activationScale) + } + } + // Apply scaling policies (limit rate of change) if rec.DesiredReplicas > request.CurrentReplicas && !inPanicMode { rec.DesiredReplicas = ApplyScaleUpPolicies(request.CurrentReplicas, rec.DesiredReplicas, sctx) @@ -265,9 +289,13 @@ func (a *KPAAlgorithm) calculateDesiredReplicas( } } - // Ensure at least 1 replica - if desired < 1 { - desired = 1 + // Apply minimum based on scale-to-zero configuration + minAllowed := int32(1) + if sctx.GetScaleToZero() && sctx.GetMinReplicas() == 0 { + minAllowed = 0 + } + if desired < minAllowed { + desired = minAllowed } return desired, inPanicMode diff --git a/pkg/controller/budaiscaler/autoscaler.go b/pkg/controller/budaiscaler/autoscaler.go index 7cfecdd..3773f0c 100644 --- a/pkg/controller/budaiscaler/autoscaler.go +++ b/pkg/controller/budaiscaler/autoscaler.go @@ -153,6 +153,7 @@ func (a *AutoScaler) Scale(ctx context.Context, scaler *scalerv1alpha1.BudAIScal StartingPodCount: startingPodCount, MetricSnapshots: metricSnapshots, ScalingContext: scalingCtx, + ZeroDemandSince: a.getZeroDemandSince(scaler), } klog.V(4).InfoS("Pod counts", "scaler", scaler.Name, @@ -747,3 +748,12 @@ func (a *AutoScaler) CleanupLearningSystem(scaler *scalerv1alpha1.BudAIScaler) { key := scaler.Namespace + "/" + scaler.Name delete(a.learningSystems, key) } + +// getZeroDemandSince returns the time when zero demand was first detected. +func (a *AutoScaler) getZeroDemandSince(scaler *scalerv1alpha1.BudAIScaler) *time.Time { + if scaler.Status.ZeroDemandSince != nil { + t := scaler.Status.ZeroDemandSince.Time + return &t + } + return nil +} diff --git a/pkg/controller/budaiscaler/budaiscaler_controller.go b/pkg/controller/budaiscaler/budaiscaler_controller.go index c9bf1bf..727154a 100644 --- a/pkg/controller/budaiscaler/budaiscaler_controller.go +++ b/pkg/controller/budaiscaler/budaiscaler_controller.go @@ -299,6 +299,20 @@ func (r *BudAIScalerReconciler) reconcileCustomScaler(ctx context.Context, scale scaler.Status.ActualScale = result.DesiredReplicas scaler.Status.DesiredScale = result.DesiredReplicas + // Track zero demand state for grace period + if result.Recommendation != nil { + if result.Recommendation.DesiredReplicas == 0 && result.CurrentReplicas > 0 { + // Entering zero demand state + if scaler.Status.ZeroDemandSince == nil { + zeroDemandTimestamp := metav1.Now() + scaler.Status.ZeroDemandSince = &zeroDemandTimestamp + } + } else if result.Recommendation.DesiredReplicas > 0 { + // Exiting zero demand state + scaler.Status.ZeroDemandSince = nil + } + } + if result.Scaled { scaler.Status.LastScaleTime = &now diff --git a/pkg/metrics/collector.go b/pkg/metrics/collector.go index 1ccc57e..a11b631 100644 --- a/pkg/metrics/collector.go +++ b/pkg/metrics/collector.go @@ -47,7 +47,20 @@ func NewDefaultMetricCollector(factory MetricFetcherFactory, aggregator *aggrega // CollectMetrics collects metrics from all pods and returns a snapshot. func (c *DefaultMetricCollector) CollectMetrics(ctx context.Context, pods []corev1.Pod, source scalerv1alpha1.MetricSource) (*types.MetricSnapshot, error) { if len(pods) == 0 { - return nil, fmt.Errorf("no pods to collect metrics from") + // External sources can still be collected without pods + if IsExternalSource(source.MetricSourceType) { + return c.collectExternalMetrics(ctx, pods, source) + } + // For pod-based sources, return empty snapshot (zero demand indicator) + return &types.MetricSnapshot{ + Values: make(map[string]types.MetricValue), + Timestamp: time.Now(), + Average: 0, + Sum: 0, + Min: 0, + Max: 0, + Count: 0, + }, nil } // Check if this is an external source diff --git a/pkg/types/annotations.go b/pkg/types/annotations.go index 0f8d6e6..fd707cf 100644 --- a/pkg/types/annotations.go +++ b/pkg/types/annotations.go @@ -110,6 +110,10 @@ const ( // ActivationScaleAnnotation defines the minimum non-zero scale. // Value format: integer (e.g., "1", "2"). ActivationScaleAnnotation = AnnotationPrefix + "activation-scale" + + // ScaleToZeroGraceAnnotation defines the grace period before scaling to zero. + // Value format: duration string (e.g., "5m", "300s"). + ScaleToZeroGraceAnnotation = AnnotationPrefix + "scale-to-zero-grace" ) // Label constants.