Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions api/scaler/v1alpha1/budaiscaler_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ type BudAIScalerSpec struct {
// Behavior configures the scaling behavior for scale up and scale down.
// +optional
Behavior *ScalingBehavior `json:"behavior,omitempty"`

// ScaleToZeroConfig configures scale-to-zero behavior.
// +optional
ScaleToZeroConfig *ScaleToZeroConfig `json:"scaleToZeroConfig,omitempty"`
}

// ScalingStrategyType defines the type of scaling algorithm.
Expand Down Expand Up @@ -488,6 +492,25 @@ type StartingPodsConfig struct {
BypassGateOnPanic *bool `json:"bypassGateOnPanic,omitempty"`
}

// ScaleToZeroConfig configures scale-to-zero behavior.
//
// Every field is optional. When the scaling context is built from the spec,
// Enabled only ever switches scale-to-zero on (a false value leaves the flag
// as it was), and ActivationScale and GracePeriod are applied only when they
// are non-nil.
type ScaleToZeroConfig struct {
// Enabled turns on scale-to-zero behavior.
// +optional
// +kubebuilder:default=false
Enabled bool `json:"enabled,omitempty"`

// ActivationScale is the number of replicas to scale to when waking from zero.
// +optional
// +kubebuilder:default=1
// +kubebuilder:validation:Minimum=1
ActivationScale *int32 `json:"activationScale,omitempty"`

// GracePeriod is the duration to wait with zero demand before scaling to zero.
// +optional
// +kubebuilder:default="5m"
GracePeriod *metav1.Duration `json:"gracePeriod,omitempty"`
}

// ScalingRules defines rules for scaling in a particular direction.
type ScalingRules struct {
// StabilizationWindowSeconds is the number of seconds to look back
Expand Down Expand Up @@ -564,6 +587,11 @@ type BudAIScalerStatus struct {
// MultiClusterStatus contains federation status.
// +optional
MultiClusterStatus *MultiClusterStatus `json:"multiClusterStatus,omitempty"`

// ZeroDemandSince tracks when zero demand was first detected.
// Used for scale-to-zero grace period calculation.
// +optional
ZeroDemandSince *metav1.Time `json:"zeroDemandSince,omitempty"`
}

// ScalingDecision records a single scaling decision.
Expand Down
34 changes: 34 additions & 0 deletions api/scaler/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 34 additions & 8 deletions config/crd/scaler.bud.studio_budaiscalers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,17 +140,17 @@ spec:
maxStartingPodPercent:
description: |-
MaxStartingPodPercent is the maximum percentage of total pods that can
be in starting state before gating scale-up.
Set to 0 to disable. Default: 0 (disabled)
be in starting state before gating scale-up. Set to 0 to disable.
Default: 50 (gate scale-up if more than 50% of pods are starting)
format: int32
maximum: 100
minimum: 0
type: integer
maxStartingPods:
description: |-
MaxStartingPods is the maximum number of starting pods allowed before
gating further scale-up operations. Set to 0 to disable the gate.
Default: 0 (disabled)
gating further scale-up operations. Set to 0 to disable this gate.
Default: 0 (disabled, uses MaxStartingPodPercent instead)
format: int32
minimum: 0
type: integer
Expand Down Expand Up @@ -242,17 +242,17 @@ spec:
maxStartingPodPercent:
description: |-
MaxStartingPodPercent is the maximum percentage of total pods that can
be in starting state before gating scale-up.
Set to 0 to disable. Default: 0 (disabled)
be in starting state before gating scale-up. Set to 0 to disable.
Default: 50 (gate scale-up if more than 50% of pods are starting)
format: int32
maximum: 100
minimum: 0
type: integer
maxStartingPods:
description: |-
MaxStartingPods is the maximum number of starting pods allowed before
gating further scale-up operations. Set to 0 to disable the gate.
Default: 0 (disabled)
gating further scale-up operations. Set to 0 to disable this gate.
Default: 0 (disabled, uses MaxStartingPodPercent instead)
format: int32
minimum: 0
type: integer
Expand Down Expand Up @@ -583,6 +583,26 @@ spec:
type: string
type: object
x-kubernetes-map-type: atomic
scaleToZeroConfig:
description: ScaleToZeroConfig configures scale-to-zero behavior.
properties:
activationScale:
default: 1
description: ActivationScale is the number of replicas to scale
to when waking from zero.
format: int32
minimum: 1
type: integer
enabled:
default: false
description: Enabled turns on scale-to-zero behavior.
type: boolean
gracePeriod:
default: 5m
description: GracePeriod is the duration to wait with zero demand
before scaling to zero.
type: string
type: object
scalingStrategy:
default: BudScaler
description: ScalingStrategy defines the algorithm to use for scaling
Expand Down Expand Up @@ -910,6 +930,12 @@ spec:
type: object
maxItems: 10
type: array
zeroDemandSince:
description: |-
ZeroDemandSince tracks when zero demand was first detected.
Used for scale-to-zero grace period calculation.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
33 changes: 30 additions & 3 deletions pkg/context/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
DefaultPanicWindow = 60 * time.Second
DefaultStableWindow = 180 * time.Second
DefaultActivationScale = int32(1)
DefaultScaleToZeroGrace = 5 * time.Minute

// Starting pods defaults - enabled by default for LLM workloads with long cold starts
DefaultStartingPodWeight = 0.5 // Count starting pods as 50% capacity
Expand Down Expand Up @@ -79,6 +80,7 @@ type ScalingContext interface {
// Scale-to-zero configuration
GetScaleToZero() bool
GetActivationScale() int32
GetScaleToZeroGrace() time.Duration

// Per-metric target values
GetTargetValueForMetric(metricName string) (float64, bool)
Expand Down Expand Up @@ -147,8 +149,9 @@ type baseScalingContext struct {
inPanicMode bool

// Scale-to-zero
scaleToZero bool
activationScale int32
scaleToZero bool
activationScale int32
scaleToZeroGrace time.Duration

// Per-metric targets
metricTargets map[string]float64
Expand Down Expand Up @@ -214,7 +217,7 @@ func NewBaseScalingContext() ScalingContext {
startingPodWeight: DefaultStartingPodWeight,
maxStartingPods: DefaultMaxStartingPods,
maxStartingPodPercent: DefaultMaxStartingPodPercent,
bypassGateOnPanic: DefaultBypassGateOnPanic
bypassGateOnPanic: DefaultBypassGateOnPanic,
}
}

Expand Down Expand Up @@ -322,6 +325,19 @@ func NewScalingContextFromScaler(scaler *scalerv1alpha1.BudAIScaler) ScalingCont
}
}

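// Parse scale-to-zero configuration from the CRD. Values set here override
// annotation values; note that Enabled=false does not clear a scale-to-zero
// flag that is already set, it only leaves it untouched.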
if scaler.Spec.ScaleToZeroConfig != nil {
if scaler.Spec.ScaleToZeroConfig.Enabled {
ctx.scaleToZero = true
}
if scaler.Spec.ScaleToZeroConfig.ActivationScale != nil {
ctx.activationScale = *scaler.Spec.ScaleToZeroConfig.ActivationScale
}
if scaler.Spec.ScaleToZeroConfig.GracePeriod != nil {
ctx.scaleToZeroGrace = scaler.Spec.ScaleToZeroConfig.GracePeriod.Duration
}
}

// Parse per-metric target values
for _, ms := range scaler.Spec.MetricsSources {
if targetValue, err := strconv.ParseFloat(ms.TargetValue, 64); err == nil {
Expand Down Expand Up @@ -403,6 +419,11 @@ func (c *baseScalingContext) parseAnnotations(annotations map[string]string) {
c.activationScale = int32(scale)
}
}
if v, ok := annotations[types.ScaleToZeroGraceAnnotation]; ok {
if dur, err := time.ParseDuration(v); err == nil {
c.scaleToZeroGrace = dur
}
}
}

// Replica bounds implementation
Expand Down Expand Up @@ -437,6 +458,12 @@ func (c *baseScalingContext) SetInPanicMode(inPanic bool) { c.inPanicMode = inPa
// Scale-to-zero implementation
func (c *baseScalingContext) GetScaleToZero() bool { return c.scaleToZero }
func (c *baseScalingContext) GetActivationScale() int32 { return c.activationScale }
// GetScaleToZeroGrace returns how long demand must stay at zero before the
// workload may be scaled to zero.
//
// A grace period that is unset (zero) or negative falls back to
// DefaultScaleToZeroGrace. Negative values are rejected here because the
// annotation path stores whatever time.ParseDuration accepts, and that
// includes signed durations such as "-5m".
func (c *baseScalingContext) GetScaleToZeroGrace() time.Duration {
	if c.scaleToZeroGrace <= 0 {
		return DefaultScaleToZeroGrace
	}
	return c.scaleToZeroGrace
}

// Per-metric targets implementation
func (c *baseScalingContext) GetTargetValueForMetric(metricName string) (float64, bool) {
Expand Down
12 changes: 10 additions & 2 deletions pkg/controller/budaiscaler/algorithm/algorithm.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ type ScalingRequest struct {
// LastScaleTime is when scaling last occurred.
LastScaleTime *time.Time

// ZeroDemandSince tracks when zero demand was first detected.
// Used for scale-to-zero grace period calculation.
ZeroDemandSince *time.Time

// ScalingContext provides scaling configuration.
ScalingContext ScalingContextProvider
}
Expand All @@ -95,6 +99,10 @@ type ScalingContextProvider interface {
GetMaxStartingPods() int32
GetMaxStartingPodPercent() int32
GetBypassGateOnPanic() bool
// Scale-to-zero configuration
GetScaleToZero() bool
GetActivationScale() int32
GetScaleToZeroGrace() time.Duration
}

// ScalingRecommendation contains the result of a scaling decision.
Expand Down Expand Up @@ -253,8 +261,8 @@ func ApplyScaleDownPolicies(currentReplicas, desiredReplicas int32, sctx Scaling
}

minReplicas := currentReplicas - maxAllowed
if minReplicas < 1 {
minReplicas = 1
if minReplicas < 0 {
minReplicas = 0
}
if desiredReplicas < minReplicas {
return minReplicas
Expand Down
17 changes: 15 additions & 2 deletions pkg/controller/budaiscaler/algorithm/algorithm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package algorithm

import (
"testing"
"time"

scalerv1alpha1 "github.com/BudEcosystem/scaler/api/scaler/v1alpha1"
)
Expand All @@ -43,6 +44,10 @@ type mockScalingContext struct {
maxStartingPods int32
maxStartingPodPercent int32
bypassGateOnPanic bool
// Scale-to-zero config
scaleToZero bool
activationScale int32
scaleToZeroGrace time.Duration
}

func (m *mockScalingContext) GetMinReplicas() int32 { return m.minReplicas }
Expand Down Expand Up @@ -73,6 +78,14 @@ func (m *mockScalingContext) GetStartingPodWeight() float64 { return m.startin
func (m *mockScalingContext) GetMaxStartingPods() int32 { return m.maxStartingPods }
func (m *mockScalingContext) GetMaxStartingPodPercent() int32 { return m.maxStartingPodPercent }
func (m *mockScalingContext) GetBypassGateOnPanic() bool { return m.bypassGateOnPanic }
func (m *mockScalingContext) GetScaleToZero() bool { return m.scaleToZero }
func (m *mockScalingContext) GetActivationScale() int32 { return m.activationScale }
// GetScaleToZeroGrace reports the mock's configured grace period, or five
// minutes when the field was left at its zero value.
func (m *mockScalingContext) GetScaleToZeroGrace() time.Duration {
	if grace := m.scaleToZeroGrace; grace != 0 {
		return grace
	}
	return 5 * time.Minute
}

func TestApplyScaleUpPolicies(t *testing.T) {
tests := []struct {
Expand Down Expand Up @@ -249,12 +262,12 @@ func TestApplyScaleDownPolicies(t *testing.T) {
expected: 8, // desired is within limit
},
{
name: "ensure minimum 1 replica",
name: "allow scale to 0 when policy permits",
currentReplicas: 2,
desiredReplicas: 0,
policies: []scalerv1alpha1.ScalingPolicy{{Type: scalerv1alpha1.PodsScalingPolicy, Value: 10, PeriodSeconds: 60}},
selectPolicy: scalerv1alpha1.MaxChangePolicySelect,
expected: 1, // minimum 1 replica
expected: 0, // scale-to-zero now allowed by policy
},
}

Expand Down
Loading