BudEcosystem · dittops · Jan 25, 2026 · Jan 25, 2026 · Jan 25, 2026
diff --git a/api/scaler/v1alpha1/budaiscaler_types.go b/api/scaler/v1alpha1/budaiscaler_types.go
@@ -102,6 +102,10 @@ type BudAIScalerSpec struct {
 	// Behavior configures the scaling behavior for scale up and scale down.
 	// +optional
 	Behavior *ScalingBehavior `json:"behavior,omitempty"`
+
+	// ScaleToZeroConfig configures scale-to-zero behavior.
+	// +optional
+	ScaleToZeroConfig *ScaleToZeroConfig `json:"scaleToZeroConfig,omitempty"`
 }
 
 // ScalingStrategyType defines the type of scaling algorithm.
@@ -488,6 +492,25 @@ type StartingPodsConfig struct {
 	BypassGateOnPanic *bool `json:"bypassGateOnPanic,omitempty"`
 }
 
+// ScaleToZeroConfig groups the scale-to-zero settings: the opt-in switch, the wake-up replica count, and the zero-demand grace period.
+type ScaleToZeroConfig struct {
+	// Enabled turns on scale-to-zero behavior.
+	// +optional
+	// +kubebuilder:default=false
+	Enabled bool `json:"enabled,omitempty"`
+
+	// ActivationScale is the number of replicas to scale to when waking from zero.
+	// +optional
+	// +kubebuilder:default=1
+	// +kubebuilder:validation:Minimum=1
+	ActivationScale *int32 `json:"activationScale,omitempty"`
+
+	// GracePeriod is the duration to wait with zero demand before scaling to zero.
+	// +optional
+	// +kubebuilder:default="5m"
+	GracePeriod *metav1.Duration `json:"gracePeriod,omitempty"`
+}
+
 // ScalingRules defines rules for scaling in a particular direction.
 type ScalingRules struct {
 	// StabilizationWindowSeconds is the number of seconds to look back
@@ -564,6 +587,11 @@ type BudAIScalerStatus struct {
 	// MultiClusterStatus contains federation status.
 	// +optional
 	MultiClusterStatus *MultiClusterStatus `json:"multiClusterStatus,omitempty"`
+
+	// ZeroDemandSince tracks when zero demand was first detected.
+	// Used for scale-to-zero grace period calculation.
+	// +optional
+	ZeroDemandSince *metav1.Time `json:"zeroDemandSince,omitempty"`
 }
 
 // ScalingDecision records a single scaling decision.

diff --git a/api/scaler/v1alpha1/zz_generated.deepcopy.go b/api/scaler/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/scaler.bud.studio_budaiscalers.yaml b/config/crd/scaler.bud.studio_budaiscalers.yaml
@@ -140,17 +140,17 @@ spec:
                           maxStartingPodPercent:
                             description: |-
                               MaxStartingPodPercent is the maximum percentage of total pods that can
-                              be in starting state before gating scale-up.
-                              Set to 0 to disable. Default: 0 (disabled)
+                              be in starting state before gating scale-up. Set to 0 to disable.
+                              Default: 50 (gate scale-up if more than 50% of pods are starting)
                             format: int32
                             maximum: 100
                             minimum: 0
                             type: integer
                           maxStartingPods:
                             description: |-
                               MaxStartingPods is the maximum number of starting pods allowed before
-                              gating further scale-up operations. Set to 0 to disable the gate.
-                              Default: 0 (disabled)
+                              gating further scale-up operations. Set to 0 to disable this gate.
+                              Default: 0 (disabled, uses MaxStartingPodPercent instead)
                             format: int32
                             minimum: 0
                             type: integer
@@ -242,17 +242,17 @@ spec:
                           maxStartingPodPercent:
                             description: |-
                               MaxStartingPodPercent is the maximum percentage of total pods that can
-                              be in starting state before gating scale-up.
-                              Set to 0 to disable. Default: 0 (disabled)
+                              be in starting state before gating scale-up. Set to 0 to disable.
+                              Default: 50 (gate scale-up if more than 50% of pods are starting)
                             format: int32
                             maximum: 100
                             minimum: 0
                             type: integer
                           maxStartingPods:
                             description: |-
                               MaxStartingPods is the maximum number of starting pods allowed before
-                              gating further scale-up operations. Set to 0 to disable the gate.
-                              Default: 0 (disabled)
+                              gating further scale-up operations. Set to 0 to disable this gate.
+                              Default: 0 (disabled, uses MaxStartingPodPercent instead)
                             format: int32
                             minimum: 0
                             type: integer
@@ -583,6 +583,26 @@ spec:
                     type: string
                 type: object
                 x-kubernetes-map-type: atomic
+              scaleToZeroConfig:
+                description: ScaleToZeroConfig configures scale-to-zero behavior.
+                properties:
+                  activationScale:
+                    default: 1
+                    description: ActivationScale is the number of replicas to scale
+                      to when waking from zero.
+                    format: int32
+                    minimum: 1
+                    type: integer
+                  enabled:
+                    default: false
+                    description: Enabled turns on scale-to-zero behavior.
+                    type: boolean
+                  gracePeriod:
+                    default: 5m
+                    description: GracePeriod is the duration to wait with zero demand
+                      before scaling to zero.
+                    type: string
+                type: object
               scalingStrategy:
                 default: BudScaler
                 description: ScalingStrategy defines the algorithm to use for scaling
@@ -910,6 +930,12 @@ spec:
                   type: object
                 maxItems: 10
                 type: array
+              zeroDemandSince:
+                description: |-
+                  ZeroDemandSince tracks when zero demand was first detected.
+                  Used for scale-to-zero grace period calculation.
+                format: date-time
+                type: string
             type: object
         type: object
     served: true

diff --git a/pkg/context/context.go b/pkg/context/context.go
@@ -38,6 +38,7 @@ const (
 	DefaultPanicWindow              = 60 * time.Second
 	DefaultStableWindow             = 180 * time.Second
 	DefaultActivationScale          = int32(1)
+	DefaultScaleToZeroGrace         = 5 * time.Minute
 
 	// Starting pods defaults - enabled by default for LLM workloads with long cold starts
 	DefaultStartingPodWeight     = 0.5 // Count starting pods as 50% capacity
@@ -79,6 +80,7 @@ type ScalingContext interface {
 	// Scale-to-zero configuration
 	GetScaleToZero() bool
 	GetActivationScale() int32
+	GetScaleToZeroGrace() time.Duration
 
 	// Per-metric target values
 	GetTargetValueForMetric(metricName string) (float64, bool)
@@ -147,8 +149,9 @@ type baseScalingContext struct {
 	inPanicMode    bool
 
 	// Scale-to-zero
-	scaleToZero     bool
-	activationScale int32
+	scaleToZero      bool
+	activationScale  int32
+	scaleToZeroGrace time.Duration
 
 	// Per-metric targets
 	metricTargets map[string]float64
@@ -214,7 +217,7 @@ func NewBaseScalingContext() ScalingContext {
 		startingPodWeight:     DefaultStartingPodWeight,
 		maxStartingPods:       DefaultMaxStartingPods,
 		maxStartingPodPercent: DefaultMaxStartingPodPercent,
-		bypassGateOnPanic:     DefaultBypassGateOnPanic
+		bypassGateOnPanic:     DefaultBypassGateOnPanic,
 	}
 }
 
@@ -322,6 +325,19 @@ func NewScalingContextFromScaler(scaler *scalerv1alpha1.BudAIScaler) ScalingCont
 		}
 	}
 
+	// Apply CRD scale-to-zero settings: set fields override earlier values, but Enabled=false never turns scale-to-zero off.
+	if scaler.Spec.ScaleToZeroConfig != nil {
+		if scaler.Spec.ScaleToZeroConfig.Enabled {
+			ctx.scaleToZero = true
+		}
+		if scaler.Spec.ScaleToZeroConfig.ActivationScale != nil {
+			ctx.activationScale = *scaler.Spec.ScaleToZeroConfig.ActivationScale
+		}
+		if scaler.Spec.ScaleToZeroConfig.GracePeriod != nil {
+			ctx.scaleToZeroGrace = scaler.Spec.ScaleToZeroConfig.GracePeriod.Duration
+		}
+	}
+
 	// Parse per-metric target values
 	for _, ms := range scaler.Spec.MetricsSources {
 		if targetValue, err := strconv.ParseFloat(ms.TargetValue, 64); err == nil {
@@ -403,6 +419,11 @@ func (c *baseScalingContext) parseAnnotations(annotations map[string]string) {
 			c.activationScale = int32(scale)
 		}
 	}
+	if v, ok := annotations[types.ScaleToZeroGraceAnnotation]; ok {
+		if dur, err := time.ParseDuration(v); err == nil {
+			c.scaleToZeroGrace = dur
+		}
+	}
 }
 
 // Replica bounds implementation
@@ -437,6 +458,12 @@ func (c *baseScalingContext) SetInPanicMode(inPanic bool) { c.inPanicMode = inPa
 // Scale-to-zero implementation
 func (c *baseScalingContext) GetScaleToZero() bool      { return c.scaleToZero }
 func (c *baseScalingContext) GetActivationScale() int32 { return c.activationScale }
+func (c *baseScalingContext) GetScaleToZeroGrace() time.Duration {
+	if c.scaleToZeroGrace <= 0 { // unset (0) or negative: fall back to the default
+		return DefaultScaleToZeroGrace
+	}
+	return c.scaleToZeroGrace
+}
 
 // Per-metric targets implementation
 func (c *baseScalingContext) GetTargetValueForMetric(metricName string) (float64, bool) {

diff --git a/pkg/controller/budaiscaler/algorithm/algorithm.go b/pkg/controller/budaiscaler/algorithm/algorithm.go
@@ -70,6 +70,10 @@ type ScalingRequest struct {
 	// LastScaleTime is when scaling last occurred.
 	LastScaleTime *time.Time
 
+	// ZeroDemandSince tracks when zero demand was first detected.
+	// Used for scale-to-zero grace period calculation.
+	ZeroDemandSince *time.Time
+
 	// ScalingContext provides scaling configuration.
 	ScalingContext ScalingContextProvider
 }
@@ -95,6 +99,10 @@ type ScalingContextProvider interface {
 	GetMaxStartingPods() int32
 	GetMaxStartingPodPercent() int32
 	GetBypassGateOnPanic() bool
+	// Scale-to-zero configuration
+	GetScaleToZero() bool
+	GetActivationScale() int32
+	GetScaleToZeroGrace() time.Duration
 }
 
 // ScalingRecommendation contains the result of a scaling decision.
@@ -253,8 +261,8 @@ func ApplyScaleDownPolicies(currentReplicas, desiredReplicas int32, sctx Scaling
 	}
 
 	minReplicas := currentReplicas - maxAllowed
-	if minReplicas < 1 {
-		minReplicas = 1
+	if minReplicas < 0 {
+		minReplicas = 0
 	}
 	if desiredReplicas < minReplicas {
 		return minReplicas

diff --git a/pkg/controller/budaiscaler/algorithm/algorithm_test.go b/pkg/controller/budaiscaler/algorithm/algorithm_test.go
@@ -18,6 +18,7 @@ package algorithm
 
 import (
 	"testing"
+	"time"
 
 	scalerv1alpha1 "github.com/BudEcosystem/scaler/api/scaler/v1alpha1"
 )
@@ -43,6 +44,10 @@ type mockScalingContext struct {
 	maxStartingPods       int32
 	maxStartingPodPercent int32
 	bypassGateOnPanic     bool
+	// Scale-to-zero config
+	scaleToZero      bool
+	activationScale  int32
+	scaleToZeroGrace time.Duration
 }
 
 func (m *mockScalingContext) GetMinReplicas() int32                 { return m.minReplicas }
@@ -73,6 +78,14 @@ func (m *mockScalingContext) GetStartingPodWeight() float64   { return m.startin
 func (m *mockScalingContext) GetMaxStartingPods() int32       { return m.maxStartingPods }
 func (m *mockScalingContext) GetMaxStartingPodPercent() int32 { return m.maxStartingPodPercent }
 func (m *mockScalingContext) GetBypassGateOnPanic() bool      { return m.bypassGateOnPanic }
+func (m *mockScalingContext) GetScaleToZero() bool            { return m.scaleToZero }
+func (m *mockScalingContext) GetActivationScale() int32       { return m.activationScale }
+func (m *mockScalingContext) GetScaleToZeroGrace() time.Duration {
+	if grace := m.scaleToZeroGrace; grace != 0 {
+		return grace
+	}
+	return 5 * time.Minute // zero means "not configured": use the 5-minute default
+}
 
 func TestApplyScaleUpPolicies(t *testing.T) {
 	tests := []struct {
@@ -249,12 +262,12 @@ func TestApplyScaleDownPolicies(t *testing.T) {
 			expected:        8, // desired is within limit
 		},
 		{
-			name:            "ensure minimum 1 replica",
+			name:            "allow scale to 0 when policy permits",
 			currentReplicas: 2,
 			desiredReplicas: 0,
 			policies:        []scalerv1alpha1.ScalingPolicy{{Type: scalerv1alpha1.PodsScalingPolicy, Value: 10, PeriodSeconds: 60}},
 			selectPolicy:    scalerv1alpha1.MaxChangePolicySelect,
-			expected:        1, // minimum 1 replica
+			expected:        0, // scale-to-zero now allowed by policy
 		},
 	}