diff --git a/docs/reference/api.md b/docs/reference/api.md index 4b495fef69e..6c9808c0c3c 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -142,6 +142,25 @@ _Appears in:_ | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | +#### IncrementalUpgradeOptions + + + + + + + +_Appears in:_ +- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.
Defaults to 100%. | 100 | | +| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | | +| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | | +| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | | + + #### JobSubmissionMode @@ -319,7 +338,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | | +| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports
`NewCluster`, `IncrementalUpgrade`, and `None`. | | | +| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. | | | #### RayServiceUpgradeType diff --git a/go.mod b/go.mod index 472e6d593df..e93dc132eda 100644 --- a/go.mod +++ b/go.mod @@ -73,7 +73,7 @@ require ( github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/moby/term v0.5.0 // indirect @@ -95,12 +95,12 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.39.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect + golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/term v0.31.0 // indirect + golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.10.0 // indirect golang.org/x/tools v0.31.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect @@ -112,6 +112,7 @@ require ( k8s.io/component-base v0.33.1 // indirect k8s.io/component-helpers v0.33.1 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + sigs.k8s.io/gateway-api v1.3.0 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/kustomize/api v0.19.0 // indirect sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect diff --git a/go.sum b/go.sum index dddab9f7e86..22e4f1113d9 100644 --- a/go.sum +++ b/go.sum @@ -139,8 +139,9 @@ github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUt github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= @@ -263,8 +264,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= @@ -274,8 +275,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -292,12 +293,12 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text 
v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4= golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -380,6 +381,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= +sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ= diff --git a/helm-chart/kuberay-operator/README.md b/helm-chart/kuberay-operator/README.md index 6837698d597..43ea4144af3 100644 --- a/helm-chart/kuberay-operator/README.md +++ b/helm-chart/kuberay-operator/README.md @@ -165,6 +165,8 @@ spec: | featureGates[0].enabled | bool | `true` | | | featureGates[1].name | string | `"RayJobDeletionPolicy"` | | | featureGates[1].enabled | bool | `false` | | +| featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | | +| featureGates[2].enabled | bool | `false` | | | metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. 
| | metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor | | metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index a86457fac1a..41bda880d9a 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -8225,6 +8225,25 @@ spec: type: integer upgradeStrategy: properties: + incrementalUpgradeOptions: + properties: + gatewayClassName: + type: string + intervalSeconds: + format: int32 + type: integer + maxSurgePercent: + default: 100 + format: int32 + type: integer + stepSizePercent: + format: int32 + type: integer + required: + - gatewayClassName + - intervalSeconds + - stepSizePercent + type: object type: type: string type: object @@ -8253,6 +8272,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8367,6 +8389,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object conditions: items: @@ -8436,6 +8464,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8550,6 +8581,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object serviceStatus: type: string diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl index 5d14510a61b..d5e0e7352d0 100644 --- a/helm-chart/kuberay-operator/templates/_helpers.tpl +++ b/helm-chart/kuberay-operator/templates/_helpers.tpl @@ -222,6 +222,17 @@ rules: - patch - 
update - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways + - httproutes + verbs: + - create + - get + - list + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml index 6010d7f2b3e..f1464ba3a30 100644 --- a/helm-chart/kuberay-operator/values.yaml +++ b/helm-chart/kuberay-operator/values.yaml @@ -88,6 +88,8 @@ featureGates: enabled: true - name: RayJobDeletionPolicy enabled: false +- name: RayServiceIncrementalUpgrade + enabled: false # Configurations for KubeRay operator metrics. metrics: diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3eda8a616c4..04451030ad2 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -76,8 +76,16 @@ test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests. go test -timeout 30m -v $(WHAT) +test-e2e-rayservice: WHAT ?= ./test/e2erayservice +test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests. + go test -timeout 30m -v $(WHAT) + test-e2e-upgrade: WHAT ?= ./test/e2eupgrade -test-e2e-upgrade: manifests fmt vet ## Run e2e tests. +test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests. + go test -timeout 30m -v $(WHAT) + +test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade +test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests. 
go test -timeout 30m -v $(WHAT) test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index e7d73e07d8e..e331fba27ae 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -22,6 +22,9 @@ const ( type RayServiceUpgradeType string const ( + // During upgrade, IncrementalUpgrade strategy will create an upgraded cluster to gradually scale + // and migrate traffic to using Gateway API. + IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade" // During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready NewCluster RayServiceUpgradeType = "NewCluster" // No new cluster will be created while the strategy is set to None @@ -57,10 +60,27 @@ var DeploymentStatusEnum = struct { UNHEALTHY: "UNHEALTHY", } +type IncrementalUpgradeOptions struct { + // The capacity of serve requests the upgraded cluster should scale to handle each interval. + // Defaults to 100%. + // +kubebuilder:default:=100 + MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"` + // The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. + StepSizePercent *int32 `json:"stepSizePercent"` + // The interval in seconds between transferring StepSize traffic from the old to new RayCluster. + IntervalSeconds *int32 `json:"intervalSeconds"` + // The name of the Gateway Class installed by the Kubernetes Cluster admin. + GatewayClassName string `json:"gatewayClassName"` +} + type RayServiceUpgradeStrategy struct { - // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. + // Type represents the strategy used when upgrading the RayService. Currently supports + // `NewCluster`, `IncrementalUpgrade`, and `None`. 
// +optional Type *RayServiceUpgradeType `json:"type,omitempty"` + // IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. + // RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. + IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"` } // RayServiceSpec defines the desired state of RayService @@ -130,6 +150,12 @@ type RayServiceStatus struct { // +optional Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` // +optional + TargetCapacity *int32 `json:"targetCapacity,omitempty"` + // +optional + TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + // +optional + LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` + // +optional RayClusterName string `json:"rayClusterName,omitempty"` // +optional RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` @@ -162,6 +188,8 @@ const ( RayServiceReady RayServiceConditionType = "Ready" // UpgradeInProgress means the RayService is currently performing a zero-downtime upgrade. UpgradeInProgress RayServiceConditionType = "UpgradeInProgress" + // RollbackInProgress means the RayService is currently rolling back an in-progress upgrade to the original cluster state. 
+ RollbackInProgress RayServiceConditionType = "RollbackInProgress" ) const ( @@ -171,6 +199,7 @@ const ( BothActivePendingClustersExist RayServiceConditionReason = "BothActivePendingClustersExist" NoPendingCluster RayServiceConditionReason = "NoPendingCluster" NoActiveCluster RayServiceConditionReason = "NoActiveCluster" + GoalClusterChanged RayServiceConditionReason = "GoalClusterChanged" ) // +kubebuilder:object:root=true @@ -184,8 +213,7 @@ const ( type RayService struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - - Spec RayServiceSpec `json:"spec,omitempty"` + Spec RayServiceSpec `json:"spec,omitempty"` // +optional Status RayServiceStatuses `json:"status,omitempty"` } diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index b4cb5decf12..c9f5974f116 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -213,6 +213,36 @@ func (in *HeadInfo) DeepCopy() *HeadInfo { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *IncrementalUpgradeOptions) DeepCopyInto(out *IncrementalUpgradeOptions) { + *out = *in + if in.MaxSurgePercent != nil { + in, out := &in.MaxSurgePercent, &out.MaxSurgePercent + *out = new(int32) + **out = **in + } + if in.StepSizePercent != nil { + in, out := &in.StepSizePercent, &out.StepSizePercent + *out = new(int32) + **out = **in + } + if in.IntervalSeconds != nil { + in, out := &in.IntervalSeconds, &out.IntervalSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IncrementalUpgradeOptions. 
+func (in *IncrementalUpgradeOptions) DeepCopy() *IncrementalUpgradeOptions { + if in == nil { + return nil + } + out := new(IncrementalUpgradeOptions) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RayCluster) DeepCopyInto(out *RayCluster) { *out = *in @@ -663,6 +693,20 @@ func (in *RayServiceStatus) DeepCopyInto(out *RayServiceStatus) { (*out)[key] = *val.DeepCopy() } } + if in.TargetCapacity != nil { + in, out := &in.TargetCapacity, &out.TargetCapacity + *out = new(int32) + **out = **in + } + if in.TrafficRoutedPercent != nil { + in, out := &in.TrafficRoutedPercent, &out.TrafficRoutedPercent + *out = new(int32) + **out = **in + } + if in.LastTrafficMigratedTime != nil { + in, out := &in.LastTrafficMigratedTime, &out.LastTrafficMigratedTime + *out = (*in).DeepCopy() + } in.RayClusterStatus.DeepCopyInto(&out.RayClusterStatus) } @@ -712,6 +756,11 @@ func (in *RayServiceUpgradeStrategy) DeepCopyInto(out *RayServiceUpgradeStrategy *out = new(RayServiceUpgradeType) **out = **in } + if in.IncrementalUpgradeOptions != nil { + in, out := &in.IncrementalUpgradeOptions, &out.IncrementalUpgradeOptions + *out = new(IncrementalUpgradeOptions) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RayServiceUpgradeStrategy. 
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index a86457fac1a..41bda880d9a 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -8225,6 +8225,25 @@ spec: type: integer upgradeStrategy: properties: + incrementalUpgradeOptions: + properties: + gatewayClassName: + type: string + intervalSeconds: + format: int32 + type: integer + maxSurgePercent: + default: 100 + format: int32 + type: integer + stepSizePercent: + format: int32 + type: integer + required: + - gatewayClassName + - intervalSeconds + - stepSizePercent + type: object type: type: string type: object @@ -8253,6 +8272,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8367,6 +8389,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object conditions: items: @@ -8436,6 +8464,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8550,6 +8581,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object serviceStatus: type: string diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index ba840f0c27f..9ea1db93190 100644 --- a/ray-operator/config/rbac/role.yaml +++ b/ray-operator/config/rbac/role.yaml @@ -107,6 +107,17 @@ rules: - patch - update - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways + - httproutes + verbs: + - create + - get + - list + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git 
a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 63eefa94bc4..922a31d924f 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -203,3 +203,19 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp }, } } + +func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + gatewayName := utils.CheckGatewayName(fmt.Sprintf("%s-gateway", rayService.Name)) + return types.NamespacedName{ + Name: gatewayName, + Namespace: rayService.Namespace, + } +} + +func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", rayService.Name)) + return types.NamespacedName{ + Name: httpRouteName, + Namespace: rayService.Namespace, + } +} diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 71cea97c005..7675a30b3bb 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -184,7 +184,10 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus namespace := rayCluster.Namespace crdType := utils.RayClusterCRD if isRayService { - name = rayService.Name + // For IncrementalUpgrade, the name is based on the unique RayCluster. + if !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) { + name = rayService.Name + } namespace = rayService.Namespace crdType = utils.RayServiceCRD } @@ -225,7 +228,7 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus "otherwise, the Kubernetes service for Ray Serve will not be created.") } - if rayService.Spec.ServeService != nil { + if rayService.Spec.ServeService != nil && !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) { // Use the provided "custom" ServeService. 
// Deep copy the ServeService to avoid modifying the original object serveService := rayService.Spec.ServeService.DeepCopy() diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 7e51c018fbf..a8b07cf6d82 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -6,6 +6,7 @@ import ( "fmt" "math" "os" + "reflect" "strconv" "strings" "time" @@ -21,6 +22,7 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/tools/record" "k8s.io/utils/lru" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -28,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -90,6 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut // +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update // +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update; +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update; // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete @@ -142,10 +147,42 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, 
request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } + // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // If an upgrade is in progress, check if rollback is necessary. + if activeRayClusterInstance != nil && pendingRayClusterInstance != nil { + if err := r.reconcileRollbackState(ctx, rayServiceInstance, activeRayClusterInstance, pendingRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + } + + // Ensure per-cluster Serve service exists for the active and pending RayClusters. + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + // Creates or updates a Gateway CR that points to the Serve services of + // the active and pending (if it exists) RayClusters. For incremental upgrades, + // the Gateway endpoint is used rather than the Serve service. + err = r.reconcileGateway(ctx, rayServiceInstance) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + // Create or update the HTTPRoute attached to this RayService's Gateway. + err = r.reconcileHTTPRoute(ctx, rayServiceInstance) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + } + // Reconcile serve applications for active and/or pending clusters // 1. If there is a pending cluster, reconcile serve applications for the pending cluster. // 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only. // 3. 
If there is no pending cluster, reconcile serve applications for the active cluster. + // 4. During an IncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster based + // on total TargetCapacity. var isActiveClusterReady, isPendingClusterReady bool = false, false var activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus = nil, nil if pendingRayClusterInstance != nil { @@ -162,6 +199,11 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } + } else if activeRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + logger.Info("Reconciling the Serve applications for active cluster during IncrementalUpgrade", "clusterName", activeRayClusterInstance.Name) + if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } } // Reconcile K8s services and make sure it points to the correct RayCluster. @@ -229,6 +271,27 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn rayServiceInstance.Status.ObservedGeneration = rayServiceInstance.ObjectMeta.Generation + if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) { + activeStatus := &rayServiceInstance.Status.ActiveServiceStatus + pendingStatus := &rayServiceInstance.Status.PendingServiceStatus + + // A rollback is complete when the active cluster is back at 100% TargetCapacity and TrafficRoutedPercent, + // and the pending cluster is at 0% TargetCapacity and TrafficRoutedPercent. 
+ if ptr.Deref(activeStatus.TargetCapacity, -1) == 100 && + ptr.Deref(activeStatus.TrafficRoutedPercent, -1) == 100 && + ptr.Deref(pendingStatus.TargetCapacity, -1) == 0 && + ptr.Deref(pendingStatus.TrafficRoutedPercent, -1) == 0 { + + logger.Info("Rollback to original cluster is complete. Cleaning up pending cluster from prior upgrade.") + + // Clear the RayService pending service status to clean up the pending cluster. + rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + pendingCluster = nil + + meta.RemoveStatusCondition(&rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) + } + } + // Update RayClusterStatus in RayService status. var activeClusterStatus, pendingClusterStatus rayv1.RayClusterStatus if activeCluster != nil { @@ -278,10 +341,30 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn } logger.Info("Preparing a new pending RayCluster instance by setting RayClusterName", "clusterName", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) + + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled + if rayServiceInstance.Status.ActiveServiceStatus.RayClusterName == "" { + // If no Active RayCluster exists - default to starting with 100% TargetCapacity. + if rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity == nil { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(100)) + } + } else if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + // Pending RayCluster during an upgrade should start with 0% TargetCapacity. 
+ if rayServiceInstance.Status.PendingServiceStatus.TargetCapacity == nil { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(0)) + } + } + } } serveEndPoints := &corev1.Endpoints{} - if err := r.Get(ctx, common.RayServiceServeServiceNamespacedName(rayServiceInstance), serveEndPoints); err != nil && !errors.IsNotFound(err) { + serveServiceName := common.RayServiceServeServiceNamespacedName(rayServiceInstance) + // For IncrementalUpgrade, the Serve service name is based on the RayCluster. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeCluster != nil { + serveServiceName.Name = utils.GenerateServeServiceName(activeCluster.Name) + } + if err := r.Get(ctx, serveServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { return err } @@ -294,6 +377,21 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn if numServeEndpoints > math.MaxInt32 { return errstd.New("numServeEndpoints exceeds math.MaxInt32") } + + // During an IncrementalUpgrade, the pending RayCluster is also serving. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil { + pendingServeServiceName := common.RayClusterServeServiceNamespacedName(pendingCluster) + if err := r.Get(ctx, pendingServeServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { + return err + } + for _, subset := range serveEndPoints.Subsets { + numServeEndpoints += len(subset.Addresses) + } + if numServeEndpoints > math.MaxInt32 { + return errstd.New("numServeEndpoints exceeds math.MaxInt32") + } + } + rayServiceInstance.Status.NumServeEndpoints = int32(numServeEndpoints) //nolint:gosec // This is a false positive from gosec. See https://github.com/securego/gosec/issues/1212 for more details. 
calculateConditions(rayServiceInstance)

@@ -302,6 +400,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
 	if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RayServiceReady)) {
 		rayServiceInstance.Status.ServiceStatus = rayv1.Running
 	}
+
 	return nil
 }

@@ -392,7 +491,12 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
 	if upgradeStrategy != nil {
 		upgradeType := upgradeStrategy.Type
 		if upgradeType != nil {
-			if *upgradeType != rayv1.NewCluster {
+			if features.Enabled(features.RayServiceIncrementalUpgrade) {
+				if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade {
+					logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster or IncrementalUpgrade.")
+					return false
+				}
+			} else if *upgradeType != rayv1.NewCluster {
 				logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster.")
 				return false
 			}
@@ -407,6 +511,288 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
 	return true
 }

+func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) {
+	options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec)
+	if options == nil {
+		return nil, errstd.New("missing RayService IncrementalUpgradeOptions during upgrade")
+	}
+
+	gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway")
+	// Define the desired Gateway object
+	rayServiceGateway := &gwv1.Gateway{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      gatewayName,
+			Namespace: rayServiceInstance.Namespace,
+		},
+		Spec: gwv1.GatewaySpec{
+			GatewayClassName: gwv1.ObjectName(options.GatewayClassName),
+		},
+	}
+
+	rayServiceGateway.Spec.Listeners = utils.GetGatewayListenersForRayService(rayServiceInstance)
+
+	return rayServiceGateway, nil
+}
+
+// `reconcileGateway` reconciles a 
Gateway resource for a RayService. The possible cases are:
+// (1) Create a new Gateway instance. (2) Update the Gateway instance if RayService has updated. (3) Do nothing.
+func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
+	logger := ctrl.LoggerFrom(ctx)
+	var err error
+
+	// Construct desired Gateway object for RayService
+	desiredGateway, err := r.createGateway(rayServiceInstance)
+	if err != nil {
+		logger.Error(err, "Failed to build Gateway object for RayService")
+		return err
+	}
+	if desiredGateway == nil {
+		logger.Info("Skipping Gateway reconciliation: desired Gateway is nil")
+		return nil
+	}
+
+	// Check for existing RayService Gateway, create the desired Gateway if none is found
+	existingGateway := &gwv1.Gateway{}
+	if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), existingGateway); err != nil {
+		if errors.IsNotFound(err) {
+			// Set the ownership in order to do the garbage collection by k8s.
+			if err := ctrl.SetControllerReference(rayServiceInstance, desiredGateway, r.Scheme); err != nil {
+				return err
+			}
+			logger.Info("Creating a new Gateway instance", "Gateway Listeners", desiredGateway.Spec.Listeners)
+			if err := r.Create(ctx, desiredGateway); err != nil {
+				r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err)
+				return err
+			}
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name)
+			return nil
+		}
+		return err
+	}
+
+	// If Gateway already exists, check if update is needed to reach desired state
+	if !reflect.DeepEqual(existingGateway.Spec, desiredGateway.Spec) {
+		logger.Info("Updating existing Gateway", "name", existingGateway.Name)
+		existingGateway.Spec = desiredGateway.Spec
+		if err := r.Update(ctx, existingGateway); err != nil {
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err)
+			return err
+		}
+		r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name)
+	}
+
+	return nil
+}
+
+// reconcileTrafficRoutedPercent determines the traffic split between the active and pending clusters during an upgrade,
+// returning the weights for the old and new clusters respectively, or an error if misconfigured.
+func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, hasPendingCluster bool) (activeClusterWeight, pendingClusterWeight int32, err error) { + logger := ctrl.LoggerFrom(ctx) + activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus + pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus + + // Default to 100% traffic on the active cluster. + activeClusterWeight = 100 + pendingClusterWeight = 0 + + if hasPendingCluster { + // Zero-downtime upgrade in progress. + options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return 0, 0, errstd.New("IncrementalUpgradeOptions are not set during upgrade.") + } + + // Check that target_capacity has been updated before migrating traffic. + pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0) + pendingClusterTargetCapacity := ptr.Deref(pendingServiceStatus.TargetCapacity, 0) + + activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100) + activeClusterTargetCapacity := ptr.Deref(activeServiceStatus.TargetCapacity, 100) + + isRollbackInProgress := meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) + + if (pendingClusterWeight == pendingClusterTargetCapacity && !isRollbackInProgress) || (isRollbackInProgress && activeClusterWeight == activeClusterTargetCapacity) { + // return without changing current traffic weights since cluster being migrated to is at capacity. + return activeClusterWeight, pendingClusterWeight, nil + } + + // If IntervalSeconds has passed since LastTrafficMigratedTime, migrate StepSizePercent traffic + // from the active RayCluster to the pending RayCluster. 
+ intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second + lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime + if lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds { + if isRollbackInProgress { + // Gradually shift traffic from the pending to the active cluster. + logger.Info("Rollback in progress. Shifting traffic back to active cluster.", "stepSize", *options.StepSizePercent) + // cluster weight should never exceed current TargetCapacity and should sum to 100% + proposedActiveWeight := activeClusterWeight + *options.StepSizePercent + activeClusterWeight = min(100, proposedActiveWeight, activeClusterTargetCapacity) + pendingClusterWeight = 100 - activeClusterWeight + } else { + // Gradually shift traffic from the active to the pending cluster. + logger.Info("Upgrade in progress. Migrating traffic by StepSizePercent.", "stepSize", *options.StepSizePercent) + proposedPendingWeight := pendingClusterWeight + *options.StepSizePercent + pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity) + activeClusterWeight = 100 - pendingClusterWeight + } + + pendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + activeServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + } + } + + // Update the RayService status with the calculated traffic weights. + activeServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) + pendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight) + logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight, "pendingClusterWeight", pendingClusterWeight) + + return activeClusterWeight, pendingClusterWeight, nil +} + +// createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with +// weights based on TrafficRoutedPercent. 
+func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { + logger := ctrl.LoggerFrom(ctx) + + // Retrieve Gateway instance to attach this HTTPRoute to. + gatewayInstance := &gwv1.Gateway{} + if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { + return nil, err + } + + // Retrieve the active RayCluster + activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance)) + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Failed to retrieve active RayCluster") + return nil, err + } + if activeRayCluster == nil { + logger.Info("Active RayCluster not found, skipping HTTPRoute creation.") + return nil, nil + } + + // Attempt to retrieve pending RayCluster + pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) + hasPendingCluster := (err == nil && pendingRayCluster != nil) + if err != nil && !errors.IsNotFound(err) { + logger.Info("Failed to retrieve pending RayCluster.") + return nil, err + } + + activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster) + if err != nil { + logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.") + return nil, err + } + + activeClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name) + + backendRefs := []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(activeClusterServeSvcName), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: ptr.To(activeClusterWeight), + }, + }, + } + + if hasPendingCluster { + pendingClusterServeSvcName := 
utils.GenerateServeServiceName(pendingRayCluster.Name) + + backendRefs = append(backendRefs, gwv1.HTTPBackendRef{ + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(pendingClusterServeSvcName), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: ptr.To(pendingClusterWeight), + }, + }) + } + + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) + desiredHTTPRoute := &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: httpRouteName, Namespace: gatewayInstance.Namespace}, + Spec: gwv1.HTTPRouteSpec{ + CommonRouteSpec: gwv1.CommonRouteSpec{ + ParentRefs: []gwv1.ParentReference{ + { + Name: gwv1.ObjectName(gatewayInstance.Name), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + }, + }, + }, + Rules: []gwv1.HTTPRouteRule{ + { + Matches: []gwv1.HTTPRouteMatch{ + { + Path: &gwv1.HTTPPathMatch{ + Type: ptr.To(gwv1.PathMatchPathPrefix), + Value: ptr.To("/"), + }, + }, + }, + BackendRefs: backendRefs, + }, + }, + }, + } + + return desiredHTTPRoute, nil +} + +// reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade. 
+func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
+	logger := ctrl.LoggerFrom(ctx)
+	var err error
+
+	desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance)
+	if err != nil {
+		logger.Error(err, "Failed to build HTTPRoute for RayService upgrade")
+		return err
+	}
+	if desiredHTTPRoute == nil {
+		logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil")
+		return nil
+	}
+
+	// Check for existing HTTPRoute for RayService
+	existingHTTPRoute := &gwv1.HTTPRoute{}
+	if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), existingHTTPRoute); err != nil {
+		if errors.IsNotFound(err) {
+			// Set the ownership in order to do the garbage collection by k8s.
+			if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil {
+				return err
+			}
+			if err = r.Create(ctx, desiredHTTPRoute); err != nil {
+				r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err)
+				return err
+			}
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name)
+			return nil
+		}
+		return err
+	}
+
+	// If HTTPRoute already exists, check if update is needed
+	if !reflect.DeepEqual(existingHTTPRoute.Spec, desiredHTTPRoute.Spec) {
+		logger.Info("Updating existing HTTPRoute", "name", desiredHTTPRoute.Name)
+		existingHTTPRoute.Spec = desiredHTTPRoute.Spec
+		if err := r.Update(ctx, existingHTTPRoute); err != nil {
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err)
+			return err
+		}
+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name) + } + + return nil +} + // `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases: // (1) Create a new pending cluster. (2) Update the active cluster. (3) Update the pending cluster. (4) Do nothing. func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServiceInstance *rayv1.RayService) (*rayv1.RayCluster, *rayv1.RayCluster, error) { @@ -767,6 +1153,193 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer return nil } +// checkIfNeedIncrementalUpgradeUpdate returns whether the controller should adjust the target_capacity +// of the Serve config associated with a RayCluster during an IncrementalUpgrade. +func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) { + activeRayServiceStatus := rayServiceInstance.Status.ActiveServiceStatus + pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus + + if activeRayServiceStatus.RayClusterName == "" || pendingRayServiceStatus.RayClusterName == "" { + return false, "Both active and pending RayCluster instances required for incremental upgrade." + } + + // Validate Gateway and HTTPRoute objects are ready + gatewayInstance := &gwv1.Gateway{} + if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { + return false, fmt.Sprintf("Failed to retrieve Gateway for RayService: %v", err) + } + if !utils.IsGatewayReady(gatewayInstance) { + return false, "Gateway for RayService IncrementalUpgrade is not ready." 
+ } + + httpRouteInstance := &gwv1.HTTPRoute{} + if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRouteInstance); err != nil { + return false, fmt.Sprintf("Failed to retrieve HTTPRoute for RayService: %v", err) + } + if !utils.IsHTTPRouteReady(gatewayInstance, httpRouteInstance) { + return false, "HTTPRoute for RayService IncrementalUpgrade is not ready." + } + + // Retrieve the current observed IncrementalUpgrade Status fields for each RayService. + if activeRayServiceStatus.TargetCapacity == nil || activeRayServiceStatus.TrafficRoutedPercent == nil { + return true, "Active RayServiceStatus missing TargetCapacity or TrafficRoutedPercent." + } + if pendingRayServiceStatus.TargetCapacity == nil || pendingRayServiceStatus.TrafficRoutedPercent == nil { + return true, "Pending RayServiceStatus missing TargetCapacity or TrafficRoutedPercent." + } + activeTargetCapacity := int(*activeRayServiceStatus.TargetCapacity) + pendingTargetCapacity := int(*pendingRayServiceStatus.TargetCapacity) + pendingTrafficRoutedPercent := int(*pendingRayServiceStatus.TrafficRoutedPercent) + + if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 { + return true, "Pending RayCluster has not finished scaling up." + } else if activeTargetCapacity == 0 && pendingTargetCapacity == 100 { + return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." + } + return true, "Active RayCluster TargetCapacity has not finished scaling down." +} + +// applyServeTargetCapacity updates the target_capacity for a given RayCluster's Serve applications. 
+func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface, goalTargetCapacity int32) error { + logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name) + + // Retrieve cached ServeConfig from last reconciliation for cluster to update + cachedConfig := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name) + if cachedConfig == "" { + cachedConfig = rayServiceInstance.Spec.ServeConfigV2 + } + + serveConfig := make(map[string]interface{}) + if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil { + return err + } + + // Check if ServeConfig requires update + if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok { + if int32(currentTargetCapacity) == goalTargetCapacity { + logger.Info("target_capacity already updated on RayCluster", "target_capacity", currentTargetCapacity) + // No update required, return early + return nil + } + } + + serveConfig["target_capacity"] = goalTargetCapacity + configJson, err := json.Marshal(serveConfig) + if err != nil { + return fmt.Errorf("failed to marshal serve config: %w", err) + } + + logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity) + if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { + return fmt.Errorf("failed to update target_capacity for Serve applications: %w", err) + } + + // Update the status fields and cache new Serve config. 
+ if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName { + rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + } else if rayClusterInstance.Name == rayServiceInstance.Status.PendingServiceStatus.RayClusterName { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + } + r.cacheServeConfig(rayServiceInstance, rayClusterInstance.Name) + + return nil +} + +// reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during +// an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. +func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error { + logger := ctrl.LoggerFrom(ctx) + logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) + + if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + return nil + } + + activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus + pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus + + // Set initial TargetCapacity values if unset + if activeRayServiceStatus.TargetCapacity == nil { + activeRayServiceStatus.TargetCapacity = ptr.To(int32(100)) + } + if pendingRayServiceStatus.TargetCapacity == nil { + pendingRayServiceStatus.TargetCapacity = ptr.To(int32(0)) + } + + // Retrieve the current observed Status fields for IncrementalUpgrade + activeTargetCapacity := ptr.Deref(activeRayServiceStatus.TargetCapacity, 100) + pendingTargetCapacity := ptr.Deref(pendingRayServiceStatus.TargetCapacity, 0) + pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0) + + // Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by + options := 
utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") + } + maxSurgePercent := ptr.Deref(options.MaxSurgePercent, 100) + + if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) { + // Rollback the upgrade. The active RayCluster should be scaled back to 100% target_capacity, + // while the pending RayCluster is scaled to 0%. This is the inverse of the regular upgrade path. + activeTrafficRoutedPercent := ptr.Deref(activeRayServiceStatus.TrafficRoutedPercent, 0) + if activeTargetCapacity != activeTrafficRoutedPercent { + logger.Info("Traffic is rolling back to active cluster, deferring capacity update.", "ActiveTargetCapacity", activeTargetCapacity, "ActiveTrafficRoutedPercent", activeTrafficRoutedPercent) + return nil + } + + if activeTargetCapacity+pendingTargetCapacity > 100 { + if rayClusterInstance.Name == pendingRayServiceStatus.RayClusterName { + goalTargetCapacity := max(0, pendingTargetCapacity-maxSurgePercent) + logger.Info("Rollback: Scaling down pending cluster `target_capacity`.", "goal", goalTargetCapacity) + return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity) + } + } else { + if rayClusterInstance.Name == activeRayServiceStatus.RayClusterName { + goalTargetCapacity := min(100, activeTargetCapacity+maxSurgePercent) + logger.Info("Rollback: Scaling up active cluster `target_capacity`.", "goal", goalTargetCapacity) + return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity) + } + } + } + + // Defer updating the target_capacity until traffic weights are updated + if pendingTargetCapacity != pendingTrafficRoutedPercent { + logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", 
pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent) + return nil + } + + // There are two cases: + // 1. The total target_capacity is greater than 100. This means the pending RayCluster has + // scaled up traffic and the active RayCluster can be scaled down by MaxSurgePercent. + // 2. The total target_capacity is equal to 100. This means the pending RayCluster can + // increase its target_capacity by MaxSurgePercent. + // If the rayClusterInstance passed into this function is not the cluster to update based + // on the above conditions, we return without doing anything. + var clusterName string + var goalTargetCapacity int32 + if activeTargetCapacity+pendingTargetCapacity > int32(100) { + // Scale down the Active RayCluster TargetCapacity on this iteration. + goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent) + clusterName = activeRayServiceStatus.RayClusterName + if clusterName != rayClusterInstance.Name { + return nil + } + activeRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + logger.Info("Setting target_capacity for active Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) + } else { + // Scale up the Pending RayCluster TargetCapacity on this iteration. + goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent) + clusterName = pendingRayServiceStatus.RayClusterName + if clusterName != rayClusterInstance.Name { + return nil + } + pendingRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) + } + + return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity) +} + // `getAndCheckServeStatus` gets Serve applications' and deployments' statuses and check whether the // Serve applications are ready to serve incoming traffic or not. 
It returns three values:
//
@@ -965,6 +1538,24 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
 		}
 		r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
 	}
+	if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) {
+		incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance)
+		logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason)
+		if incrementalUpgradeUpdate {
+			if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient); err != nil {
+				r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
+				return false, serveApplications, err
+			}
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity),
+				"Updated target_capacity of serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
+
+			// Don't switch to the pending RayCluster until IncrementalUpgrade is complete.
+			if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == rayClusterInstance.Name {
+				return false, serveApplications, nil
+			}
+		}
+	}
+
 	return isReady, serveApplications, nil
 }
 
@@ -1041,3 +1632,67 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins
 	}
 	return utils.IsRunningAndReady(headPod), nil
 }
+
+// reconcilePerClusterServeService reconciles a load-balancing serve service for a given RayCluster.
+func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) error {
+	if rayClusterInstance == nil {
+		return nil
+	}
+
+	logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name)
+
+	logger.Info("Building per-cluster serve service")
+
+	// Create a serve service for the RayCluster associated with this RayService. During an incremental
+	// upgrade, this will be called for the pending RayCluster instance.
+	desiredSvc, err := common.BuildServeService(ctx, *rayServiceInstance, *rayClusterInstance, true)
+	if err != nil {
+		logger.Error(err, "Failed to build per-cluster serve service spec")
+		return err
+	}
+	if err := ctrl.SetControllerReference(rayClusterInstance, desiredSvc, r.Scheme); err != nil {
+		return err
+	}
+
+	existingSvc := &corev1.Service{}
+	err = r.Get(ctx, client.ObjectKey{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc)
+	if errors.IsNotFound(err) {
+		logger.Info("Creating new per-cluster serve service for incremental upgrade.", "Service", desiredSvc.Name)
+		return r.Create(ctx, desiredSvc)
+	}
+
+	return err
+}
+
+// reconcileRollbackState determines whether to initiate a rollback by setting the RollbackInProgress condition.
+func (r *RayServiceReconciler) reconcileRollbackState(ctx context.Context, rayServiceInstance *rayv1.RayService, activeCluster, pendingCluster *rayv1.RayCluster) error { + logger := ctrl.LoggerFrom(ctx) + + goalHash, err := generateHashWithoutReplicasAndWorkersToDelete(rayServiceInstance.Spec.RayClusterSpec) + if err != nil { + return fmt.Errorf("failed to generate hash for goal cluster spec: %w", err) + } + + originalHash := activeCluster.Annotations[utils.HashWithoutReplicasAndWorkersToDeleteKey] + pendingHash := pendingCluster.Annotations[utils.HashWithoutReplicasAndWorkersToDeleteKey] + + isRollbackInProgress := meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) + + // Case 1: The goal spec matches the pending cluster's spec. In this case, we should revert the rollback attempt + // and continue to upgrade as normal. + if goalHash == pendingHash { + if isRollbackInProgress { + logger.Info("Goal state matches pending cluster. Canceling rollback and resuming upgrade.") + meta.RemoveStatusCondition(&rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) + } + return nil + } + + // Case 2: The goal spec differs from pending cluster's spec. Rollback to original cluster. + if !isRollbackInProgress { + logger.Info("Goal state has changed during upgrade. 
Initiating rollback to the original cluster.", "goalHash", goalHash, "originalHash", originalHash, "pendingHash", pendingHash) + setCondition(rayServiceInstance, rayv1.RollbackInProgress, metav1.ConditionTrue, rayv1.GoalClusterChanged, "Goal state changed, rolling back to original cluster.") + } + + return nil +} diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 638af6b26fb..cfb165bcb3e 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -13,13 +13,16 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" + "k8s.io/utils/lru" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -27,6 +30,7 @@ import ( "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient" utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types" "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" + "github.com/ray-project/kuberay/ray-operator/pkg/features" "github.com/ray-project/kuberay/ray-operator/test/support" ) @@ -1319,3 +1323,920 @@ func TestRayClusterDeletionDelaySeconds(t *testing.T) { }) } } + +// Helper function to create a RayService object undergoing an incremental upgrade. 
+func makeIncrementalUpgradeRayService( + withOptions bool, + gatewayClassName string, + stepSizePercent *int32, + intervalSeconds *int32, + routedPercent *int32, + lastTrafficMigratedTime *metav1.Time, +) *rayv1.RayService { + spec := rayv1.RayServiceSpec{ + ServeService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "serve-service", + Namespace: "test-ns", + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "http", + Port: 8000, + }, + }, + }, + }, + } + if withOptions { + spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + GatewayClassName: gatewayClassName, + StepSizePercent: stepSizePercent, + IntervalSeconds: intervalSeconds, + }, + } + } + + return &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "incremental-ray-service", + Namespace: "test-ns", + }, + Spec: spec, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "active-ray-cluster", + RayClusterStatus: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ServiceName: "active-service"}, + }, + TrafficRoutedPercent: routedPercent, + LastTrafficMigratedTime: lastTrafficMigratedTime, + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending-ray-cluster", + RayClusterStatus: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ServiceName: "pending-service"}, + }, + TrafficRoutedPercent: ptr.To(int32(100) - *routedPercent), + LastTrafficMigratedTime: lastTrafficMigratedTime, + }, + }, + } +} + +func TestCreateGateway(t *testing.T) { + serveService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "serve-service", + Namespace: "test-ns", + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Port: 8000, + }, + }, + }, + } + newScheme := runtime.NewScheme() + _ = corev1.AddToScheme(newScheme) + + fakeClient := 
clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(serveService).Build() + reconciler := &RayServiceReconciler{ + Client: fakeClient, + } + + tests := []struct { + rayService *rayv1.RayService + name string + expectedGatewayName string + expectedClass string + expectedListeners int + expectErr bool + }{ + { + name: "valid gateway creation", + expectedGatewayName: "incremental-ray-service-gateway", + rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(10)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), + expectErr: false, + expectedClass: "gateway-class", + expectedListeners: 1, + }, + { + name: "missing IncrementalUpgradeOptions", + rayService: makeIncrementalUpgradeRayService(false, "istio", ptr.To(int32(0)), ptr.To(int32(0)), ptr.To(int32(0)), &metav1.Time{Time: time.Now()}), + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gw, err := reconciler.createGateway(tt.rayService) + if tt.expectErr { + require.Error(t, err) + assert.Nil(t, gw) + } else { + require.NoError(t, err) + require.NotNil(t, gw) + assert.Equal(t, tt.expectedGatewayName, gw.Name) + assert.Equal(t, tt.rayService.Namespace, gw.Namespace) + assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName) + assert.Len(t, gw.Spec.Listeners, tt.expectedListeners) + } + }) + } +} + +func TestCreateHTTPRoute(t *testing.T) { + ctx := context.TODO() + namespace := "test-ns" + stepSize := int32(10) + interval := int32(30) + + activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-active", Namespace: namespace}} + pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-pending", Namespace: namespace}} + gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice-gateway", Namespace: namespace}} + activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: 
utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} + pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} + + baseRayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + StepSizePercent: &stepSize, + IntervalSeconds: &interval, + GatewayClassName: "istio", + }, + }, + }, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: activeCluster.Name, + TrafficRoutedPercent: ptr.To(int32(100)), + TargetCapacity: ptr.To(int32(100)), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: pendingCluster.Name, + TrafficRoutedPercent: ptr.To(int32(0)), + TargetCapacity: ptr.To(int32(30)), + }, + }, + } + + tests := []struct { + name string + modifier func(rs *rayv1.RayService) + runtimeObjects []runtime.Object + expectError bool + expectedActiveWeight int32 + expectedPendingWeight int32 + }{ + { + name: "Incremental upgrade, time since LastTrafficMigratedTime < IntervalSeconds.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + }, + { + name: "Incremental upgrade, time since LastTrafficMigratedTime >= IntervalSeconds.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60)) + }, + runtimeObjects: 
[]runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 90, + expectedPendingWeight: 10, + }, + { + name: "Incremental upgrade, TrafficRoutedPercent capped to pending TargetCapacity.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5)) + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 95, + expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached + }, + { + name: "Rollback from upgrade, IntervalSeconds have passed since LastTrafficMigratedTime.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.Conditions = append(rs.Status.Conditions, metav1.Condition{Type: string(rayv1.RollbackInProgress), Status: metav1.ConditionTrue}) + + // mock a partially completed upgrade + rs.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(70)) + rs.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(int32(30)) + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 80, + expectedPendingWeight: 20, + }, + { + name: "Rollback from upgrade, TrafficRoutedPercent capped to active TargetCapacity.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.Conditions = append(rs.Status.Conditions, metav1.Condition{Type: string(rayv1.RollbackInProgress), Status: metav1.ConditionTrue}) + rs.Status.ActiveServiceStatus.TargetCapacity = 
ptr.To(int32(65)) + rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(40)) + + // mock a partially completed upgrade + rs.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(60)) + rs.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(int32(40)) + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 65, + expectedPendingWeight: 35, // can only migrate 5% to pending until TargetCapacity reached + }, + { + name: "Create HTTPRoute called with missing IncrementalUpgradeOptions.", + modifier: func(rs *rayv1.RayService) { + rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectError: true, + }, + { + name: "No on-going upgrade, pending cluster does not exist.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + }, + runtimeObjects: []runtime.Object{activeCluster, gateway, activeServeService}, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rayService := baseRayService.DeepCopy() + tt.modifier(rayService) + tt.runtimeObjects = append(tt.runtimeObjects, rayService) + + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(1), + } + + route, err := reconciler.createHTTPRoute(ctx, rayService) + + if tt.expectError { + require.Error(t, err) + assert.Nil(t, route) + } else { + require.NoError(t, err) + require.NotNil(t, route) + + assert.Equal(t, 
"httproute-test-rayservice-gateway", route.Name) + assert.Equal(t, "test-ns", route.Namespace) + + require.Len(t, route.Spec.Rules, 1) + rule := route.Spec.Rules[0] + + require.GreaterOrEqual(t, len(rule.BackendRefs), 1) + assert.Equal(t, gwv1.ObjectName(activeServeService.Name), rule.BackendRefs[0].BackendRef.Name) + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + + if len(rule.BackendRefs) > 1 { + assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name) + assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + } else { + assert.Equal(t, int32(0), tt.expectedPendingWeight) + } + } + }) + } +} + +func TestReconcileHTTPRoute(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + ctx := context.TODO() + namespace := "test-ns" + stepSize := int32(10) + interval := int32(30) + gatewayName := "test-rayservice-gateway" + routeName := fmt.Sprintf("httproute-%s", gatewayName) + + activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active-ray-cluster", Namespace: namespace}} + pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending-ray-cluster", Namespace: namespace}} + activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} + pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} + gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}} + + baseRayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: 
&rayv1.IncrementalUpgradeOptions{ + StepSizePercent: &stepSize, + IntervalSeconds: &interval, + GatewayClassName: "istio", + }, + }, + }, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: activeCluster.Name, + TrafficRoutedPercent: ptr.To(int32(80)), + TargetCapacity: ptr.To(int32(100)), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: pendingCluster.Name, + TrafficRoutedPercent: ptr.To(int32(20)), + TargetCapacity: ptr.To(int32(100)), + }, + }, + } + + tests := []struct { + modifier func(rs *rayv1.RayService) + existingRoute *gwv1.HTTPRoute + name string + expectedActiveWeight int32 + expectedPendingWeight int32 + }{ + { + name: "Create new HTTPRoute with weights.", + expectedActiveWeight: 70, + expectedPendingWeight: 30, + }, + { + name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + }, + existingRoute: &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, + Spec: gwv1.HTTPRouteSpec{}, + }, + expectedActiveWeight: 70, + expectedPendingWeight: 30, + }, + { + name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + }, + expectedActiveWeight: 80, + expectedPendingWeight: 20, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rayService := baseRayService.DeepCopy() + if tt.modifier != nil { + tt.modifier(rayService) + } + + runtimeObjects := []runtime.Object{rayService, activeCluster, pendingCluster, gateway, activeServeService, pendingServeService} + if tt.existingRoute != nil { + runtimeObjects = append(runtimeObjects, 
tt.existingRoute) + } + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() + reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)} + + err := reconciler.reconcileHTTPRoute(ctx, rayService) + require.NoError(t, err) + + reconciledRoute := &gwv1.HTTPRoute{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: routeName, Namespace: namespace}, reconciledRoute) + require.NoError(t, err, "Failed to fetch the reconciled HTTPRoute") + + require.Len(t, reconciledRoute.Spec.Rules, 1) + rule := reconciledRoute.Spec.Rules[0] + require.Len(t, rule.BackendRefs, 2) + + // Assert weights are set as expected. + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + + // Assert ParentRef namespace is now correctly set. + parent := reconciledRoute.Spec.ParentRefs[0] + assert.Equal(t, gwv1.ObjectName(gatewayName), parent.Name) + assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace) + }) + } +} + +func TestReconcileGateway(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + ctx := context.TODO() + namespace := "test-ns" + + rayService := makeIncrementalUpgradeRayService( + true, + "gateway-class", + ptr.To(int32(20)), + ptr.To(int32(30)), + ptr.To(int32(80)), + ptr.To(metav1.Now()), + ) + gateway := makeGateway(fmt.Sprintf("%s-gateway", rayService.Name), rayService.Namespace, true) + + tests := []struct { + name string + expectedGatewayName string + expectedClass string + runtimeObjects []runtime.Object + expectedNumListeners int + }{ + { + name: "creates new Gateway if missing", + runtimeObjects: []runtime.Object{rayService}, + expectedGatewayName: "incremental-ray-service-gateway", + expectedClass: "gateway-class", + expectedNumListeners: 1, + }, + { + 
name: "updates Gateway if spec differs", + runtimeObjects: []runtime.Object{rayService, gateway}, + expectedGatewayName: "incremental-ray-service-gateway", + expectedClass: "gateway-class", + expectedNumListeners: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient := clientFake.NewClientBuilder(). + WithScheme(newScheme). + WithRuntimeObjects(tt.runtimeObjects...). + Build() + + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(10), + } + + err := reconciler.reconcileGateway(ctx, rayService) + require.NoError(t, err) + + reconciledGateway := &gwv1.Gateway{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedGatewayName, Namespace: namespace}, reconciledGateway) + require.NoError(t, err, "Failed to get the reconciled Gateway") + + assert.Equal(t, tt.expectedGatewayName, reconciledGateway.Name) + assert.Equal(t, namespace, reconciledGateway.Namespace) + assert.Equal(t, gwv1.ObjectName(tt.expectedClass), reconciledGateway.Spec.GatewayClassName) + assert.Len(t, reconciledGateway.Spec.Listeners, tt.expectedNumListeners) + }) + } +} + +func TestReconcileServeTargetCapacity(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + tests := []struct { + name string + updatedCluster string + activeCapacity int32 + pendingCapacity int32 + activeRoutedPercent int32 + pendingRoutedPercent int32 + maxSurgePercent int32 + expectedActiveCapacity int32 + expectedPendingCapacity int32 + isRollback bool + }{ + { + name: "Scale up pending RayCluster when total TargetCapacity < 100", + pendingRoutedPercent: 10, + activeCapacity: 70, + pendingCapacity: 10, + maxSurgePercent: 20, + expectedActiveCapacity: 70, + expectedPendingCapacity: 30, + updatedCluster: "pending", + }, + { + name: "Scale down active RayCluster when total TargetCapacity > 100", + pendingRoutedPercent: 30, + activeCapacity: 80, + pendingCapacity: 30, + 
maxSurgePercent: 20, + expectedActiveCapacity: 60, + expectedPendingCapacity: 30, + updatedCluster: "active", + }, + { + name: "Rollback: Scale up active RayCluster when total TargetCapacity < 100", + isRollback: true, + activeRoutedPercent: 60, + pendingRoutedPercent: 40, + activeCapacity: 60, + pendingCapacity: 30, + maxSurgePercent: 20, + expectedActiveCapacity: 80, + expectedPendingCapacity: 30, + updatedCluster: "active", + }, + { + name: "Rollback: Scale down pending RayCluster when total TargetCapacity > 100", + isRollback: true, + activeRoutedPercent: 90, + pendingRoutedPercent: 10, + activeCapacity: 90, + pendingCapacity: 20, + maxSurgePercent: 20, + expectedActiveCapacity: 90, + expectedPendingCapacity: 0, + updatedCluster: "pending", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.TODO() + rayService := &rayv1.RayService{ + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + MaxSurgePercent: ptr.To(tt.maxSurgePercent), + }, + }, + ServeConfigV2: `{"target_capacity": 0}`, + }, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(tt.activeCapacity), + TrafficRoutedPercent: ptr.To(tt.activeRoutedPercent), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(tt.pendingCapacity), + TrafficRoutedPercent: ptr.To(tt.pendingRoutedPercent), + }, + }, + } + if tt.isRollback { + rayService.Status.Conditions = []metav1.Condition{ + { + Type: string(rayv1.RollbackInProgress), + Status: metav1.ConditionTrue, + }, + } + } + + var rayCluster *rayv1.RayCluster + if tt.updatedCluster == "active" { + rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active"}} + } else { + rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: 
"pending"}} + } + + fakeDashboard := &utils.FakeRayDashboardClient{} + reconciler := &RayServiceReconciler{ + ServeConfigs: lru.New(10), + } + + err := reconciler.reconcileServeTargetCapacity(ctx, rayService, rayCluster, fakeDashboard) + require.NoError(t, err) + require.NotEmpty(t, fakeDashboard.LastUpdatedConfig) + + if tt.updatedCluster == "active" { + assert.Equal(t, tt.expectedActiveCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity) + assert.Equal(t, tt.pendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity) + expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedActiveCapacity)) + `}` + assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig)) + } else { + assert.Equal(t, tt.expectedPendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity) + assert.Equal(t, tt.activeCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity) + expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedPendingCapacity)) + `}` + assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig)) + } + }) + } +} + +// MakeGateway is a helper function to return an Gateway object +func makeGateway(name, namespace string, isReady bool) *gwv1.Gateway { + status := metav1.ConditionFalse + if isReady { + status = metav1.ConditionTrue + } + return &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Status: gwv1.GatewayStatus{ + Conditions: []metav1.Condition{ + { + Type: string(gwv1.GatewayConditionAccepted), + Status: status, + }, + { + Type: string(gwv1.GatewayConditionProgrammed), + Status: status, + }, + }, + }, + } +} + +// MakeHTTPRoute is a helper function to return an HTTPRoute object +func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute { + status := metav1.ConditionFalse + if isReady { + status = metav1.ConditionTrue + } + return &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + 
Namespace: namespace, + }, + Status: gwv1.HTTPRouteStatus{ + RouteStatus: gwv1.RouteStatus{ + Parents: []gwv1.RouteParentStatus{ + { + ParentRef: gwv1.ParentReference{ + Name: gwv1.ObjectName("test-rayservice-gateway"), + Namespace: ptr.To(gwv1.Namespace(namespace)), + }, + Conditions: []metav1.Condition{ + { + Type: string(gwv1.RouteConditionAccepted), + Status: status, + }, + { + Type: string(gwv1.RouteConditionResolvedRefs), + Status: status, + }, + }, + }, + }, + }, + }, + } +} + +func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { + rayServiceName := "test-rayservice" + gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") + httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) + namespace := "test-ns" + + tests := []struct { + name string + expectedReason string + runtimeObjects []runtime.Object + activeStatus rayv1.RayServiceStatus + pendingStatus rayv1.RayServiceStatus + expectedNeedsUpdate bool + }{ + { + name: "Missing RayClusterNames", + expectedNeedsUpdate: false, + expectedReason: "Both active and pending RayCluster instances required for incremental upgrade.", + }, + { + name: "Gateway not ready", + activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"}, + pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"}, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, false), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: false, + expectedReason: "Gateway for RayService IncrementalUpgrade is not ready.", + }, + { + name: "HTTPRoute not ready", + activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"}, + pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"}, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, false), + }, + expectedNeedsUpdate: false, + expectedReason: "HTTPRoute for RayService IncrementalUpgrade is not ready.", + }, + { + name: "Incremental upgrade is 
complete", + activeStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(int32(0)), + TrafficRoutedPercent: ptr.To(int32(0)), + }, + pendingStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(int32(100)), + TrafficRoutedPercent: ptr.To(int32(100)), + }, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: false, + expectedReason: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete.", + }, + { + name: "Pending RayCluster is still incrementally scaling", + activeStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(int32(70)), + TrafficRoutedPercent: ptr.To(int32(70)), + }, + pendingStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(int32(30)), + TrafficRoutedPercent: ptr.To(int32(30)), + }, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: true, + expectedReason: "Pending RayCluster has not finished scaling up.", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + newScheme := runtime.NewScheme() + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + // Initialize RayService reconciler. 
+ ctx := context.TODO() + r := RayServiceReconciler{ + Client: fakeClient, + Recorder: &record.FakeRecorder{}, + Scheme: scheme.Scheme, + } + rayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace}, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: tt.activeStatus, + PendingServiceStatus: tt.pendingStatus, + }, + } + needsUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayService) + assert.Equal(t, tt.expectedNeedsUpdate, needsUpdate) + assert.Equal(t, tt.expectedReason, reason) + }) + } +} + +func TestReconcilePerClusterServeService(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + ctx := context.TODO() + namespace := "test-ns" + + // Minimal RayCluster with at least one container. + rayCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ray-cluster", + Namespace: namespace, + UID: "test-uid", + }, + Spec: rayv1.RayClusterSpec{ + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "ray-head"}, + }, + }, + }, + }, + }, + } + rayService := makeIncrementalUpgradeRayService( + true, + "istio", + ptr.To(int32(20)), + ptr.To(int32(30)), + ptr.To(int32(80)), + ptr.To(metav1.Now()), + ) + + // The expected pending RayCluster serve service. 
+ expectedServeSvcName := utils.GenerateServeServiceName(rayCluster.Name) + expectedServeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: expectedServeSvcName, + Namespace: namespace, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + utils.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterServingServiceLabelKey: "true", + }, + }, + } + + tests := []struct { + name string + rayCluster *rayv1.RayCluster + runtimeObjects []runtime.Object + expectServiceCreated bool + expectError bool + }{ + { + name: "RayCluster is nil, no-op.", + rayCluster: nil, + runtimeObjects: []runtime.Object{rayService}, + expectServiceCreated: false, + expectError: false, + }, + { + name: "Create a new Serve service for the RayCluster.", + rayCluster: rayCluster, + runtimeObjects: []runtime.Object{rayService, rayCluster}, + expectServiceCreated: true, + expectError: false, + }, + { + name: "Pending RayCluster serve service already exists, no-op.", + rayCluster: rayCluster, + runtimeObjects: []runtime.Object{rayService, rayCluster, expectedServeService}, + expectServiceCreated: false, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(1), + } + + err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster) + + if tt.expectError { + require.Error(t, err) + return + } + require.NoError(t, err) + + reconciledSvc := &corev1.Service{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, reconciledSvc) + + // No-op case, no service should be created when RayCluster is nil. 
+ if tt.rayCluster == nil { + assert.True(t, errors.IsNotFound(err)) + return + } + + // Otherwise, a valid serve service should be created for the RayCluster. + require.NoError(t, err, "The Serve service should exist in the client") + + // Validate the expected Serve service exists for the RayCluster. + require.NotNil(t, reconciledSvc) + assert.Equal(t, expectedServeSvcName, reconciledSvc.Name) + + createdSvc := &corev1.Service{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, createdSvc) + require.NoError(t, err, "The Serve service should exist in the client") + + // Verify the Serve service selector. + expectedSelector := map[string]string{ + utils.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterServingServiceLabelKey: "true", + } + assert.Equal(t, expectedSelector, createdSvc.Spec.Selector) + + // Validate owner ref is set to the expected RayCluster. + if tt.expectServiceCreated { + require.Len(t, createdSvc.OwnerReferences, 1) + ownerRef := createdSvc.OwnerReferences[0] + assert.Equal(t, rayCluster.Name, ownerRef.Name) + assert.Equal(t, "RayCluster", ownerRef.Kind) + assert.Equal(t, rayCluster.UID, ownerRef.UID) + } + }) + } +} diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go index 2c2ba0fe616..4d04e9f5e3d 100644 --- a/ray-operator/controllers/ray/utils/consistency.go +++ b/ray-operator/controllers/ray/utils/consistency.go @@ -4,6 +4,7 @@ import ( "reflect" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) // Checks whether the old and new RayClusterStatus are inconsistent by comparing different fields. If the only @@ -74,6 +75,15 @@ func inconsistentRayServiceStatus(oldStatus rayv1.RayServiceStatus, newStatus ra } } + if features.Enabled(features.RayServiceIncrementalUpgrade) { + // Also check for changes in IncrementalUpgrade related Status fields. 
+ if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent || + oldStatus.TargetCapacity != newStatus.TargetCapacity || + oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { + return true + } + } + return false } diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index fca1e4f8a00..025ae9968ef 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -317,9 +317,17 @@ const ( InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec" InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata" UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel" + UpdatedGateway K8sEventType = "UpdatedGateway" + UpdatedHTTPRoute K8sEventType = "UpdatedHTTPRoute" UpdatedServeApplications K8sEventType = "UpdatedServeApplications" + UpdatedServeTargetCapacity K8sEventType = "UpdatedServeTargetCapacity" FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel" FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications" + FailedToUpdateTargetCapacity K8sEventType = "FailedToUpdateTargetCapacity" + FailedToCreateGateway K8sEventType = "FailedToCreateGateway" + FailedToUpdateGateway K8sEventType = "FailedToUpdateGateway" + FailedToCreateHTTPRoute K8sEventType = "FailedToCreateHTTPRoute" + FailedToUpdateHTTPRoute K8sEventType = "FailedToUpdateHTTPRoute" // Generic Pod event list DeletedPod K8sEventType = "DeletedPod" diff --git a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go index 21a3fdb91be..1bf0588c403 100644 --- a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go +++ b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go @@ -12,9 +12,10 @@ import ( ) type FakeRayDashboardClient struct { - multiAppStatuses map[string]*utiltypes.ServeApplicationStatus - GetJobInfoMock 
atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)] - serveDetails utiltypes.ServeDetails + multiAppStatuses map[string]*utiltypes.ServeApplicationStatus + GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)] + serveDetails utiltypes.ServeDetails + LastUpdatedConfig []byte } var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(nil) @@ -22,7 +23,8 @@ var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(ni func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string) { } -func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, _ []byte) error { +func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error { + r.LastUpdatedConfig = configJson fmt.Print("UpdateDeployments fake succeeds.") return nil } diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index cf6b9066323..fff9b0cf707 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -24,9 +24,11 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) const ( @@ -209,6 +211,40 @@ func CheckName(s string) string { return s } +func CheckGatewayName(name string) string { + const maxLength = 63 + + if len(name) > maxLength { + offset := len(name) - maxLength + fmt.Printf("Gateway name too long (len = %d), shortening by offset = %d", len(name), offset) + name = name[offset:] + } + + // Cannot start with a digit or punctuation + if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { + name = 
"g" + name[1:] + } + + return name +} + +func CheckHTTPRouteName(name string) string { + const maxLength = 63 + + if len(name) > maxLength { + offset := len(name) - maxLength + fmt.Printf("HTTPRoute name too long (len = %d), shortening by offset = %d", len(name), offset) + name = name[offset:] + } + + // Cannot start with a digit or punctuation + if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { + name = "h" + name[1:] + } + + return name +} + // TrimJobName uses CheckLabel to trim Kubernetes job to constrains func TrimJobName(jobName string) string { return CheckLabel(jobName) @@ -675,6 +711,89 @@ func GetRayClusterNameFromService(svc *corev1.Service) string { return svc.Spec.Selector[RayClusterLabelKey] } +func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool { + if gatewayInstance == nil { + return false + } + hasAccepted := false + hasProgrammed := false + + for _, condition := range gatewayInstance.Status.Conditions { + if condition.Type == string(gwv1.GatewayConditionAccepted) && condition.Status == metav1.ConditionTrue { + hasAccepted = true + } + if condition.Type == string(gwv1.GatewayConditionProgrammed) && condition.Status == metav1.ConditionTrue { + hasProgrammed = true + } + } + + // If no ready condition found return false + return hasAccepted && hasProgrammed +} + +// IsHTTPRouteReady returns whether the HTTPRoute associated with a given Gateway has a ready condition +func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool { + if httpRouteInstance == nil { + return false + } + for _, parent := range httpRouteInstance.Status.Parents { + if parent.ParentRef.Name != gwv1.ObjectName(gatewayInstance.Name) { + continue + } + if parent.ParentRef.Namespace != nil && *parent.ParentRef.Namespace != gwv1.Namespace(gatewayInstance.Namespace) { + continue + } + hasAccepted := false + hasResolved := false + + for _, condition := range parent.Conditions { + switch 
gwv1.RouteConditionType(condition.Type) { + case gwv1.RouteConditionAccepted: + if condition.Status == metav1.ConditionTrue { + hasAccepted = true + } + case gwv1.RouteConditionResolvedRefs: + if condition.Status == metav1.ConditionTrue { + hasResolved = true + } + } + } + if hasAccepted && hasResolved { + return true + } + } + return false +} + +func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool { + if !features.Enabled(features.RayServiceIncrementalUpgrade) { + return false + } + return spec != nil && spec.UpgradeStrategy != nil && + *spec.UpgradeStrategy.Type == rayv1.IncrementalUpgrade +} + +func GetRayServiceIncrementalUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.IncrementalUpgradeOptions { + if spec != nil && spec.UpgradeStrategy != nil { + return spec.UpgradeStrategy.IncrementalUpgradeOptions + } + return nil +} + +// addGatewayListenersForRayService is a helper function to returns Gateway Listeners +func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener { + listeners := make([]gwv1.Listener, 0, 1) + listenerName := fmt.Sprintf("%s-listener", rayServiceInstance.Name) + listener := gwv1.Listener{ + Name: gwv1.SectionName(listenerName), + Protocol: gwv1.HTTPProtocolType, // only support HTTP + Port: gwv1.PortNumber(int32(80)), + } + listeners = append(listeners, listener) + + return listeners +} + // Check where we are running. 
We are trying to distinguish here whether // this is vanilla kubernetes cluster or Openshift func GetClusterType() bool { diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 851e37af3ea..2d87c12ac46 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -12,9 +12,11 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) func TestGetClusterDomainName(t *testing.T) { @@ -1248,6 +1250,235 @@ func TestCalculateResources(t *testing.T) { } } +// helper function to return a Gateway object with GatewayStatus Conditions for testing. +func makeGatewayWithCondition(accepted bool, programmed bool) *gwv1.Gateway { + var conditions []metav1.Condition + + if accepted { + conditions = append(conditions, metav1.Condition{ + Type: string(gwv1.GatewayConditionAccepted), + Status: metav1.ConditionTrue, + }) + } + + if programmed { + conditions = append(conditions, metav1.Condition{ + Type: string(gwv1.GatewayConditionProgrammed), + Status: metav1.ConditionTrue, + }) + } + + return &gwv1.Gateway{ + Status: gwv1.GatewayStatus{ + Conditions: conditions, + }, + } +} + +func TestIsGatewayReady(t *testing.T) { + tests := []struct { + gateway *gwv1.Gateway + name string + expected bool + }{ + { + name: "missing Gateway instance", + gateway: nil, + expected: false, + }, + { + name: "Gateway created with Programmed condition only", + gateway: makeGatewayWithCondition(false, true), + expected: false, + }, + { + name: "Gateway created with Accepted condition only", + gateway: makeGatewayWithCondition(true, false), + expected: false, + }, + { + name: "Gateway 
created with both Accepted and Programmed conditions", + gateway: makeGatewayWithCondition(true, true), + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, IsGatewayReady(tt.gateway)) + }) + } +} + +// helper function to return a HTTPRoute with HTTPRouteStatus for testing +func makeHTTPRouteWithParentRef( + parentRefName string, + namespace string, + accepted bool, + resolvedRefs bool, +) *gwv1.HTTPRoute { + var acceptedStatus, resolvedRefsStatus metav1.ConditionStatus + if accepted { + acceptedStatus = metav1.ConditionTrue + } else { + acceptedStatus = metav1.ConditionFalse + } + if resolvedRefs { + resolvedRefsStatus = metav1.ConditionTrue + } else { + resolvedRefsStatus = metav1.ConditionFalse + } + + return &gwv1.HTTPRoute{ + Status: gwv1.HTTPRouteStatus{ + RouteStatus: gwv1.RouteStatus{ + Parents: []gwv1.RouteParentStatus{ + { + ParentRef: gwv1.ParentReference{ + Name: gwv1.ObjectName(parentRefName), + Namespace: ptr.To(gwv1.Namespace(namespace)), + }, + Conditions: []metav1.Condition{ + { + Type: string(gwv1.RouteConditionAccepted), + Status: acceptedStatus, + }, + { + Type: string(gwv1.RouteConditionResolvedRefs), + Status: resolvedRefsStatus, + }, + }, + }, + }, + }, + }, + } +} + +func TestIsHTTPRouteReady(t *testing.T) { + gateway := &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{Name: "test-gateway", Namespace: "test-ns"}, + } + + tests := []struct { + httpRoute *gwv1.HTTPRoute + name string + expected bool + }{ + { + name: "missing HTTPRoute", + httpRoute: nil, + expected: false, + }, + { + name: "ParentRef does not match", + httpRoute: makeHTTPRouteWithParentRef("not-a-match", "other-test-ns", true, true), + expected: false, + }, + { + name: "matching ParentRef with Accepted condition but without ResolvedRefs", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, false), + expected: false, + }, + { + name: "matching ParentRef with ResolvedRefs but without 
Accepted", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", false, true), + expected: false, + }, + { + name: "ready HTTPRoute with all required conditions", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, true), + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, IsHTTPRouteReady(gateway, tt.httpRoute)) + }) + } +} + +func TestIsIncrementalUpgradeEnabled(t *testing.T) { + tests := []struct { + spec *rayv1.RayServiceSpec + name string + featureEnabled bool + expected bool + }{ + { + name: "missing UpgradeStrategy Type", + spec: &rayv1.RayServiceSpec{}, + featureEnabled: true, + expected: false, + }, + { + name: "UpgradeStrategy Type is IncrementalUpgrade but feature disabled", + spec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + }, + }, + featureEnabled: false, + expected: false, + }, + { + name: "UpgradeStrategy Type is IncrementalUpgrade and feature enabled", + spec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + }, + }, + featureEnabled: true, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, tc.featureEnabled) + assert.Equal(t, tc.expected, IsIncrementalUpgradeEnabled(tc.spec)) + }) + } +} + +func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) { + upgradeOptions := &rayv1.IncrementalUpgradeOptions{GatewayClassName: "gateway-class"} + + tests := []struct { + rayServiceSpec *rayv1.RayServiceSpec + expectedOptions *rayv1.IncrementalUpgradeOptions + name string + }{ + { + name: "RayServiceSpec is nil, return nil IncrementalUpgradeOptions", + rayServiceSpec: nil, + expectedOptions: nil, + }, + { + name: "UpgradeStrategy is nil, return nil IncrementalUpgradeOptions", + 
rayServiceSpec: &rayv1.RayServiceSpec{}, + expectedOptions: nil, + }, + { + name: "Valid IncrementalUpgradeOptions", + rayServiceSpec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + IncrementalUpgradeOptions: upgradeOptions, + }, + }, + expectedOptions: upgradeOptions, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actualOptions := GetRayServiceIncrementalUpgradeOptions(tt.rayServiceSpec) + assert.Equal(t, tt.expectedOptions, actualOptions) + }) + } +} + func TestGetContainerCommand(t *testing.T) { tests := []struct { name string diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 74d2b4fe0e6..a9debeac7a9 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -286,12 +286,13 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set") } - // only NewCluster and None are valid upgradeType + // only IncrementalUpgrade, NewCluster, and None are valid upgradeType if rayService.Spec.UpgradeStrategy != nil && rayService.Spec.UpgradeStrategy.Type != nil && *rayService.Spec.UpgradeStrategy.Type != rayv1.None && - *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster { - return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewCluster, rayv1.None) + *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster && + *rayService.Spec.UpgradeStrategy.Type != rayv1.IncrementalUpgrade { + return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.IncrementalUpgrade, rayv1.NewCluster, rayv1.None) } if rayService.Spec.RayClusterDeletionDelaySeconds != nil && @@ -299,5 +300,40 @@ func 
ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds) } + // If type is IncrementalUpgrade, validate the IncrementalUpgradeOptions + if IsIncrementalUpgradeEnabled(&rayService.Spec) { + return ValidateIncrementalUpgradeOptions(rayService) + } + + return nil +} + +func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error { + if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) { + return fmt.Errorf("Ray Autoscaler is required for IncrementalUpgrade") + } + + options := rayService.Spec.UpgradeStrategy.IncrementalUpgradeOptions + if options == nil { + return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade") + } + + // MaxSurgePercent defaults to 100% if unset. + if *options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100 { + return fmt.Errorf("maxSurgePercent must be between 0 and 100") + } + + if options.StepSizePercent == nil || *options.StepSizePercent < 0 || *options.StepSizePercent > 100 { + return fmt.Errorf("stepSizePercent must be between 0 and 100") + } + + if options.IntervalSeconds == nil || *options.IntervalSeconds <= 0 { + return fmt.Errorf("intervalSeconds must be greater than 0") + } + + if options.GatewayClassName == "" { + return fmt.Errorf("gatewayClassName is required for IncrementalUpgrade") + } + return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index dc464424f40..7debd97e6d2 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1229,3 +1229,112 @@ func createBasicRayClusterSpec() *rayv1.RayClusterSpec { }, } } + +func TestValidateIncrementalUpgradeOptions(t *testing.T) { + tests := []struct { + maxSurgePercent *int32 + stepSizePercent *int32 + intervalSeconds *int32 + name string + 
gatewayClassName string + spec rayv1.RayServiceSpec + enableAutoscaling bool + expectError bool + }{ + { + name: "valid config", + maxSurgePercent: ptr.To(int32(50)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: false, + }, + { + name: "missing autoscaler", + maxSurgePercent: ptr.To(int32(50)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: false, + expectError: true, + }, + { + name: "missing options", + enableAutoscaling: true, + expectError: true, + }, + { + name: "invalid MaxSurgePercent", + maxSurgePercent: ptr.To(int32(200)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "missing StepSizePercent", + maxSurgePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "invalid IntervalSeconds", + maxSurgePercent: ptr.To(int32(50)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(0)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "missing GatewayClassName", + maxSurgePercent: ptr.To(int32(50)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + enableAutoscaling: true, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var upgradeStrategy *rayv1.RayServiceUpgradeStrategy + if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" { + upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + MaxSurgePercent: tt.maxSurgePercent, + StepSizePercent: tt.stepSizePercent, 
+ IntervalSeconds: tt.intervalSeconds, + GatewayClassName: tt.gatewayClassName, + }, + } + } else if tt.expectError { + upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + } + } + + rayClusterSpec := *createBasicRayClusterSpec() + rayClusterSpec.EnableInTreeAutoscaling = ptr.To(tt.enableAutoscaling) + + rayService := &rayv1.RayService{ + Spec: rayv1.RayServiceSpec{ + RayClusterSpec: rayClusterSpec, + UpgradeStrategy: upgradeStrategy, + }, + } + + err := ValidateIncrementalUpgradeOptions(rayService) + if tt.expectError { + require.Error(t, err, tt.name) + } else { + require.NoError(t, err, tt.name) + } + }) + } +} diff --git a/ray-operator/go.mod b/ray-operator/go.mod index 94d155da29f..78f3870ae24 100644 --- a/ray-operator/go.mod +++ b/ray-operator/go.mod @@ -4,22 +4,21 @@ go 1.24.0 require ( github.com/Masterminds/semver/v3 v3.3.1 + github.com/coder/websocket v1.8.13 github.com/go-logr/logr v1.4.3 github.com/go-logr/zapr v1.3.0 - github.com/google/go-cmp v0.7.0 github.com/jarcoal/httpmock v1.4.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 github.com/orcaman/concurrent-map/v2 v2.0.1 - github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.22.0 + github.com/spf13/pflag v1.0.6 github.com/stretchr/testify v1.10.0 go.uber.org/mock v0.5.2 go.uber.org/zap v1.27.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 k8s.io/api v0.33.1 - k8s.io/apiextensions-apiserver v0.33.1 k8s.io/apimachinery v0.33.1 k8s.io/apiserver v0.33.1 k8s.io/client-go v0.33.1 @@ -28,6 +27,7 @@ require ( k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 + sigs.k8s.io/gateway-api v1.3.0 sigs.k8s.io/scheduler-plugins v0.31.8 sigs.k8s.io/structured-merge-diff/v4 v4.7.0 sigs.k8s.io/yaml v1.4.0 @@ -38,19 +38,19 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect 
github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/coder/websocket v1.8.13 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -62,11 +62,11 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/otel v1.33.0 // indirect @@ -74,19 +74,20 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.39.0 // indirect 
golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect + golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/term v0.31.0 // indirect + golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.31.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/protobuf v1.36.5 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiextensions-apiserver v0.33.1 // indirect k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect diff --git a/ray-operator/go.sum b/ray-operator/go.sum index 6d6e0b27493..2d1825ab836 100644 --- a/ray-operator/go.sum +++ b/ray-operator/go.sum @@ -10,13 +10,12 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coder/websocket v1.8.13 h1:f3QZdXy7uGVz+4uCJy2nTZyM0yTBj8yANEHhqlXZ9FE= github.com/coder/websocket v1.8.13/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful/v3 v3.11.0 
h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= -github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= +github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -29,12 +28,10 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= 
@@ -67,11 +64,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= @@ -116,17 +110,12 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= @@ -158,26 +147,26 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -192,8 +181,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T 
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= -google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -203,7 +192,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= @@ -230,6 +218,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= sigs.k8s.io/controller-runtime v0.21.0/go.mod 
h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= +sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= diff --git a/ray-operator/main.go b/ray-operator/main.go index a10c27ee367..1022f8e7577 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -27,6 +27,7 @@ import ( k8szap "sigs.k8s.io/controller-runtime/pkg/log/zap" ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" @@ -191,6 +192,10 @@ func main() { } features.LogFeatureGates(setupLog) + if features.Enabled(features.RayServiceIncrementalUpgrade) { + utilruntime.Must(gwv1.AddToScheme(scheme)) + } + // Manager options options := ctrl.Options{ Cache: cache.Options{ diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go new file mode 100644 index 00000000000..a736a964cdb --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go @@ -0,0 +1,50 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +// IncrementalUpgradeOptionsApplyConfiguration represents a declarative configuration of the IncrementalUpgradeOptions type for use +// with apply. 
+type IncrementalUpgradeOptionsApplyConfiguration struct { + MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"` + StepSizePercent *int32 `json:"stepSizePercent,omitempty"` + IntervalSeconds *int32 `json:"intervalSeconds,omitempty"` + GatewayClassName *string `json:"gatewayClassName,omitempty"` +} + +// IncrementalUpgradeOptionsApplyConfiguration constructs a declarative configuration of the IncrementalUpgradeOptions type for use with +// apply. +func IncrementalUpgradeOptions() *IncrementalUpgradeOptionsApplyConfiguration { + return &IncrementalUpgradeOptionsApplyConfiguration{} +} + +// WithMaxSurgePercent sets the MaxSurgePercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MaxSurgePercent field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.MaxSurgePercent = &value + return b +} + +// WithStepSizePercent sets the StepSizePercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the StepSizePercent field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.StepSizePercent = &value + return b +} + +// WithIntervalSeconds sets the IntervalSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the IntervalSeconds field is set to the value of the last call. 
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.IntervalSeconds = &value + return b +} + +// WithGatewayClassName sets the GatewayClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GatewayClassName field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *IncrementalUpgradeOptionsApplyConfiguration { + b.GatewayClassName = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go index b0fcd8032bb..2d7f2984cef 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go @@ -2,12 +2,19 @@ package v1 +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + // RayServiceStatusApplyConfiguration represents a declarative configuration of the RayServiceStatus type for use // with apply. 
type RayServiceStatusApplyConfiguration struct { - Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"` - RayClusterName *string `json:"rayClusterName,omitempty"` - RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"` + Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"` + TargetCapacity *int32 `json:"targetCapacity,omitempty"` + TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` + RayClusterName *string `json:"rayClusterName,omitempty"` + RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"` } // RayServiceStatusApplyConfiguration constructs a declarative configuration of the RayServiceStatus type for use with @@ -30,6 +37,30 @@ func (b *RayServiceStatusApplyConfiguration) WithApplications(entries map[string return b } +// WithTargetCapacity sets the TargetCapacity field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TargetCapacity field is set to the value of the last call. +func (b *RayServiceStatusApplyConfiguration) WithTargetCapacity(value int32) *RayServiceStatusApplyConfiguration { + b.TargetCapacity = &value + return b +} + +// WithTrafficRoutedPercent sets the TrafficRoutedPercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TrafficRoutedPercent field is set to the value of the last call. 
+func (b *RayServiceStatusApplyConfiguration) WithTrafficRoutedPercent(value int32) *RayServiceStatusApplyConfiguration { + b.TrafficRoutedPercent = &value + return b +} + +// WithLastTrafficMigratedTime sets the LastTrafficMigratedTime field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the LastTrafficMigratedTime field is set to the value of the last call. +func (b *RayServiceStatusApplyConfiguration) WithLastTrafficMigratedTime(value metav1.Time) *RayServiceStatusApplyConfiguration { + b.LastTrafficMigratedTime = &value + return b +} + // WithRayClusterName sets the RayClusterName field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the RayClusterName field is set to the value of the last call. diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go index 361a98f6ac9..0a190883bff 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go @@ -9,7 +9,8 @@ import ( // RayServiceUpgradeStrategyApplyConfiguration represents a declarative configuration of the RayServiceUpgradeStrategy type for use // with apply. 
type RayServiceUpgradeStrategyApplyConfiguration struct { - Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` + Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` + IncrementalUpgradeOptions *IncrementalUpgradeOptionsApplyConfiguration `json:"incrementalUpgradeOptions,omitempty"` } // RayServiceUpgradeStrategyApplyConfiguration constructs a declarative configuration of the RayServiceUpgradeStrategy type for use with @@ -25,3 +26,11 @@ func (b *RayServiceUpgradeStrategyApplyConfiguration) WithType(value rayv1.RaySe b.Type = &value return b } + +// WithIncrementalUpgradeOptions sets the IncrementalUpgradeOptions field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the IncrementalUpgradeOptions field is set to the value of the last call. +func (b *RayServiceUpgradeStrategyApplyConfiguration) WithIncrementalUpgradeOptions(value *IncrementalUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration { + b.IncrementalUpgradeOptions = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 23e455d739a..e46530b7582 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -30,6 +30,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.HeadGroupSpecApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("HeadInfo"): return &rayv1.HeadInfoApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("IncrementalUpgradeOptions"): + return &rayv1.IncrementalUpgradeOptionsApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayCluster"): return &rayv1.RayClusterApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayClusterSpec"): diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go index 
2abea2ffbbb..ce5734cee0a 100644 --- a/ray-operator/pkg/features/features.go +++ b/ray-operator/pkg/features/features.go @@ -24,6 +24,13 @@ const ( // // Enables new deletion policy API in RayJob RayJobDeletionPolicy featuregate.Feature = "RayJobDeletionPolicy" + + // owner: @ryanaoleary + // rep: N/A + // alpha: v1.0 + // + // Enabled incremental upgrades for RayService zero-downtime upgrades. + RayServiceIncrementalUpgrade featuregate.Feature = "RayServiceIncrementalUpgrade" ) func init() { @@ -31,8 +38,9 @@ func init() { } var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ - RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, - RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, + RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, + RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, + RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha}, } // SetFeatureGateDuringTest is a helper method to override feature gates in tests. diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go new file mode 100644 index 00000000000..9ce4e87777d --- /dev/null +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -0,0 +1,320 @@ +package e2eincrementalupgrade + +import ( + "fmt" + "strings" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" + + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + "github.com/ray-project/kuberay/ray-operator/pkg/features" + . 
"github.com/ray-project/kuberay/ray-operator/test/support" +) + +// helper function to get RayCluster head service external IP to use to poll the RayService +func GetHeadServiceExternalIP(t *testing.T, clusterName, namespace string) (string, error) { + test := With(t) + + svc, err := test.Client().Core().CoreV1().Services(namespace).Get(test.Ctx(), clusterName+"-head-svc", metav1.GetOptions{}) + if err != nil { + return "", err + } + if len(svc.Status.LoadBalancer.Ingress) == 0 { + return "", fmt.Errorf("no ingress for service %s", svc.Name) + } + return svc.Status.LoadBalancer.Ingress[0].IP, nil +} + +func TestRayServiceIncrementalUpgrade(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + test := With(t) + g := NewWithT(t) + + namespace := test.NewTestNamespace() + rayServiceName := "incremental-rayservice" + + // Create a RayService with IncrementalUpgrade enabled + stepSize := ptr.To(int32(25)) + interval := ptr.To(int32(10)) + maxSurge := ptr.To(int32(50)) + + rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name). + WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge)) + rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayService).NotTo(BeNil()) + + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium). + Should(WithTransform(IsRayServiceReady, BeTrue())) + + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + // Validate Gateway and HTTPRoute objects have been created for incremental upgrade. 
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") + LogWithTimestamp(test.T(), "Waiting for Gateway %s/%s to be ready", rayService.Namespace, gatewayName) + g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium). + Should(WithTransform(utils.IsGatewayReady, BeTrue())) + + // Get the Gateway endpoint to send requests to + gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway")) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(gateway).NotTo(BeNil()) + + httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) + LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName) + g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium). + Should(Not(BeNil())) + + httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue()) + + // Create curl pod to test traffic routing through Gateway to RayService + curlPodName := "curl-pod" + curlContainerName := "curl-container" + curlPod, err := CreateCurlPod(g, test, curlPodName, curlContainerName, namespace.Name) + g.Expect(err).NotTo(HaveOccurred()) + + LogWithTimestamp(test.T(), "Waiting for Curl Pod %s to be ready", curlPodName) + g.Eventually(func(g Gomega) *corev1.Pod { + updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + return updatedPod + }, TestTimeoutShort).Should(WithTransform(IsPodRunningAndReady, BeTrue())) + + gatewayIP := GetGatewayIP(gateway) + g.Expect(gatewayIP).NotTo(BeEmpty()) + + LogWithTimestamp(test.T(), "Verifying RayService is serving traffic") + stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Equal("6")) + stdout, _ = 
CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) + g.Expect(stdout.String()).To(Equal("15 pizzas please!")) + + // Trigger incremental upgrade by updating RayService serve config and RayCluster spec + rayService, err = GetRayService(test, namespace.Name, rayService.Name) + g.Expect(err).NotTo(HaveOccurred()) + + rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m") + serveConfig := rayService.Spec.ServeConfigV2 + serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) + serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) + rayService.Spec.ServeConfigV2 = serveConfig + _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update( + test.Ctx(), + rayService, + metav1.UpdateOptions{}, + ) + g.Expect(err).NotTo(HaveOccurred()) + + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue())) + + LogWithTimestamp(test.T(), "Verifying temporary service creation and HTTPRoute backends") + upgradingRaySvc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + activeClusterName := upgradingRaySvc.Status.ActiveServiceStatus.RayClusterName + g.Expect(activeClusterName).NotTo(BeEmpty(), "The active cluster should be set when a RayService is ready.") + pendingClusterName := upgradingRaySvc.Status.PendingServiceStatus.RayClusterName + g.Expect(pendingClusterName).NotTo(BeEmpty(), "The controller should have created a pending cluster.") + + // Validate serve service for the active cluster exists. 
+ activeServeSvcName := utils.GenerateServeServiceName(activeClusterName) + _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), activeServeSvcName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred(), "The serve service for the active cluster should be created.") + + // Validate serve service for the pending cluster has been created for the upgrade. + pendingServeSvcName := utils.GenerateServeServiceName(pendingClusterName) + g.Eventually(func(g Gomega) { + _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), pendingServeSvcName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred(), "The serve service for the pending cluster should be created.") + }, TestTimeoutShort).Should(Succeed()) + + // Verify HTTPRoute is pointing to the correct two backends. + g.Eventually(func(g Gomega) { + route, err := GetHTTPRoute(test, namespace.Name, httpRouteName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(route.Spec.Rules).To(HaveLen(1)) + g.Expect(route.Spec.Rules[0].BackendRefs).To(HaveLen(2)) + g.Expect(string(route.Spec.Rules[0].BackendRefs[0].Name)).To(Equal(activeServeSvcName)) + g.Expect(string(route.Spec.Rules[0].BackendRefs[1].Name)).To(Equal(pendingServeSvcName)) + }, TestTimeoutShort).Should(Succeed()) + + LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration") + intervalSeconds := *interval + var lastMigratedTime *metav1.Time + + // Validate expected behavior during an IncrementalUpgrade. The following checks ensures + // that no requests are dropped throughout the upgrade process. + upgradeSteps := generateUpgradeSteps(*stepSize, *maxSurge) + for _, step := range upgradeSteps { + LogWithTimestamp(test.T(), "%s", step.name) + g.Eventually(func(g Gomega) int32 { + // Fetch updated RayService. + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + // Send a request to the RayService to validate no requests are dropped. 
+ stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Equal("6")) + + return step.getValue(svc) + }, TestTimeoutShort).Should(Equal(step.expectedValue)) + + if strings.Contains(step.name, "pending traffic to shift") { + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + currentMigratedTime := svc.Status.PendingServiceStatus.LastTrafficMigratedTime + g.Expect(currentMigratedTime).NotTo(BeNil()) + + // Verify IntervalSeconds have passed since last TrafficRoutedPercent update. + if lastMigratedTime != nil { + duration := currentMigratedTime.Sub(lastMigratedTime.Time) + g.Expect(duration).To(BeNumerically(">=", intervalSeconds), + "Time between traffic steps should be >= IntervalSeconds") + } + lastMigratedTime = currentMigratedTime + } + } + // Check that RayService completed upgrade + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be false", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse())) + + LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes") + stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Equal("8")) +} + +func TestRayServiceIncrementalUpgradeRollback(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + test := With(t) + g := NewWithT(t) + + namespace := test.NewTestNamespace() + rayServiceName := "rollback-rayservice" + + // Create a RayService with IncrementalUpgrade enabled + stepSize := ptr.To(int32(25)) + interval := ptr.To(int32(10)) + maxSurge := ptr.To(int32(50)) + + rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name). 
+ WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge)) + rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayService).NotTo(BeNil()) + + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium). + Should(WithTransform(IsRayServiceReady, BeTrue())) + + // Copy original spec to use to trigger a rollback later. + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + originalSpec := rayService.Spec.DeepCopy() + + // Verify Gateway and HTTPRoute are ready. + gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") + g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium). + Should(WithTransform(utils.IsGatewayReady, BeTrue())) + + gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway")) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(gateway).NotTo(BeNil()) + + httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) + LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName) + g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium). + Should(Not(BeNil())) + + httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue()) + + // Trigger an incremental upgrade through a change to the RayCluster spec. 
+ LogWithTimestamp(test.T(), "Triggering an upgrade for RayService %s/%s", rayService.Namespace, rayService.Name) + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m") + _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(test.Ctx(), rayService, metav1.UpdateOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue())) + + // Wait for the upgrade to be underway with traffic partially migrated. + LogWithTimestamp(test.T(), "Waiting for upgrade to be partially complete") + g.Eventually(func(g Gomega) { + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(svc.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(*svc.Status.PendingServiceStatus.TrafficRoutedPercent).Should(BeNumerically(">", 0)) + }, TestTimeoutMedium).Should(Succeed()) + + // Trigger a rollback by updating the spec back to the original version. + LogWithTimestamp(test.T(), "Triggering a rollback for RayService %s/%s", rayService.Namespace, rayService.Name) + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + rayService.Spec = *originalSpec + _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(test.Ctx(), rayService, metav1.UpdateOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + + // Verify that the controller enters the rollback state. 
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s RollbackInProgress condition to be true", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceRollingBack, BeTrue())) + + // Verify that traffic gradually shifts back to the active cluster. + LogWithTimestamp(test.T(), "Verifying traffic shifts back to the active cluster") + g.Eventually(func(g Gomega) { + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(svc.Status.ActiveServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(*svc.Status.ActiveServiceStatus.TrafficRoutedPercent).Should(Equal(int32(100))) + g.Expect(svc.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(*svc.Status.PendingServiceStatus.TrafficRoutedPercent).Should(Equal(int32(0))) + }, TestTimeoutMedium).Should(Succeed()) + + // Verify that the rollback completes and the pending cluster is cleaned up. + LogWithTimestamp(test.T(), "Waiting for rollback to complete and pending cluster to be deleted") + g.Eventually(func(g Gomega) { + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + // Rollback is done when both conditions are false and pending status is empty. + g.Expect(IsRayServiceRollingBack(svc)).To(BeFalse()) + g.Expect(IsRayServiceUpgrading(svc)).To(BeFalse()) + g.Expect(svc.Status.PendingServiceStatus.RayClusterName).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + + // Check that the pending RayCluster resource is deleted. 
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + pendingClusterName := rayService.Status.PendingServiceStatus.RayClusterName + if pendingClusterName != "" { + g.Eventually(func() error { + _, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), pendingClusterName, metav1.GetOptions{}) + return err + }, TestTimeoutShort).Should(WithTransform(errors.IsNotFound, BeTrue())) + } + + // The HTTPRoute should now only have one backend after the rollback completes. + g.Eventually(HTTPRoute(test, namespace.Name, httpRouteName), TestTimeoutShort). + Should(WithTransform(func(route *gwv1.HTTPRoute) int { + if route == nil || len(route.Spec.Rules) == 0 { + return 0 + } + return len(route.Spec.Rules[0].BackendRefs) + }, Equal(1))) +} diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go new file mode 100644 index 00000000000..68c9e96460e --- /dev/null +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -0,0 +1,245 @@ +package e2eincrementalupgrade + +import ( + "bytes" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1ac "k8s.io/client-go/applyconfigurations/core/v1" + "k8s.io/utils/ptr" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + . 
"github.com/ray-project/kuberay/ray-operator/test/support" +) + +func CurlRayServiceGateway( + t Test, + gatewayIP string, + curlPod *corev1.Pod, + curlPodContainerName, + rayServicePath, + body string, +) (bytes.Buffer, bytes.Buffer) { + cmd := []string{ + "curl", + "--max-time", "10", + "-X", "POST", + "-H", "Content-Type: application/json", + fmt.Sprintf("%s:80%s", gatewayIP, rayServicePath), + "-d", body, + } + + return ExecPodCmd(t, curlPod, curlPodContainerName, cmd) +} + +func IncrementalUpgradeRayServiceApplyConfiguration( + stepSizePercent, intervalSeconds, maxSurgePercent *int32, +) *rayv1ac.RayServiceSpecApplyConfiguration { + return rayv1ac.RayServiceSpec(). + WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). + WithType(rayv1.IncrementalUpgrade). + WithIncrementalUpgradeOptions( + rayv1ac.IncrementalUpgradeOptions(). + WithGatewayClassName("istio"). + WithStepSizePercent(*stepSizePercent). + WithIntervalSeconds(*intervalSeconds). + WithMaxSurgePercent(*maxSurgePercent), + )). 
+ WithServeConfigV2(`applications: + - name: fruit_app + import_path: fruit.deployment_graph + route_prefix: /fruit + runtime_env: + working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip" + deployments: + - name: MangoStand + num_replicas: 1 + user_config: + price: 3 + ray_actor_options: + num_cpus: 0.1 + - name: OrangeStand + num_replicas: 1 + user_config: + price: 2 + ray_actor_options: + num_cpus: 0.1 + - name: FruitMarket + num_replicas: 1 + ray_actor_options: + num_cpus: 0.1 + - name: math_app + import_path: conditional_dag.serve_dag + route_prefix: /calc + runtime_env: + working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip" + deployments: + - name: Adder + num_replicas: 1 + user_config: + increment: 3 + ray_actor_options: + num_cpus: 0.1 + - name: Multiplier + num_replicas: 1 + user_config: + factor: 5 + ray_actor_options: + num_cpus: 0.1 + - name: Router + ray_actor_options: + num_cpus: 0.1 + num_replicas: 1`). + WithRayClusterSpec(rayv1ac.RayClusterSpec(). + WithRayVersion(GetRayVersion()). + WithEnableInTreeAutoscaling(true). + WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). + WithRayStartParams(map[string]string{"dashboard-host": "0.0.0.0"}). + WithTemplate(corev1ac.PodTemplateSpec(). + WithSpec(corev1ac.PodSpec(). + WithRestartPolicy(corev1.RestartPolicyNever). + WithContainers(corev1ac.Container(). + WithName("ray-head"). + WithImage(GetRayImage()). + WithEnv(corev1ac.EnvVar().WithName(utils.RAY_ENABLE_AUTOSCALER_V2).WithValue("1")). 
+ WithPorts( + corev1ac.ContainerPort().WithName(utils.GcsServerPortName).WithContainerPort(utils.DefaultGcsServerPort), + corev1ac.ContainerPort().WithName(utils.ServingPortName).WithContainerPort(utils.DefaultServingPort), + corev1ac.ContainerPort().WithName(utils.DashboardPortName).WithContainerPort(utils.DefaultDashboardPort), + corev1ac.ContainerPort().WithName(utils.ClientPortName).WithContainerPort(utils.DefaultClientPort), + ). + WithResources(corev1ac.ResourceRequirements(). + WithRequests(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("3Gi"), + }). + WithLimits(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("3Gi"), + })))))). + WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec(). + WithReplicas(1). + WithMinReplicas(1). + WithMaxReplicas(4). + WithRayStartParams(map[string]string{"num-cpus": "1"}). + WithGroupName("small-group"). + WithTemplate(corev1ac.PodTemplateSpec(). + WithSpec(corev1ac.PodSpec(). + WithRestartPolicy(corev1.RestartPolicyNever). + WithContainers(corev1ac.Container(). + WithName("ray-worker"). + WithImage(GetRayImage()). + WithResources(corev1ac.ResourceRequirements(). + WithRequests(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("300m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }). 
+ WithLimits(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + })))))), + ) +} + +// GetGatewayIP retrieves the external IP for a Gateway object +func GetGatewayIP(gateway *gwv1.Gateway) string { + if gateway == nil { + return "" + } + for _, addr := range gateway.Status.Addresses { + if addr.Type == nil || *addr.Type == gwv1.IPAddressType { + return addr.Value + } + } + + return "" +} + +func GetPendingCapacity(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.PendingServiceStatus.TargetCapacity, 0) +} + +func GetPendingTraffic(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.PendingServiceStatus.TrafficRoutedPercent, 0) +} + +func GetActiveCapacity(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.ActiveServiceStatus.TargetCapacity, 100) +} + +func GetActiveTraffic(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.ActiveServiceStatus.TrafficRoutedPercent, 100) +} + +func GetLastTrafficMigratedTime(rs *rayv1.RayService) *metav1.Time { + return rs.Status.ActiveServiceStatus.LastTrafficMigratedTime +} + +// testStep defines a validation condition to wait for during the upgrade. +type testStep struct { + getValue func(rs *rayv1.RayService) int32 + name string + expectedValue int32 +} + +// generateUpgradeSteps is a helper function for testing that the controller follows the expected +// sequence of updates to TrafficRoutedPercent and TargetCapacity during an incremental upgrade. +func generateUpgradeSteps(stepSize, maxSurge int32) []testStep { + var steps []testStep + + pendingCapacity := int32(0) + pendingTraffic := int32(0) + activeCapacity := int32(100) + activeTraffic := int32(100) + + for pendingTraffic < 100 { + // Scale up the pending cluster's TargetCapacity. 
+ if pendingTraffic == pendingCapacity { + nextPendingCapacity := min(pendingCapacity+maxSurge, 100) + if nextPendingCapacity > pendingCapacity { + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for pending capacity to scale up to %d", nextPendingCapacity), + getValue: GetPendingCapacity, + expectedValue: nextPendingCapacity, + }) + pendingCapacity = nextPendingCapacity + } + } + + // Shift traffic over from the active to the pending cluster by StepSizePercent. + for pendingTraffic < pendingCapacity { + nextPendingTraffic := min(pendingTraffic+stepSize, 100) + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for pending traffic to shift to %d", nextPendingTraffic), + getValue: GetPendingTraffic, + expectedValue: nextPendingTraffic, + }) + pendingTraffic = nextPendingTraffic + + nextActiveTraffic := max(activeTraffic-stepSize, 0) + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for active traffic to shift down to %d", nextActiveTraffic), + getValue: GetActiveTraffic, + expectedValue: nextActiveTraffic, + }) + activeTraffic = nextActiveTraffic + } + + // Scale down the active cluster's target capacity. 
+ nextActiveCapacity := max(activeCapacity-maxSurge, 0) + if nextActiveCapacity < activeCapacity { + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for active capacity to scale down to %d", nextActiveCapacity), + getValue: GetActiveCapacity, + expectedValue: nextActiveCapacity, + }) + activeCapacity = nextActiveCapacity + } + } + return steps +} diff --git a/ray-operator/test/support/client.go b/ray-operator/test/support/client.go index 2e313483966..4925184d46b 100644 --- a/ray-operator/test/support/client.go +++ b/ray-operator/test/support/client.go @@ -8,6 +8,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + gatewayclient "sigs.k8s.io/gateway-api/pkg/client/clientset/versioned" rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned" ) @@ -17,6 +18,7 @@ type Client interface { Ray() rayclient.Interface Dynamic() dynamic.Interface Config() rest.Config + Gateway() gatewayclient.Interface } type testClient struct { @@ -24,6 +26,7 @@ type testClient struct { ray rayclient.Interface dynamic dynamic.Interface config rest.Config + gateway gatewayclient.Interface } var _ Client = (*testClient)(nil) @@ -44,6 +47,10 @@ func (t *testClient) Config() rest.Config { return t.config } +func (t *testClient) Gateway() gatewayclient.Interface { + return t.gateway +} + func newTestClient() (Client, error) { cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( clientcmd.NewDefaultClientConfigLoadingRules(), @@ -68,10 +75,16 @@ func newTestClient() (Client, error) { return nil, err } + gatewayClient, err := gatewayclient.NewForConfig(cfg) + if err != nil { + return nil, err + } + return &testClient{ core: kubeClient, ray: rayClient, dynamic: dynamicClient, config: *cfg, + gateway: gatewayClient, }, nil } diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go index ffea3c75d87..162910081d6 100644 --- 
a/ray-operator/test/support/ray.go +++ b/ray-operator/test/support/ray.go @@ -9,6 +9,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -215,6 +216,10 @@ func IsRayServiceUpgrading(service *rayv1.RayService) bool { return meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.UpgradeInProgress)) } +func IsRayServiceRollingBack(service *rayv1.RayService) bool { + return meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.RollbackInProgress)) +} + func RayServicesNumEndPoints(service *rayv1.RayService) int32 { return service.Status.NumServeEndpoints } @@ -226,3 +231,23 @@ func GetRayClusterWorkerGroupReplicaSum(cluster *rayv1.RayCluster) int32 { } return replicas } + +func GetHTTPRoute(t Test, namespace, name string) (*gwv1.HTTPRoute, error) { + return t.Client().Gateway().GatewayV1().HTTPRoutes(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) +} + +func HTTPRoute(t Test, namespace, name string) func() (*gwv1.HTTPRoute, error) { + return func() (*gwv1.HTTPRoute, error) { + return GetHTTPRoute(t, namespace, name) + } +} + +func GetGateway(t Test, namespace, name string) (*gwv1.Gateway, error) { + return t.Client().Gateway().GatewayV1().Gateways(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) +} + +func Gateway(t Test, namespace, name string) func() (*gwv1.Gateway, error) { + return func() (*gwv1.Gateway, error) { + return GetGateway(t, namespace, name) + } +}