diff --git a/docs/reference/api.md b/docs/reference/api.md
index 4b495fef69e..6c9808c0c3c 100644
--- a/docs/reference/api.md
+++ b/docs/reference/api.md
@@ -142,6 +142,25 @@ _Appears in:_
| `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | |
+#### IncrementalUpgradeOptions
+
+
+
+
+
+
+
+_Appears in:_
+- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.
Defaults to 100%. | 100 | |
+| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | |
+| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | |
+| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | |
+
+
#### JobSubmissionMode
@@ -319,7 +338,8 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | |
+| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports
`NewCluster`, `IncrementalUpgrade`, and `None`. | | |
+| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. | | |
#### RayServiceUpgradeType
diff --git a/go.mod b/go.mod
index 472e6d593df..e93dc132eda 100644
--- a/go.mod
+++ b/go.mod
@@ -73,7 +73,7 @@ require (
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
- github.com/mattn/go-isatty v0.0.19 // indirect
+ github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/term v0.5.0 // indirect
@@ -95,12 +95,12 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
- golang.org/x/net v0.38.0 // indirect
+ golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
- golang.org/x/sync v0.12.0 // indirect
+ golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
- golang.org/x/term v0.30.0 // indirect
- golang.org/x/text v0.23.0 // indirect
+ golang.org/x/term v0.31.0 // indirect
+ golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.10.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
@@ -112,6 +112,7 @@ require (
k8s.io/component-base v0.33.1 // indirect
k8s.io/component-helpers v0.33.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
+ sigs.k8s.io/gateway-api v1.3.0 // indirect
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
sigs.k8s.io/kustomize/api v0.19.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
diff --git a/go.sum b/go.sum
index dddab9f7e86..22e4f1113d9 100644
--- a/go.sum
+++ b/go.sum
@@ -139,8 +139,9 @@ github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUt
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0=
github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0=
github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
@@ -263,8 +264,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
+golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
@@ -274,8 +275,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
+golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -292,12 +293,12 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
+golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4=
golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -380,6 +381,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
+sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M=
+sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk=
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE=
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ=
diff --git a/helm-chart/kuberay-operator/README.md b/helm-chart/kuberay-operator/README.md
index 6837698d597..43ea4144af3 100644
--- a/helm-chart/kuberay-operator/README.md
+++ b/helm-chart/kuberay-operator/README.md
@@ -165,6 +165,8 @@ spec:
| featureGates[0].enabled | bool | `true` | |
| featureGates[1].name | string | `"RayJobDeletionPolicy"` | |
| featureGates[1].enabled | bool | `false` | |
+| featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | |
+| featureGates[2].enabled | bool | `false` | |
| metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. |
| metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor |
| metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval |
diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
index a86457fac1a..41bda880d9a 100644
--- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
+++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
@@ -8225,6 +8225,25 @@ spec:
type: integer
upgradeStrategy:
properties:
+ incrementalUpgradeOptions:
+ properties:
+ gatewayClassName:
+ type: string
+ intervalSeconds:
+ format: int32
+ type: integer
+ maxSurgePercent:
+ default: 100
+ format: int32
+ type: integer
+ stepSizePercent:
+ format: int32
+ type: integer
+ required:
+ - gatewayClassName
+ - intervalSeconds
+ - stepSizePercent
+ type: object
type:
type: string
type: object
@@ -8253,6 +8272,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8367,6 +8389,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
conditions:
items:
@@ -8436,6 +8464,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8550,6 +8581,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
serviceStatus:
type: string
diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl
index 5d14510a61b..d5e0e7352d0 100644
--- a/helm-chart/kuberay-operator/templates/_helpers.tpl
+++ b/helm-chart/kuberay-operator/templates/_helpers.tpl
@@ -222,6 +222,17 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - gateway.networking.k8s.io
+ resources:
+ - gateways
+ - httproutes
+ verbs:
+ - create
+ - get
+ - list
+ - update
+ - watch
- apiGroups:
- networking.k8s.io
resources:
diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml
index 6010d7f2b3e..f1464ba3a30 100644
--- a/helm-chart/kuberay-operator/values.yaml
+++ b/helm-chart/kuberay-operator/values.yaml
@@ -88,6 +88,8 @@ featureGates:
enabled: true
- name: RayJobDeletionPolicy
enabled: false
+- name: RayServiceIncrementalUpgrade
+ enabled: false
# Configurations for KubeRay operator metrics.
metrics:
diff --git a/ray-operator/Makefile b/ray-operator/Makefile
index 3eda8a616c4..04451030ad2 100644
--- a/ray-operator/Makefile
+++ b/ray-operator/Makefile
@@ -76,8 +76,16 @@ test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler
test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests.
go test -timeout 30m -v $(WHAT)
+test-e2e-rayservice: WHAT ?= ./test/e2erayservice
+test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests.
+ go test -timeout 30m -v $(WHAT)
+
test-e2e-upgrade: WHAT ?= ./test/e2eupgrade
-test-e2e-upgrade: manifests fmt vet ## Run e2e tests.
+test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests.
+ go test -timeout 30m -v $(WHAT)
+
+test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade
+test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests.
go test -timeout 30m -v $(WHAT)
test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter
diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go
index e7d73e07d8e..e331fba27ae 100644
--- a/ray-operator/apis/ray/v1/rayservice_types.go
+++ b/ray-operator/apis/ray/v1/rayservice_types.go
@@ -22,6 +22,9 @@ const (
type RayServiceUpgradeType string
const (
+	// During upgrade, the IncrementalUpgrade strategy creates an upgraded cluster and gradually
+	// scales it while migrating traffic to it using the Gateway API.
+ IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade"
// During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready
NewCluster RayServiceUpgradeType = "NewCluster"
// No new cluster will be created while the strategy is set to None
@@ -57,10 +60,27 @@ var DeploymentStatusEnum = struct {
UNHEALTHY: "UNHEALTHY",
}
+type IncrementalUpgradeOptions struct {
+ // The capacity of serve requests the upgraded cluster should scale to handle each interval.
+ // Defaults to 100%.
+ // +kubebuilder:default:=100
+ MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
+ // The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent.
+ StepSizePercent *int32 `json:"stepSizePercent"`
+ // The interval in seconds between transferring StepSize traffic from the old to new RayCluster.
+ IntervalSeconds *int32 `json:"intervalSeconds"`
+ // The name of the Gateway Class installed by the Kubernetes Cluster admin.
+ GatewayClassName string `json:"gatewayClassName"`
+}
+
type RayServiceUpgradeStrategy struct {
- // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`.
+ // Type represents the strategy used when upgrading the RayService. Currently supports
+ // `NewCluster`, `IncrementalUpgrade`, and `None`.
// +optional
Type *RayServiceUpgradeType `json:"type,omitempty"`
+ // IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
+ // RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions.
+ IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"`
}
// RayServiceSpec defines the desired state of RayService
@@ -130,6 +150,12 @@ type RayServiceStatus struct {
// +optional
Applications map[string]AppStatus `json:"applicationStatuses,omitempty"`
// +optional
+ TargetCapacity *int32 `json:"targetCapacity,omitempty"`
+ // +optional
+ TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
+ // +optional
+ LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
+ // +optional
RayClusterName string `json:"rayClusterName,omitempty"`
// +optional
RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"`
@@ -162,6 +188,8 @@ const (
RayServiceReady RayServiceConditionType = "Ready"
// UpgradeInProgress means the RayService is currently performing a zero-downtime upgrade.
UpgradeInProgress RayServiceConditionType = "UpgradeInProgress"
+ // RollbackInProgress means the RayService is currently rolling back an in-progress upgrade to the original cluster state.
+ RollbackInProgress RayServiceConditionType = "RollbackInProgress"
)
const (
@@ -171,6 +199,7 @@ const (
BothActivePendingClustersExist RayServiceConditionReason = "BothActivePendingClustersExist"
NoPendingCluster RayServiceConditionReason = "NoPendingCluster"
NoActiveCluster RayServiceConditionReason = "NoActiveCluster"
+ GoalClusterChanged RayServiceConditionReason = "GoalClusterChanged"
)
// +kubebuilder:object:root=true
@@ -184,8 +213,7 @@ const (
type RayService struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
-
- Spec RayServiceSpec `json:"spec,omitempty"`
+ Spec RayServiceSpec `json:"spec,omitempty"`
// +optional
Status RayServiceStatuses `json:"status,omitempty"`
}
diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
index b4cb5decf12..c9f5974f116 100644
--- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
+++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
@@ -213,6 +213,36 @@ func (in *HeadInfo) DeepCopy() *HeadInfo {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *IncrementalUpgradeOptions) DeepCopyInto(out *IncrementalUpgradeOptions) {
+ *out = *in
+ if in.MaxSurgePercent != nil {
+ in, out := &in.MaxSurgePercent, &out.MaxSurgePercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.StepSizePercent != nil {
+ in, out := &in.StepSizePercent, &out.StepSizePercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.IntervalSeconds != nil {
+ in, out := &in.IntervalSeconds, &out.IntervalSeconds
+ *out = new(int32)
+ **out = **in
+ }
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IncrementalUpgradeOptions.
+func (in *IncrementalUpgradeOptions) DeepCopy() *IncrementalUpgradeOptions {
+ if in == nil {
+ return nil
+ }
+ out := new(IncrementalUpgradeOptions)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RayCluster) DeepCopyInto(out *RayCluster) {
*out = *in
@@ -663,6 +693,20 @@ func (in *RayServiceStatus) DeepCopyInto(out *RayServiceStatus) {
(*out)[key] = *val.DeepCopy()
}
}
+ if in.TargetCapacity != nil {
+ in, out := &in.TargetCapacity, &out.TargetCapacity
+ *out = new(int32)
+ **out = **in
+ }
+ if in.TrafficRoutedPercent != nil {
+ in, out := &in.TrafficRoutedPercent, &out.TrafficRoutedPercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.LastTrafficMigratedTime != nil {
+ in, out := &in.LastTrafficMigratedTime, &out.LastTrafficMigratedTime
+ *out = (*in).DeepCopy()
+ }
in.RayClusterStatus.DeepCopyInto(&out.RayClusterStatus)
}
@@ -712,6 +756,11 @@ func (in *RayServiceUpgradeStrategy) DeepCopyInto(out *RayServiceUpgradeStrategy
*out = new(RayServiceUpgradeType)
**out = **in
}
+ if in.IncrementalUpgradeOptions != nil {
+ in, out := &in.IncrementalUpgradeOptions, &out.IncrementalUpgradeOptions
+ *out = new(IncrementalUpgradeOptions)
+ (*in).DeepCopyInto(*out)
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RayServiceUpgradeStrategy.
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
index a86457fac1a..41bda880d9a 100644
--- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml
+++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
@@ -8225,6 +8225,25 @@ spec:
type: integer
upgradeStrategy:
properties:
+ incrementalUpgradeOptions:
+ properties:
+ gatewayClassName:
+ type: string
+ intervalSeconds:
+ format: int32
+ type: integer
+ maxSurgePercent:
+ default: 100
+ format: int32
+ type: integer
+ stepSizePercent:
+ format: int32
+ type: integer
+ required:
+ - gatewayClassName
+ - intervalSeconds
+ - stepSizePercent
+ type: object
type:
type: string
type: object
@@ -8253,6 +8272,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8367,6 +8389,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
conditions:
items:
@@ -8436,6 +8464,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8550,6 +8581,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
serviceStatus:
type: string
diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml
index ba840f0c27f..9ea1db93190 100644
--- a/ray-operator/config/rbac/role.yaml
+++ b/ray-operator/config/rbac/role.yaml
@@ -107,6 +107,17 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - gateway.networking.k8s.io
+ resources:
+ - gateways
+ - httproutes
+ verbs:
+ - create
+ - get
+ - list
+ - update
+ - watch
- apiGroups:
- networking.k8s.io
resources:
diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go
index 63eefa94bc4..922a31d924f 100644
--- a/ray-operator/controllers/ray/common/association.go
+++ b/ray-operator/controllers/ray/common/association.go
@@ -203,3 +203,19 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp
},
}
}
+
+func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName {
+ gatewayName := utils.CheckGatewayName(fmt.Sprintf("%s-gateway", rayService.Name))
+ return types.NamespacedName{
+ Name: gatewayName,
+ Namespace: rayService.Namespace,
+ }
+}
+
+func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName {
+ httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", rayService.Name))
+ return types.NamespacedName{
+ Name: httpRouteName,
+ Namespace: rayService.Namespace,
+ }
+}
diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go
index 71cea97c005..7675a30b3bb 100644
--- a/ray-operator/controllers/ray/common/service.go
+++ b/ray-operator/controllers/ray/common/service.go
@@ -184,7 +184,10 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus
namespace := rayCluster.Namespace
crdType := utils.RayClusterCRD
if isRayService {
- name = rayService.Name
+ // For IncrementalUpgrade, the name is based on the unique RayCluster.
+ if !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) {
+ name = rayService.Name
+ }
namespace = rayService.Namespace
crdType = utils.RayServiceCRD
}
@@ -225,7 +228,7 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus
"otherwise, the Kubernetes service for Ray Serve will not be created.")
}
- if rayService.Spec.ServeService != nil {
+ if rayService.Spec.ServeService != nil && !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) {
// Use the provided "custom" ServeService.
// Deep copy the ServeService to avoid modifying the original object
serveService := rayService.Spec.ServeService.DeepCopy()
diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go
index 7e51c018fbf..a8b07cf6d82 100644
--- a/ray-operator/controllers/ray/rayservice_controller.go
+++ b/ray-operator/controllers/ray/rayservice_controller.go
@@ -6,6 +6,7 @@ import (
"fmt"
"math"
"os"
+ "reflect"
"strconv"
"strings"
"time"
@@ -21,6 +22,7 @@ import (
"k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/client-go/tools/record"
"k8s.io/utils/lru"
+ "k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -28,6 +30,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -90,6 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut
// +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch
// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update
// +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete
+// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update
+// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update
// +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update
// +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete
@@ -142,10 +147,42 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
}
+ // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects.
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // If an upgrade is in progress, check if rollback is necessary.
+ if activeRayClusterInstance != nil && pendingRayClusterInstance != nil {
+ if err := r.reconcileRollbackState(ctx, rayServiceInstance, activeRayClusterInstance, pendingRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ }
+
+ // Ensure per-cluster Serve service exists for the active and pending RayClusters.
+ if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ // Creates or updates a Gateway CR that points to the Serve services of
+ // the active and pending (if it exists) RayClusters. For incremental upgrades,
+ // the Gateway endpoint is used rather than the Serve service.
+ err = r.reconcileGateway(ctx, rayServiceInstance)
+ if err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err)
+ }
+ // Create or update the HTTPRoute attached to this RayService's Gateway.
+ err = r.reconcileHTTPRoute(ctx, rayServiceInstance)
+ if err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err)
+ }
+ }
+
// Reconcile serve applications for active and/or pending clusters
// 1. If there is a pending cluster, reconcile serve applications for the pending cluster.
// 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only.
// 3. If there is no pending cluster, reconcile serve applications for the active cluster.
+ // 4. During an IncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster based
+ // on total TargetCapacity.
var isActiveClusterReady, isPendingClusterReady bool = false, false
var activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus = nil, nil
if pendingRayClusterInstance != nil {
@@ -162,6 +199,11 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
}
+ } else if activeRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ logger.Info("Reconciling the Serve applications for active cluster during IncrementalUpgrade", "clusterName", activeRayClusterInstance.Name)
+ if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
}
// Reconcile K8s services and make sure it points to the correct RayCluster.
@@ -229,6 +271,27 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
rayServiceInstance.Status.ObservedGeneration = rayServiceInstance.ObjectMeta.Generation
+ if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) {
+ activeStatus := &rayServiceInstance.Status.ActiveServiceStatus
+ pendingStatus := &rayServiceInstance.Status.PendingServiceStatus
+
+ // A rollback is complete when the active cluster is back at 100% TargetCapacity and TrafficRoutedPercent,
+ // and the pending cluster is at 0% TargetCapacity and TrafficRoutedPercent.
+ if ptr.Deref(activeStatus.TargetCapacity, -1) == 100 &&
+ ptr.Deref(activeStatus.TrafficRoutedPercent, -1) == 100 &&
+ ptr.Deref(pendingStatus.TargetCapacity, -1) == 0 &&
+ ptr.Deref(pendingStatus.TrafficRoutedPercent, -1) == 0 {
+
+ logger.Info("Rollback to original cluster is complete. Cleaning up pending cluster from prior upgrade.")
+
+ // Clear the RayService pending service status to clean up the pending cluster.
+ rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{}
+ pendingCluster = nil
+
+ meta.RemoveStatusCondition(&rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress))
+ }
+ }
+
// Update RayClusterStatus in RayService status.
var activeClusterStatus, pendingClusterStatus rayv1.RayClusterStatus
if activeCluster != nil {
@@ -278,10 +341,30 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
}
logger.Info("Preparing a new pending RayCluster instance by setting RayClusterName",
"clusterName", rayServiceInstance.Status.PendingServiceStatus.RayClusterName)
+
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled
+ if rayServiceInstance.Status.ActiveServiceStatus.RayClusterName == "" {
+ // If no Active RayCluster exists - default to starting with 100% TargetCapacity.
+ if rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity == nil {
+ rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(100))
+ }
+ } else if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) {
+ // Pending RayCluster during an upgrade should start with 0% TargetCapacity.
+ if rayServiceInstance.Status.PendingServiceStatus.TargetCapacity == nil {
+ rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(0))
+ }
+ }
+ }
}
serveEndPoints := &corev1.Endpoints{}
- if err := r.Get(ctx, common.RayServiceServeServiceNamespacedName(rayServiceInstance), serveEndPoints); err != nil && !errors.IsNotFound(err) {
+ serveServiceName := common.RayServiceServeServiceNamespacedName(rayServiceInstance)
+ // For IncrementalUpgrade, the Serve service name is based on the RayCluster.
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeCluster != nil {
+ serveServiceName.Name = utils.GenerateServeServiceName(activeCluster.Name)
+ }
+ if err := r.Get(ctx, serveServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) {
return err
}
@@ -294,6 +377,21 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
if numServeEndpoints > math.MaxInt32 {
return errstd.New("numServeEndpoints exceeds math.MaxInt32")
}
+
+ // During an IncrementalUpgrade, the pending RayCluster is also serving.
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil {
+ pendingServeServiceName := common.RayClusterServeServiceNamespacedName(pendingCluster)
+ if err := r.Get(ctx, pendingServeServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) {
+ return err
+ }
+ for _, subset := range serveEndPoints.Subsets {
+ numServeEndpoints += len(subset.Addresses)
+ }
+ if numServeEndpoints > math.MaxInt32 {
+ return errstd.New("numServeEndpoints exceeds math.MaxInt32")
+ }
+ }
+
rayServiceInstance.Status.NumServeEndpoints = int32(numServeEndpoints) //nolint:gosec // This is a false positive from gosec. See https://github.com/securego/gosec/issues/1212 for more details.
calculateConditions(rayServiceInstance)
@@ -302,6 +400,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RayServiceReady)) {
rayServiceInstance.Status.ServiceStatus = rayv1.Running
}
+
return nil
}
@@ -392,7 +491,12 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
if upgradeStrategy != nil {
upgradeType := upgradeStrategy.Type
if upgradeType != nil {
- if *upgradeType != rayv1.NewCluster {
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+ if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade {
+ logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.IncrementalUpgrade))
+ return false
+ }
+ } else if *upgradeType != rayv1.NewCluster {
logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster.")
return false
}
@@ -407,6 +511,288 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
return true
}
+// createGateway builds the desired Gateway object for a RayService undergoing an
+// IncrementalUpgrade. The Gateway is named "<rayservice-name>-gateway" (sanitized via
+// utils.CheckGatewayName) and uses the GatewayClassName from the upgrade options.
+// Returns an error if IncrementalUpgradeOptions are not set on the RayService spec.
+func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) {
+	options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec)
+	if options == nil {
+		// Error strings are lowercase without trailing punctuation per Go convention.
+		return nil, errstd.New("missing RayService IncrementalUpgradeOptions during upgrade")
+	}
+
+	gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway")
+	// Define the desired Gateway object, including its listeners.
+	rayServiceGateway := &gwv1.Gateway{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      gatewayName,
+			Namespace: rayServiceInstance.Namespace,
+		},
+		Spec: gwv1.GatewaySpec{
+			GatewayClassName: gwv1.ObjectName(options.GatewayClassName),
+			Listeners:        utils.GetGatewayListenersForRayService(rayServiceInstance),
+		},
+	}
+
+	return rayServiceGateway, nil
+}
+
+// `reconcileGateway` reconciles a Gateway resource for a RayService. The possible cases are:
+// (1) Create a new Gateway instance. (2) Update the Gateway instance if RayService has updated. (3) Do nothing.
+func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
+ logger := ctrl.LoggerFrom(ctx)
+ var err error
+
+ // Construct desired Gateway object for RayService
+ desiredGateway, err := r.createGateway(rayServiceInstance)
+ if err != nil {
+ logger.Error(err, "Failed to build Gateway object for Rayservice")
+ return err
+ }
+ if desiredGateway == nil {
+ logger.Info("Skipping Gateway reconciliation: desired Gateway is nil")
+ return nil
+ }
+
+ // Check for existing RayService Gateway, create the desired Gateway if none is found
+ existingGateway := &gwv1.Gateway{}
+ if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), existingGateway); err != nil {
+ if errors.IsNotFound(err) {
+ // Set the ownership in order to do the garbage collection by k8s.
+ if err := ctrl.SetControllerReference(rayServiceInstance, desiredGateway, r.Scheme); err != nil {
+ return err
+ }
+ logger.Info("Creating a new Gateway instance", "Gateway Listeners", desiredGateway.Spec.Listeners)
+ if err := r.Create(ctx, desiredGateway); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name)
+ return nil
+ }
+ return err
+ }
+
+ // If Gateway already exists, check if update is needed to reach desired state
+ if !reflect.DeepEqual(existingGateway.Spec, desiredGateway.Spec) {
+ logger.Info("Updating existing Gateway", "name", existingGateway.Name)
+ existingGateway.Spec = desiredGateway.Spec
+ if err := r.Update(ctx, existingGateway); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name)
+ }
+
+ return nil
+}
+
+// reconcileTrafficRoutedPercent determines the traffic split between the active and pending clusters during an upgrade,
+// returning the weights for the old and new clusters respectively, or an error if misconfigured.
+// The two returned weights always sum to 100. Traffic is only moved when IntervalSeconds has
+// elapsed since the last migration, and never beyond the target cluster's TargetCapacity.
+func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, hasPendingCluster bool) (activeClusterWeight, pendingClusterWeight int32, err error) {
+	logger := ctrl.LoggerFrom(ctx)
+	activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus
+	pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus
+
+	// Default to 100% traffic on the active cluster.
+	activeClusterWeight = 100
+	pendingClusterWeight = 0
+
+	if hasPendingCluster {
+		// Zero-downtime upgrade in progress.
+		options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec)
+		if options == nil {
+			return 0, 0, errstd.New("IncrementalUpgradeOptions are not set during upgrade.")
+		}
+		// Both fields are optional pointers in the CRD; guard before dereferencing to avoid a
+		// nil-pointer panic in the reconciler.
+		if options.IntervalSeconds == nil || options.StepSizePercent == nil {
+			return 0, 0, errstd.New("IntervalSeconds and StepSizePercent must be set during an IncrementalUpgrade.")
+		}
+		stepSizePercent := *options.StepSizePercent
+
+		// Check that target_capacity has been updated before migrating traffic.
+		pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0)
+		pendingClusterTargetCapacity := ptr.Deref(pendingServiceStatus.TargetCapacity, 0)
+
+		activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100)
+		activeClusterTargetCapacity := ptr.Deref(activeServiceStatus.TargetCapacity, 100)
+
+		isRollbackInProgress := meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress))
+
+		if (pendingClusterWeight == pendingClusterTargetCapacity && !isRollbackInProgress) || (isRollbackInProgress && activeClusterWeight == activeClusterTargetCapacity) {
+			// Return without changing current traffic weights since the cluster being migrated to is at capacity.
+			return activeClusterWeight, pendingClusterWeight, nil
+		}
+
+		// If IntervalSeconds has passed since LastTrafficMigratedTime, migrate StepSizePercent traffic
+		// from the active RayCluster to the pending RayCluster (or the reverse during rollback).
+		intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second
+		lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime
+		if lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds {
+			if isRollbackInProgress {
+				// Gradually shift traffic from the pending to the active cluster.
+				logger.Info("Rollback in progress. Shifting traffic back to active cluster.", "stepSize", stepSizePercent)
+				// Cluster weight should never exceed current TargetCapacity and should sum to 100%.
+				proposedActiveWeight := activeClusterWeight + stepSizePercent
+				activeClusterWeight = min(100, proposedActiveWeight, activeClusterTargetCapacity)
+				pendingClusterWeight = 100 - activeClusterWeight
+			} else {
+				// Gradually shift traffic from the active to the pending cluster.
+				logger.Info("Upgrade in progress. Migrating traffic by StepSizePercent.", "stepSize", stepSizePercent)
+				proposedPendingWeight := pendingClusterWeight + stepSizePercent
+				pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity)
+				activeClusterWeight = 100 - pendingClusterWeight
+			}
+
+			// Record a single shared timestamp so both statuses agree on the migration time.
+			now := metav1.Time{Time: time.Now()}
+			pendingServiceStatus.LastTrafficMigratedTime = &now
+			activeServiceStatus.LastTrafficMigratedTime = &now
+		}
+	}
+
+	// Update the RayService status with the calculated traffic weights.
+	activeServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight)
+	pendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight)
+	logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight, "pendingClusterWeight", pendingClusterWeight)
+
+	return activeClusterWeight, pendingClusterWeight, nil
+}
+
+// createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with
+// weights based on TrafficRoutedPercent.
+//
+// It returns (nil, nil) when no active RayCluster exists yet, signaling the caller to skip
+// HTTPRoute reconciliation. The route attaches to the RayService's Gateway and targets the
+// per-cluster serve service(s) on port 8000, splitting traffic between the active and (if
+// present) pending clusters according to the reconciled weights.
+func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) {
+	logger := ctrl.LoggerFrom(ctx)
+
+	// Retrieve Gateway instance to attach this HTTPRoute to.
+	gatewayInstance := &gwv1.Gateway{}
+	if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil {
+		return nil, err
+	}
+
+	// Retrieve the active RayCluster
+	activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance))
+	if err != nil && !errors.IsNotFound(err) {
+		logger.Error(err, "Failed to retrieve active RayCluster")
+		return nil, err
+	}
+	if activeRayCluster == nil {
+		logger.Info("Active RayCluster not found, skipping HTTPRoute creation.")
+		return nil, nil
+	}
+
+	// Attempt to retrieve pending RayCluster. NotFound is expected outside of an upgrade;
+	// only other errors abort reconciliation.
+	pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance))
+	hasPendingCluster := (err == nil && pendingRayCluster != nil)
+	if err != nil && !errors.IsNotFound(err) {
+		logger.Info("Failed to retrieve pending RayCluster.")
+		return nil, err
+	}
+
+	// Compute (and, if the migration interval has elapsed, advance) the traffic split.
+	activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster)
+	if err != nil {
+		logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.")
+		return nil, err
+	}
+
+	activeClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name)
+
+	// Backend for the active cluster's serve service; weight is its share of traffic.
+	backendRefs := []gwv1.HTTPBackendRef{
+		{
+			BackendRef: gwv1.BackendRef{
+				BackendObjectReference: gwv1.BackendObjectReference{
+					Name:      gwv1.ObjectName(activeClusterServeSvcName),
+					Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+					Port:      ptr.To(gwv1.PortNumber(8000)),
+				},
+				Weight: ptr.To(activeClusterWeight),
+			},
+		},
+	}
+
+	// During an upgrade, also route the remaining share of traffic to the pending cluster.
+	if hasPendingCluster {
+		pendingClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name)
+
+		backendRefs = append(backendRefs, gwv1.HTTPBackendRef{
+			BackendRef: gwv1.BackendRef{
+				BackendObjectReference: gwv1.BackendObjectReference{
+					Name:      gwv1.ObjectName(pendingClusterServeSvcName),
+					Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+					Port:      ptr.To(gwv1.PortNumber(8000)),
+				},
+				Weight: ptr.To(pendingClusterWeight),
+			},
+		})
+	}
+
+	// Build the HTTPRoute: a single catch-all ("/" prefix) rule parented to the Gateway.
+	httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name))
+	desiredHTTPRoute := &gwv1.HTTPRoute{
+		ObjectMeta: metav1.ObjectMeta{Name: httpRouteName, Namespace: gatewayInstance.Namespace},
+		Spec: gwv1.HTTPRouteSpec{
+			CommonRouteSpec: gwv1.CommonRouteSpec{
+				ParentRefs: []gwv1.ParentReference{
+					{
+						Name:      gwv1.ObjectName(gatewayInstance.Name),
+						Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+					},
+				},
+			},
+			Rules: []gwv1.HTTPRouteRule{
+				{
+					Matches: []gwv1.HTTPRouteMatch{
+						{
+							Path: &gwv1.HTTPPathMatch{
+								Type:  ptr.To(gwv1.PathMatchPathPrefix),
+								Value: ptr.To("/"),
+							},
+						},
+					},
+					BackendRefs: backendRefs,
+				},
+			},
+		},
+	}
+
+	return desiredHTTPRoute, nil
+}
+
+// reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade.
+func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
+ logger := ctrl.LoggerFrom(ctx)
+ var err error
+
+ desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance)
+ if err != nil {
+ logger.Error(err, "Failed to build HTTPRoute for RayService upgrade")
+ return err
+ }
+ if desiredHTTPRoute == nil {
+ logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil")
+ return nil
+ }
+
+ // Check for existing HTTPRoute for RayService
+ existingHTTPRoute := &gwv1.HTTPRoute{}
+ if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), existingHTTPRoute); err != nil {
+ if errors.IsNotFound(err) {
+ // Set the ownership in order to do the garbage collection by k8s.
+ if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil {
+ return err
+ }
+ if err = r.Create(ctx, desiredHTTPRoute); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.FailedToCreateHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name)
+ return nil
+ }
+ return err
+ }
+
+ // If HTTPRoute already exists, check if update is needed
+ if !reflect.DeepEqual(existingHTTPRoute.Spec, desiredHTTPRoute.Spec) {
+ logger.Info("Updating existing HTTPRoute", "name", desiredHTTPRoute.Name)
+ existingHTTPRoute.Spec = desiredHTTPRoute.Spec
+ if err := r.Update(ctx, existingHTTPRoute); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name)
+ }
+
+ return nil
+}
+
// `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases:
// (1) Create a new pending cluster. (2) Update the active cluster. (3) Update the pending cluster. (4) Do nothing.
func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServiceInstance *rayv1.RayService) (*rayv1.RayCluster, *rayv1.RayCluster, error) {
@@ -767,6 +1153,193 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
return nil
}
+// checkIfNeedIncrementalUpgradeUpdate returns whether the controller should adjust the target_capacity
+// of the Serve config associated with a RayCluster during an IncrementalUpgrade, together with a
+// human-readable reason for the decision.
+func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) {
+	activeStatus := rayServiceInstance.Status.ActiveServiceStatus
+	pendingStatus := rayServiceInstance.Status.PendingServiceStatus
+
+	if activeStatus.RayClusterName == "" || pendingStatus.RayClusterName == "" {
+		return false, "Both active and pending RayCluster instances required for incremental upgrade."
+	}
+
+	// The Gateway and HTTPRoute must both exist and be ready before adjusting capacity.
+	gateway := &gwv1.Gateway{}
+	if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gateway); err != nil {
+		return false, fmt.Sprintf("Failed to retrieve Gateway for RayService: %v", err)
+	}
+	if !utils.IsGatewayReady(gateway) {
+		return false, "Gateway for RayService IncrementalUpgrade is not ready."
+	}
+
+	httpRoute := &gwv1.HTTPRoute{}
+	if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRoute); err != nil {
+		return false, fmt.Sprintf("Failed to retrieve HTTPRoute for RayService: %v", err)
+	}
+	if !utils.IsHTTPRouteReady(gateway, httpRoute) {
+		return false, "HTTPRoute for RayService IncrementalUpgrade is not ready."
+	}
+
+	// An update is needed whenever the observed IncrementalUpgrade status fields are missing.
+	if activeStatus.TargetCapacity == nil || activeStatus.TrafficRoutedPercent == nil {
+		return true, "Active RayServiceStatus missing TargetCapacity or TrafficRoutedPercent."
+	}
+	if pendingStatus.TargetCapacity == nil || pendingStatus.TrafficRoutedPercent == nil {
+		return true, "Pending RayServiceStatus missing TargetCapacity or TrafficRoutedPercent."
+	}
+
+	// Decide based on how far the upgrade has progressed.
+	switch {
+	case *pendingStatus.TargetCapacity < 100 || *pendingStatus.TrafficRoutedPercent < 100:
+		return true, "Pending RayCluster has not finished scaling up."
+	case *activeStatus.TargetCapacity == 0 && *pendingStatus.TargetCapacity == 100:
+		return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete."
+	default:
+		return true, "Active RayCluster TargetCapacity has not finished scaling down."
+	}
+}
+
+// applyServeTargetCapacity updates the target_capacity for a given RayCluster's Serve applications.
+// It reads the cached Serve config (falling back to the RayService spec), skips the dashboard call
+// when the cluster already has the goal target_capacity, and on success records the new value in
+// the matching RayServiceStatus and re-caches the Serve config.
+func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface, goalTargetCapacity int32) error {
+	logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name)
+
+	// Retrieve cached ServeConfig from last reconciliation for the cluster to update.
+	cachedConfig := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name)
+	if cachedConfig == "" {
+		cachedConfig = rayServiceInstance.Spec.ServeConfigV2
+	}
+
+	serveConfig := make(map[string]interface{})
+	if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil {
+		return err
+	}
+
+	// Check if the ServeConfig requires an update. JSON/YAML numbers decode as float64.
+	if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok {
+		if int32(currentTargetCapacity) == goalTargetCapacity {
+			logger.Info("target_capacity already updated on RayCluster", "target_capacity", currentTargetCapacity)
+			// No update required, return early
+			return nil
+		}
+	}
+
+	serveConfig["target_capacity"] = goalTargetCapacity
+	// Initialism naming: configJSON, not configJson.
+	configJSON, err := json.Marshal(serveConfig)
+	if err != nil {
+		return fmt.Errorf("failed to marshal serve config: %w", err)
+	}
+
+	logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity)
+	if err := rayDashboardClient.UpdateDeployments(ctx, configJSON); err != nil {
+		return fmt.Errorf("failed to update target_capacity for Serve applications: %w", err)
+	}
+
+	// Update the status fields and cache the new Serve config.
+	if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName {
+		rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+	} else if rayClusterInstance.Name == rayServiceInstance.Status.PendingServiceStatus.RayClusterName {
+		rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+	}
+	r.cacheServeConfig(rayServiceInstance, rayClusterInstance.Name)
+
+	return nil
+}
+
+// reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during
+// an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices.
+// During a rollback it inverts the upgrade direction: the active cluster scales back to 100% while
+// the pending cluster scales to 0%.
+func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error {
+	logger := ctrl.LoggerFrom(ctx)
+	logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name)
+
+	if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+		return nil
+	}
+
+	activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus
+	pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus
+
+	// Set initial TargetCapacity values if unset.
+	if activeRayServiceStatus.TargetCapacity == nil {
+		activeRayServiceStatus.TargetCapacity = ptr.To(int32(100))
+	}
+	if pendingRayServiceStatus.TargetCapacity == nil {
+		pendingRayServiceStatus.TargetCapacity = ptr.To(int32(0))
+	}
+
+	// Retrieve the current observed Status fields for IncrementalUpgrade.
+	activeTargetCapacity := ptr.Deref(activeRayServiceStatus.TargetCapacity, 100)
+	pendingTargetCapacity := ptr.Deref(pendingRayServiceStatus.TargetCapacity, 0)
+	pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0)
+
+	// Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by per step.
+	options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec)
+	if options == nil {
+		return errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade")
+	}
+	maxSurgePercent := ptr.Deref(options.MaxSurgePercent, 100)
+
+	if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress)) {
+		// Rollback the upgrade. The active RayCluster should be scaled back to 100% target_capacity,
+		// while the pending RayCluster is scaled to 0%. This is the inverse of the regular upgrade path.
+		activeTrafficRoutedPercent := ptr.Deref(activeRayServiceStatus.TrafficRoutedPercent, 0)
+		if activeTargetCapacity != activeTrafficRoutedPercent {
+			logger.Info("Traffic is rolling back to active cluster, deferring capacity update.", "ActiveTargetCapacity", activeTargetCapacity, "ActiveTrafficRoutedPercent", activeTrafficRoutedPercent)
+			return nil
+		}
+
+		if activeTargetCapacity+pendingTargetCapacity > 100 {
+			if rayClusterInstance.Name == pendingRayServiceStatus.RayClusterName {
+				goalTargetCapacity := max(0, pendingTargetCapacity-maxSurgePercent)
+				logger.Info("Rollback: Scaling down pending cluster `target_capacity`.", "goal", goalTargetCapacity)
+				return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity)
+			}
+		} else {
+			if rayClusterInstance.Name == activeRayServiceStatus.RayClusterName {
+				goalTargetCapacity := min(100, activeTargetCapacity+maxSurgePercent)
+				logger.Info("Rollback: Scaling up active cluster `target_capacity`.", "goal", goalTargetCapacity)
+				return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity)
+			}
+		}
+		// This RayCluster is not the one to adjust on this iteration. Return early so the
+		// regular upgrade path below cannot run while a rollback is in progress — falling
+		// through would move target_capacity in the wrong direction.
+		return nil
+	}
+
+	// Defer updating the target_capacity until traffic weights are updated.
+	if pendingTargetCapacity != pendingTrafficRoutedPercent {
+		logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent)
+		return nil
+	}
+
+	// There are two cases:
+	// 1. The total target_capacity is greater than 100. This means the pending RayCluster has
+	//    scaled up traffic and the active RayCluster can be scaled down by MaxSurgePercent.
+	// 2. The total target_capacity is equal to 100. This means the pending RayCluster can
+	//    increase its target_capacity by MaxSurgePercent.
+	// If the rayClusterInstance passed into this function is not the cluster to update based
+	// on the above conditions, we return without doing anything.
+	var clusterName string
+	var goalTargetCapacity int32
+	if activeTargetCapacity+pendingTargetCapacity > int32(100) {
+		// Scale down the Active RayCluster TargetCapacity on this iteration.
+		goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent)
+		clusterName = activeRayServiceStatus.RayClusterName
+		if clusterName != rayClusterInstance.Name {
+			return nil
+		}
+		activeRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+		logger.Info("Setting target_capacity for active Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity)
+	} else {
+		// Scale up the Pending RayCluster TargetCapacity on this iteration.
+		goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent)
+		clusterName = pendingRayServiceStatus.RayClusterName
+		if clusterName != rayClusterInstance.Name {
+			return nil
+		}
+		pendingRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+		logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity)
+	}
+
+	return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity)
+}
+
// `getAndCheckServeStatus` gets Serve applications' and deployments' statuses and check whether the
// Serve applications are ready to serve incoming traffic or not. It returns three values:
//
@@ -965,6 +1538,24 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
}
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
}
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) {
+ incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance)
+ logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason)
+ if incrementalUpgradeUpdate {
+ if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
+ return false, serveApplications, err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity),
+ "Updated target_capacity of serve applications to to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
+
+ // Don't switch to the pending RayCluster until IncrementalUpgrade is complete.
+ if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == rayClusterInstance.Name {
+ return false, serveApplications, nil
+ }
+ }
+ }
+
return isReady, serveApplications, nil
}
@@ -1041,3 +1632,67 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins
}
return utils.IsRunningAndReady(headPod), nil
}
+
+// reconcilePerClusterServeService reconciles a load-balancing serve service for a given RayCluster.
+// It is a no-op for a nil cluster, and only creates the service when one does not already exist.
+func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) error {
+	if rayClusterInstance == nil {
+		return nil
+	}
+
+	logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name)
+	logger.Info("Building per-cluster RayService")
+
+	// Build the serve service for the RayCluster associated with this RayService. During an
+	// incremental upgrade, this is also called for the pending RayCluster instance.
+	desiredSvc, err := common.BuildServeService(ctx, *rayServiceInstance, *rayClusterInstance, true)
+	if err != nil {
+		logger.Error(err, "Failed to build per-cluster serve service spec")
+		return err
+	}
+	// Owned by the RayCluster so k8s garbage-collects the service along with the cluster.
+	if err := ctrl.SetControllerReference(rayClusterInstance, desiredSvc, r.Scheme); err != nil {
+		return err
+	}
+
+	existingSvc := &corev1.Service{}
+	getErr := r.Get(ctx, client.ObjectKey{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc)
+	if errors.IsNotFound(getErr) {
+		logger.Info("Creating new per-cluster serve service for incremental upgrade.", "Service", desiredSvc.Name)
+		return r.Create(ctx, desiredSvc)
+	}
+	return getErr
+}
+
+// reconcileRollbackState determines whether to initiate a rollback by setting the RollbackInProgress condition.
+// It compares the hash of the goal RayCluster spec against the pending cluster's hash: a match
+// cancels any in-progress rollback and resumes the upgrade; a mismatch initiates a rollback.
+func (r *RayServiceReconciler) reconcileRollbackState(ctx context.Context, rayServiceInstance *rayv1.RayService, activeCluster, pendingCluster *rayv1.RayCluster) error {
+	logger := ctrl.LoggerFrom(ctx)
+
+	// Rollback is only meaningful while both clusters exist; guard against nil to avoid a
+	// panic on the Annotations access below.
+	if activeCluster == nil || pendingCluster == nil {
+		return nil
+	}
+
+	goalHash, err := generateHashWithoutReplicasAndWorkersToDelete(rayServiceInstance.Spec.RayClusterSpec)
+	if err != nil {
+		return fmt.Errorf("failed to generate hash for goal cluster spec: %w", err)
+	}
+
+	originalHash := activeCluster.Annotations[utils.HashWithoutReplicasAndWorkersToDeleteKey]
+	pendingHash := pendingCluster.Annotations[utils.HashWithoutReplicasAndWorkersToDeleteKey]
+
+	isRollbackInProgress := meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress))
+
+	// Case 1: The goal spec matches the pending cluster's spec. In this case, we should revert the rollback attempt
+	// and continue to upgrade as normal.
+	if goalHash == pendingHash {
+		if isRollbackInProgress {
+			logger.Info("Goal state matches pending cluster. Canceling rollback and resuming upgrade.")
+			meta.RemoveStatusCondition(&rayServiceInstance.Status.Conditions, string(rayv1.RollbackInProgress))
+		}
+		return nil
+	}
+
+	// Case 2: The goal spec differs from pending cluster's spec. Rollback to original cluster.
+	if !isRollbackInProgress {
+		logger.Info("Goal state has changed during upgrade. Initiating rollback to the original cluster.", "goalHash", goalHash, "originalHash", originalHash, "pendingHash", pendingHash)
+		setCondition(rayServiceInstance, rayv1.RollbackInProgress, metav1.ConditionTrue, rayv1.GoalClusterChanged, "Goal state changed, rolling back to original cluster.")
+	}
+
+	return nil
+}
diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
index 638af6b26fb..cfb165bcb3e 100644
--- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go
+++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -13,13 +13,16 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
+ "k8s.io/utils/lru"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -27,6 +30,7 @@ import (
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types"
"github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
"github.com/ray-project/kuberay/ray-operator/test/support"
)
@@ -1319,3 +1323,920 @@ func TestRayClusterDeletionDelaySeconds(t *testing.T) {
})
}
}
+
+// Helper function to create a RayService object undergoing an incremental upgrade.
+func makeIncrementalUpgradeRayService(
+ withOptions bool,
+ gatewayClassName string,
+ stepSizePercent *int32,
+ intervalSeconds *int32,
+ routedPercent *int32,
+ lastTrafficMigratedTime *metav1.Time,
+) *rayv1.RayService {
+ spec := rayv1.RayServiceSpec{
+ ServeService: &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "serve-service",
+ Namespace: "test-ns",
+ },
+ Spec: corev1.ServiceSpec{
+ Ports: []corev1.ServicePort{
+ {
+ Name: "http",
+ Port: 8000,
+ },
+ },
+ },
+ },
+ }
+ if withOptions {
+ spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
+ GatewayClassName: gatewayClassName,
+ StepSizePercent: stepSizePercent,
+ IntervalSeconds: intervalSeconds,
+ },
+ }
+ }
+
+ return &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "incremental-ray-service",
+ Namespace: "test-ns",
+ },
+ Spec: spec,
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active-ray-cluster",
+ RayClusterStatus: rayv1.RayClusterStatus{
+ Head: rayv1.HeadInfo{ServiceName: "active-service"},
+ },
+ TrafficRoutedPercent: routedPercent,
+ LastTrafficMigratedTime: lastTrafficMigratedTime,
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending-ray-cluster",
+ RayClusterStatus: rayv1.RayClusterStatus{
+ Head: rayv1.HeadInfo{ServiceName: "pending-service"},
+ },
+ TrafficRoutedPercent: ptr.To(int32(100) - *routedPercent),
+ LastTrafficMigratedTime: lastTrafficMigratedTime,
+ },
+ },
+ }
+}
+
+func TestCreateGateway(t *testing.T) {
+ serveService := &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "serve-service",
+ Namespace: "test-ns",
+ },
+ Spec: corev1.ServiceSpec{
+ Ports: []corev1.ServicePort{
+ {
+ Port: 8000,
+ },
+ },
+ },
+ }
+ newScheme := runtime.NewScheme()
+ _ = corev1.AddToScheme(newScheme)
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(serveService).Build()
+ reconciler := &RayServiceReconciler{
+ Client: fakeClient,
+ }
+
+ tests := []struct {
+ rayService *rayv1.RayService
+ name string
+ expectedGatewayName string
+ expectedClass string
+ expectedListeners int
+ expectErr bool
+ }{
+ {
+ name: "valid gateway creation",
+ expectedGatewayName: "incremental-ray-service-gateway",
+ rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(10)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}),
+ expectErr: false,
+ expectedClass: "gateway-class",
+ expectedListeners: 1,
+ },
+ {
+ name: "missing IncrementalUpgradeOptions",
+ rayService: makeIncrementalUpgradeRayService(false, "istio", ptr.To(int32(0)), ptr.To(int32(0)), ptr.To(int32(0)), &metav1.Time{Time: time.Now()}),
+ expectErr: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ gw, err := reconciler.createGateway(tt.rayService)
+ if tt.expectErr {
+ require.Error(t, err)
+ assert.Nil(t, gw)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, gw)
+ assert.Equal(t, tt.expectedGatewayName, gw.Name)
+ assert.Equal(t, tt.rayService.Namespace, gw.Namespace)
+ assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName)
+ assert.Len(t, gw.Spec.Listeners, tt.expectedListeners)
+ }
+ })
+ }
+}
+
+func TestCreateHTTPRoute(t *testing.T) {
+ ctx := context.TODO()
+ namespace := "test-ns"
+ stepSize := int32(10)
+ interval := int32(30)
+
+ activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-active", Namespace: namespace}}
+ pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-pending", Namespace: namespace}}
+ gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice-gateway", Namespace: namespace}}
+ activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}}
+ pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}}
+
+ baseRayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace},
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
+ StepSizePercent: &stepSize,
+ IntervalSeconds: &interval,
+ GatewayClassName: "istio",
+ },
+ },
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: activeCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(100)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: pendingCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(0)),
+ TargetCapacity: ptr.To(int32(30)),
+ },
+ },
+ }
+
+ tests := []struct {
+ name string
+ modifier func(rs *rayv1.RayService)
+ runtimeObjects []runtime.Object
+ expectError bool
+ expectedActiveWeight int32
+ expectedPendingWeight int32
+ }{
+ {
+ name: "Incremental upgrade, time since LastTrafficMigratedTime < IntervalSeconds.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()}
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ {
+ name: "Incremental upgrade, time since LastTrafficMigratedTime >= IntervalSeconds.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectedActiveWeight: 90,
+ expectedPendingWeight: 10,
+ },
+ {
+ name: "Incremental upgrade, TrafficRoutedPercent capped to pending TargetCapacity.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectedActiveWeight: 95,
+ expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached
+ },
+ {
+ name: "Rollback from upgrade, IntervalSeconds have passed since LastTrafficMigratedTime.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.Conditions = append(rs.Status.Conditions, metav1.Condition{Type: string(rayv1.RollbackInProgress), Status: metav1.ConditionTrue})
+
+ // mock a partially completed upgrade
+ rs.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(70))
+ rs.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(int32(30))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectedActiveWeight: 80,
+ expectedPendingWeight: 20,
+ },
+ {
+ name: "Rollback from upgrade, TrafficRoutedPercent capped to active TargetCapacity.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.Conditions = append(rs.Status.Conditions, metav1.Condition{Type: string(rayv1.RollbackInProgress), Status: metav1.ConditionTrue})
+ rs.Status.ActiveServiceStatus.TargetCapacity = ptr.To(int32(65))
+ rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(40))
+
+ // mock a partially completed upgrade
+ rs.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(60))
+ rs.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(int32(40))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectedActiveWeight: 65,
+			expectedPendingWeight: 35, // can only migrate 5% back to active until its TargetCapacity is reached
+ },
+ {
+ name: "Create HTTPRoute called with missing IncrementalUpgradeOptions.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ expectError: true,
+ },
+ {
+ name: "No on-going upgrade, pending cluster does not exist.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus = rayv1.RayServiceStatus{}
+ },
+ runtimeObjects: []runtime.Object{activeCluster, gateway, activeServeService},
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ rayService := baseRayService.DeepCopy()
+ tt.modifier(rayService)
+ tt.runtimeObjects = append(tt.runtimeObjects, rayService)
+
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(1),
+ }
+
+ route, err := reconciler.createHTTPRoute(ctx, rayService)
+
+ if tt.expectError {
+ require.Error(t, err)
+ assert.Nil(t, route)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, route)
+
+ assert.Equal(t, "httproute-test-rayservice-gateway", route.Name)
+ assert.Equal(t, "test-ns", route.Namespace)
+
+ require.Len(t, route.Spec.Rules, 1)
+ rule := route.Spec.Rules[0]
+
+ require.GreaterOrEqual(t, len(rule.BackendRefs), 1)
+ assert.Equal(t, gwv1.ObjectName(activeServeService.Name), rule.BackendRefs[0].BackendRef.Name)
+ assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight)
+
+ if len(rule.BackendRefs) > 1 {
+ assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name)
+ assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight)
+ } else {
+ assert.Equal(t, int32(0), tt.expectedPendingWeight)
+ }
+ }
+ })
+ }
+}
+
+func TestReconcileHTTPRoute(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+ stepSize := int32(10)
+ interval := int32(30)
+ gatewayName := "test-rayservice-gateway"
+ routeName := fmt.Sprintf("httproute-%s", gatewayName)
+
+ activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active-ray-cluster", Namespace: namespace}}
+ pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending-ray-cluster", Namespace: namespace}}
+ activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}}
+ pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}}
+ gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}}
+
+ baseRayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace},
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
+ StepSizePercent: &stepSize,
+ IntervalSeconds: &interval,
+ GatewayClassName: "istio",
+ },
+ },
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: activeCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(80)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: pendingCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(20)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ },
+ }
+
+ tests := []struct {
+ modifier func(rs *rayv1.RayService)
+ existingRoute *gwv1.HTTPRoute
+ name string
+ expectedActiveWeight int32
+ expectedPendingWeight int32
+ }{
+ {
+ name: "Create new HTTPRoute with weights.",
+ expectedActiveWeight: 70,
+ expectedPendingWeight: 30,
+ },
+ {
+ name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ },
+ existingRoute: &gwv1.HTTPRoute{
+ ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace},
+ Spec: gwv1.HTTPRouteSpec{},
+ },
+ expectedActiveWeight: 70,
+ expectedPendingWeight: 30,
+ },
+ {
+ name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()}
+ },
+ expectedActiveWeight: 80,
+ expectedPendingWeight: 20,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ rayService := baseRayService.DeepCopy()
+ if tt.modifier != nil {
+ tt.modifier(rayService)
+ }
+
+ runtimeObjects := []runtime.Object{rayService, activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}
+ if tt.existingRoute != nil {
+ runtimeObjects = append(runtimeObjects, tt.existingRoute)
+ }
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
+ reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)}
+
+ err := reconciler.reconcileHTTPRoute(ctx, rayService)
+ require.NoError(t, err)
+
+ reconciledRoute := &gwv1.HTTPRoute{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: routeName, Namespace: namespace}, reconciledRoute)
+ require.NoError(t, err, "Failed to fetch the reconciled HTTPRoute")
+
+ require.Len(t, reconciledRoute.Spec.Rules, 1)
+ rule := reconciledRoute.Spec.Rules[0]
+ require.Len(t, rule.BackendRefs, 2)
+
+ // Assert weights are set as expected.
+ assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight)
+ assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight)
+
+ // Assert ParentRef namespace is now correctly set.
+ parent := reconciledRoute.Spec.ParentRefs[0]
+ assert.Equal(t, gwv1.ObjectName(gatewayName), parent.Name)
+ assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace)
+ })
+ }
+}
+
+func TestReconcileGateway(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+
+ rayService := makeIncrementalUpgradeRayService(
+ true,
+ "gateway-class",
+ ptr.To(int32(20)),
+ ptr.To(int32(30)),
+ ptr.To(int32(80)),
+ ptr.To(metav1.Now()),
+ )
+ gateway := makeGateway(fmt.Sprintf("%s-gateway", rayService.Name), rayService.Namespace, true)
+
+ tests := []struct {
+ name string
+ expectedGatewayName string
+ expectedClass string
+ runtimeObjects []runtime.Object
+ expectedNumListeners int
+ }{
+ {
+ name: "creates new Gateway if missing",
+ runtimeObjects: []runtime.Object{rayService},
+ expectedGatewayName: "incremental-ray-service-gateway",
+ expectedClass: "gateway-class",
+ expectedNumListeners: 1,
+ },
+ {
+ name: "updates Gateway if spec differs",
+ runtimeObjects: []runtime.Object{rayService, gateway},
+ expectedGatewayName: "incremental-ray-service-gateway",
+ expectedClass: "gateway-class",
+ expectedNumListeners: 1,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ fakeClient := clientFake.NewClientBuilder().
+ WithScheme(newScheme).
+ WithRuntimeObjects(tt.runtimeObjects...).
+ Build()
+
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(10),
+ }
+
+ err := reconciler.reconcileGateway(ctx, rayService)
+ require.NoError(t, err)
+
+ reconciledGateway := &gwv1.Gateway{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedGatewayName, Namespace: namespace}, reconciledGateway)
+ require.NoError(t, err, "Failed to get the reconciled Gateway")
+
+ assert.Equal(t, tt.expectedGatewayName, reconciledGateway.Name)
+ assert.Equal(t, namespace, reconciledGateway.Namespace)
+ assert.Equal(t, gwv1.ObjectName(tt.expectedClass), reconciledGateway.Spec.GatewayClassName)
+ assert.Len(t, reconciledGateway.Spec.Listeners, tt.expectedNumListeners)
+ })
+ }
+}
+
+func TestReconcileServeTargetCapacity(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ tests := []struct {
+ name string
+ updatedCluster string
+ activeCapacity int32
+ pendingCapacity int32
+ activeRoutedPercent int32
+ pendingRoutedPercent int32
+ maxSurgePercent int32
+ expectedActiveCapacity int32
+ expectedPendingCapacity int32
+ isRollback bool
+ }{
+ {
+ name: "Scale up pending RayCluster when total TargetCapacity < 100",
+ pendingRoutedPercent: 10,
+ activeCapacity: 70,
+ pendingCapacity: 10,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 70,
+ expectedPendingCapacity: 30,
+ updatedCluster: "pending",
+ },
+ {
+ name: "Scale down active RayCluster when total TargetCapacity > 100",
+ pendingRoutedPercent: 30,
+ activeCapacity: 80,
+ pendingCapacity: 30,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 60,
+ expectedPendingCapacity: 30,
+ updatedCluster: "active",
+ },
+ {
+ name: "Rollback: Scale up active RayCluster when total TargetCapacity < 100",
+ isRollback: true,
+ activeRoutedPercent: 60,
+ pendingRoutedPercent: 40,
+ activeCapacity: 60,
+ pendingCapacity: 30,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 80,
+ expectedPendingCapacity: 30,
+ updatedCluster: "active",
+ },
+ {
+ name: "Rollback: Scale down pending RayCluster when total TargetCapacity > 100",
+ isRollback: true,
+ activeRoutedPercent: 90,
+ pendingRoutedPercent: 10,
+ activeCapacity: 90,
+ pendingCapacity: 20,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 90,
+ expectedPendingCapacity: 0,
+ updatedCluster: "pending",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ctx := context.TODO()
+ rayService := &rayv1.RayService{
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
+ MaxSurgePercent: ptr.To(tt.maxSurgePercent),
+ },
+ },
+ ServeConfigV2: `{"target_capacity": 0}`,
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(tt.activeCapacity),
+ TrafficRoutedPercent: ptr.To(tt.activeRoutedPercent),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(tt.pendingCapacity),
+ TrafficRoutedPercent: ptr.To(tt.pendingRoutedPercent),
+ },
+ },
+ }
+ if tt.isRollback {
+ rayService.Status.Conditions = []metav1.Condition{
+ {
+ Type: string(rayv1.RollbackInProgress),
+ Status: metav1.ConditionTrue,
+ },
+ }
+ }
+
+ var rayCluster *rayv1.RayCluster
+ if tt.updatedCluster == "active" {
+ rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active"}}
+ } else {
+ rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending"}}
+ }
+
+ fakeDashboard := &utils.FakeRayDashboardClient{}
+ reconciler := &RayServiceReconciler{
+ ServeConfigs: lru.New(10),
+ }
+
+ err := reconciler.reconcileServeTargetCapacity(ctx, rayService, rayCluster, fakeDashboard)
+ require.NoError(t, err)
+ require.NotEmpty(t, fakeDashboard.LastUpdatedConfig)
+
+ if tt.updatedCluster == "active" {
+ assert.Equal(t, tt.expectedActiveCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity)
+ assert.Equal(t, tt.pendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity)
+ expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedActiveCapacity)) + `}`
+ assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig))
+ } else {
+ assert.Equal(t, tt.expectedPendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity)
+ assert.Equal(t, tt.activeCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity)
+ expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedPendingCapacity)) + `}`
+ assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig))
+ }
+ })
+ }
+}
+
+// makeGateway is a helper function to return a Gateway object
+func makeGateway(name, namespace string, isReady bool) *gwv1.Gateway {
+ status := metav1.ConditionFalse
+ if isReady {
+ status = metav1.ConditionTrue
+ }
+ return &gwv1.Gateway{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Namespace: namespace,
+ },
+ Status: gwv1.GatewayStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.GatewayConditionAccepted),
+ Status: status,
+ },
+ {
+ Type: string(gwv1.GatewayConditionProgrammed),
+ Status: status,
+ },
+ },
+ },
+ }
+}
+
+// makeHTTPRoute is a helper function to return an HTTPRoute object
+func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute {
+ status := metav1.ConditionFalse
+ if isReady {
+ status = metav1.ConditionTrue
+ }
+ return &gwv1.HTTPRoute{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Namespace: namespace,
+ },
+ Status: gwv1.HTTPRouteStatus{
+ RouteStatus: gwv1.RouteStatus{
+ Parents: []gwv1.RouteParentStatus{
+ {
+ ParentRef: gwv1.ParentReference{
+ Name: gwv1.ObjectName("test-rayservice-gateway"),
+ Namespace: ptr.To(gwv1.Namespace(namespace)),
+ },
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.RouteConditionAccepted),
+ Status: status,
+ },
+ {
+ Type: string(gwv1.RouteConditionResolvedRefs),
+ Status: status,
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+}
+
+func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) {
+ rayServiceName := "test-rayservice"
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway")
+ httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName)
+ namespace := "test-ns"
+
+ tests := []struct {
+ name string
+ expectedReason string
+ runtimeObjects []runtime.Object
+ activeStatus rayv1.RayServiceStatus
+ pendingStatus rayv1.RayServiceStatus
+ expectedNeedsUpdate bool
+ }{
+ {
+ name: "Missing RayClusterNames",
+ expectedNeedsUpdate: false,
+ expectedReason: "Both active and pending RayCluster instances required for incremental upgrade.",
+ },
+ {
+ name: "Gateway not ready",
+ activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"},
+ pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"},
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, false), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "Gateway for RayService IncrementalUpgrade is not ready.",
+ },
+ {
+ name: "HTTPRoute not ready",
+ activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"},
+ pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"},
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, false),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "HTTPRoute for RayService IncrementalUpgrade is not ready.",
+ },
+ {
+ name: "Incremental upgrade is complete",
+ activeStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(int32(0)),
+ TrafficRoutedPercent: ptr.To(int32(0)),
+ },
+ pendingStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(int32(100)),
+ TrafficRoutedPercent: ptr.To(int32(100)),
+ },
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete.",
+ },
+ {
+ name: "Pending RayCluster is still incrementally scaling",
+ activeStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(int32(70)),
+ TrafficRoutedPercent: ptr.To(int32(70)),
+ },
+ pendingStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(int32(30)),
+ TrafficRoutedPercent: ptr.To(int32(30)),
+ },
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: true,
+ expectedReason: "Pending RayCluster has not finished scaling up.",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+ // Initialize RayService reconciler.
+ ctx := context.TODO()
+ r := RayServiceReconciler{
+ Client: fakeClient,
+ Recorder: &record.FakeRecorder{},
+ Scheme: scheme.Scheme,
+ }
+ rayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace},
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: tt.activeStatus,
+ PendingServiceStatus: tt.pendingStatus,
+ },
+ }
+ needsUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayService)
+ assert.Equal(t, tt.expectedNeedsUpdate, needsUpdate)
+ assert.Equal(t, tt.expectedReason, reason)
+ })
+ }
+}
+
+func TestReconcilePerClusterServeService(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+
+ // Minimal RayCluster with at least one container.
+ rayCluster := &rayv1.RayCluster{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-ray-cluster",
+ Namespace: namespace,
+ UID: "test-uid",
+ },
+ Spec: rayv1.RayClusterSpec{
+ HeadGroupSpec: rayv1.HeadGroupSpec{
+ Template: corev1.PodTemplateSpec{
+ Spec: corev1.PodSpec{
+ Containers: []corev1.Container{
+ {Name: "ray-head"},
+ },
+ },
+ },
+ },
+ },
+ }
+ rayService := makeIncrementalUpgradeRayService(
+ true,
+ "istio",
+ ptr.To(int32(20)),
+ ptr.To(int32(30)),
+ ptr.To(int32(80)),
+ ptr.To(metav1.Now()),
+ )
+
+ // The expected pending RayCluster serve service.
+ expectedServeSvcName := utils.GenerateServeServiceName(rayCluster.Name)
+ expectedServeService := &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: expectedServeSvcName,
+ Namespace: namespace,
+ },
+ Spec: corev1.ServiceSpec{
+ Selector: map[string]string{
+ utils.RayClusterLabelKey: rayCluster.Name,
+ utils.RayClusterServingServiceLabelKey: "true",
+ },
+ },
+ }
+
+ tests := []struct {
+ name string
+ rayCluster *rayv1.RayCluster
+ runtimeObjects []runtime.Object
+ expectServiceCreated bool
+ expectError bool
+ }{
+ {
+ name: "RayCluster is nil, no-op.",
+ rayCluster: nil,
+ runtimeObjects: []runtime.Object{rayService},
+ expectServiceCreated: false,
+ expectError: false,
+ },
+ {
+ name: "Create a new Serve service for the RayCluster.",
+ rayCluster: rayCluster,
+ runtimeObjects: []runtime.Object{rayService, rayCluster},
+ expectServiceCreated: true,
+ expectError: false,
+ },
+ {
+ name: "Pending RayCluster serve service already exists, no-op.",
+ rayCluster: rayCluster,
+ runtimeObjects: []runtime.Object{rayService, rayCluster, expectedServeService},
+ expectServiceCreated: false,
+ expectError: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(1),
+ }
+
+ err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster)
+
+ if tt.expectError {
+ require.Error(t, err)
+ return
+ }
+ require.NoError(t, err)
+
+ reconciledSvc := &corev1.Service{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, reconciledSvc)
+
+ // No-op case, no service should be created when RayCluster is nil.
+ if tt.rayCluster == nil {
+ assert.True(t, errors.IsNotFound(err))
+ return
+ }
+
+ // Otherwise, a valid serve service should be created for the RayCluster.
+ require.NoError(t, err, "The Serve service should exist in the client")
+
+ // Validate the expected Serve service exists for the RayCluster.
+ require.NotNil(t, reconciledSvc)
+ assert.Equal(t, expectedServeSvcName, reconciledSvc.Name)
+
+ createdSvc := &corev1.Service{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, createdSvc)
+ require.NoError(t, err, "The Serve service should exist in the client")
+
+ // Verify the Serve service selector.
+ expectedSelector := map[string]string{
+ utils.RayClusterLabelKey: rayCluster.Name,
+ utils.RayClusterServingServiceLabelKey: "true",
+ }
+ assert.Equal(t, expectedSelector, createdSvc.Spec.Selector)
+
+ // Validate owner ref is set to the expected RayCluster.
+ if tt.expectServiceCreated {
+ require.Len(t, createdSvc.OwnerReferences, 1)
+ ownerRef := createdSvc.OwnerReferences[0]
+ assert.Equal(t, rayCluster.Name, ownerRef.Name)
+ assert.Equal(t, "RayCluster", ownerRef.Kind)
+ assert.Equal(t, rayCluster.UID, ownerRef.UID)
+ }
+ })
+ }
+}
diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go
index 2c2ba0fe616..4d04e9f5e3d 100644
--- a/ray-operator/controllers/ray/utils/consistency.go
+++ b/ray-operator/controllers/ray/utils/consistency.go
@@ -4,6 +4,7 @@ import (
"reflect"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
// Checks whether the old and new RayClusterStatus are inconsistent by comparing different fields. If the only
@@ -74,6 +75,15 @@ func inconsistentRayServiceStatus(oldStatus rayv1.RayServiceStatus, newStatus ra
}
}
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+ // Also check for changes in IncrementalUpgrade related Status fields.
+ if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent ||
+ oldStatus.TargetCapacity != newStatus.TargetCapacity ||
+ oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime {
+ return true
+ }
+ }
+
return false
}
diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go
index fca1e4f8a00..025ae9968ef 100644
--- a/ray-operator/controllers/ray/utils/constant.go
+++ b/ray-operator/controllers/ray/utils/constant.go
@@ -317,9 +317,17 @@ const (
InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec"
InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata"
UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel"
+ UpdatedGateway K8sEventType = "UpdatedGateway"
+ UpdatedHTTPRoute K8sEventType = "UpdatedHTTPRoute"
UpdatedServeApplications K8sEventType = "UpdatedServeApplications"
+ UpdatedServeTargetCapacity K8sEventType = "UpdatedServeTargetCapacity"
FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel"
FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications"
+ FailedToUpdateTargetCapacity K8sEventType = "FailedToUpdateTargetCapacity"
+ FailedToCreateGateway K8sEventType = "FailedToCreateGateway"
+ FailedToUpdateGateway K8sEventType = "FailedToUpdateGateway"
+ FailedToCreateHTTPRoute K8sEventType = "FailedToCreateHTTPRoute"
+ FailedToUpdateHTTPRoute K8sEventType = "FailedToUpdateHTTPRoute"
// Generic Pod event list
DeletedPod K8sEventType = "DeletedPod"
diff --git a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
index 21a3fdb91be..1bf0588c403 100644
--- a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
+++ b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
@@ -12,9 +12,10 @@ import (
)
type FakeRayDashboardClient struct {
- multiAppStatuses map[string]*utiltypes.ServeApplicationStatus
- GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)]
- serveDetails utiltypes.ServeDetails
+ multiAppStatuses map[string]*utiltypes.ServeApplicationStatus
+ GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)]
+ serveDetails utiltypes.ServeDetails
+ LastUpdatedConfig []byte
}
var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(nil)
@@ -22,7 +23,8 @@ var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(ni
func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string) {
}
-func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, _ []byte) error {
+func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error {
+ r.LastUpdatedConfig = configJson
fmt.Print("UpdateDeployments fake succeeds.")
return nil
}
diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go
index cf6b9066323..fff9b0cf707 100644
--- a/ray-operator/controllers/ray/utils/util.go
+++ b/ray-operator/controllers/ray/utils/util.go
@@ -24,9 +24,11 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
const (
@@ -209,6 +211,40 @@ func CheckName(s string) string {
return s
}
+func CheckGatewayName(name string) string {
+ const maxLength = 63
+
+ if len(name) > maxLength {
+ offset := len(name) - maxLength
+		fmt.Printf("Gateway name too long (len = %d), shortening by offset = %d\n", len(name), offset)
+ name = name[offset:]
+ }
+
+ // Cannot start with a digit or punctuation
+ if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) {
+ name = "g" + name[1:]
+ }
+
+ return name
+}
+
+func CheckHTTPRouteName(name string) string {
+ const maxLength = 63
+
+ if len(name) > maxLength {
+ offset := len(name) - maxLength
+		fmt.Printf("HTTPRoute name too long (len = %d), shortening by offset = %d\n", len(name), offset)
+ name = name[offset:]
+ }
+
+ // Cannot start with a digit or punctuation
+ if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) {
+ name = "h" + name[1:]
+ }
+
+ return name
+}
+
// TrimJobName uses CheckLabel to trim Kubernetes job to constrains
func TrimJobName(jobName string) string {
return CheckLabel(jobName)
@@ -675,6 +711,89 @@ func GetRayClusterNameFromService(svc *corev1.Service) string {
return svc.Spec.Selector[RayClusterLabelKey]
}
+func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool {
+ if gatewayInstance == nil {
+ return false
+ }
+ hasAccepted := false
+ hasProgrammed := false
+
+ for _, condition := range gatewayInstance.Status.Conditions {
+ if condition.Type == string(gwv1.GatewayConditionAccepted) && condition.Status == metav1.ConditionTrue {
+ hasAccepted = true
+ }
+ if condition.Type == string(gwv1.GatewayConditionProgrammed) && condition.Status == metav1.ConditionTrue {
+ hasProgrammed = true
+ }
+ }
+
+ // If no ready condition found return false
+ return hasAccepted && hasProgrammed
+}
+
+// IsHTTPRouteReady returns whether the HTTPRoute associated with a given Gateway has a ready condition
+func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool {
+ if httpRouteInstance == nil {
+ return false
+ }
+ for _, parent := range httpRouteInstance.Status.Parents {
+ if parent.ParentRef.Name != gwv1.ObjectName(gatewayInstance.Name) {
+ continue
+ }
+ if parent.ParentRef.Namespace != nil && *parent.ParentRef.Namespace != gwv1.Namespace(gatewayInstance.Namespace) {
+ continue
+ }
+ hasAccepted := false
+ hasResolved := false
+
+ for _, condition := range parent.Conditions {
+ switch gwv1.RouteConditionType(condition.Type) {
+ case gwv1.RouteConditionAccepted:
+ if condition.Status == metav1.ConditionTrue {
+ hasAccepted = true
+ }
+ case gwv1.RouteConditionResolvedRefs:
+ if condition.Status == metav1.ConditionTrue {
+ hasResolved = true
+ }
+ }
+ }
+ if hasAccepted && hasResolved {
+ return true
+ }
+ }
+ return false
+}
+
+func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool {
+ if !features.Enabled(features.RayServiceIncrementalUpgrade) {
+ return false
+ }
+	return spec != nil && spec.UpgradeStrategy != nil && spec.UpgradeStrategy.Type != nil &&
+		*spec.UpgradeStrategy.Type == rayv1.IncrementalUpgrade
+}
+
+func GetRayServiceIncrementalUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.IncrementalUpgradeOptions {
+ if spec != nil && spec.UpgradeStrategy != nil {
+ return spec.UpgradeStrategy.IncrementalUpgradeOptions
+ }
+ return nil
+}
+
+// GetGatewayListenersForRayService returns the Gateway Listeners to configure for a RayService.
+func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener {
+ listeners := make([]gwv1.Listener, 0, 1)
+ listenerName := fmt.Sprintf("%s-listener", rayServiceInstance.Name)
+ listener := gwv1.Listener{
+ Name: gwv1.SectionName(listenerName),
+ Protocol: gwv1.HTTPProtocolType, // only support HTTP
+ Port: gwv1.PortNumber(int32(80)),
+ }
+ listeners = append(listeners, listener)
+
+ return listeners
+}
+
// Check where we are running. We are trying to distinguish here whether
// this is vanilla kubernetes cluster or Openshift
func GetClusterType() bool {
diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go
index 851e37af3ea..2d87c12ac46 100644
--- a/ray-operator/controllers/ray/utils/util_test.go
+++ b/ray-operator/controllers/ray/utils/util_test.go
@@ -12,9 +12,11 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
func TestGetClusterDomainName(t *testing.T) {
@@ -1248,6 +1250,235 @@ func TestCalculateResources(t *testing.T) {
}
}
+// helper function to return a Gateway object with GatewayStatus Conditions for testing.
+func makeGatewayWithCondition(accepted bool, programmed bool) *gwv1.Gateway {
+ var conditions []metav1.Condition
+
+ if accepted {
+ conditions = append(conditions, metav1.Condition{
+ Type: string(gwv1.GatewayConditionAccepted),
+ Status: metav1.ConditionTrue,
+ })
+ }
+
+ if programmed {
+ conditions = append(conditions, metav1.Condition{
+ Type: string(gwv1.GatewayConditionProgrammed),
+ Status: metav1.ConditionTrue,
+ })
+ }
+
+ return &gwv1.Gateway{
+ Status: gwv1.GatewayStatus{
+ Conditions: conditions,
+ },
+ }
+}
+
+func TestIsGatewayReady(t *testing.T) {
+ tests := []struct {
+ gateway *gwv1.Gateway
+ name string
+ expected bool
+ }{
+ {
+ name: "missing Gateway instance",
+ gateway: nil,
+ expected: false,
+ },
+ {
+ name: "Gateway created with Programmed condition only",
+ gateway: makeGatewayWithCondition(false, true),
+ expected: false,
+ },
+ {
+ name: "Gateway created with Accepted condition only",
+ gateway: makeGatewayWithCondition(true, false),
+ expected: false,
+ },
+ {
+ name: "Gateway created with both Accepted and Programmed conditions",
+ gateway: makeGatewayWithCondition(true, true),
+ expected: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.expected, IsGatewayReady(tt.gateway))
+ })
+ }
+}
+
+// helper function to return a HTTPRoute with HTTPRouteStatus for testing
+func makeHTTPRouteWithParentRef(
+ parentRefName string,
+ namespace string,
+ accepted bool,
+ resolvedRefs bool,
+) *gwv1.HTTPRoute {
+ var acceptedStatus, resolvedRefsStatus metav1.ConditionStatus
+ if accepted {
+ acceptedStatus = metav1.ConditionTrue
+ } else {
+ acceptedStatus = metav1.ConditionFalse
+ }
+ if resolvedRefs {
+ resolvedRefsStatus = metav1.ConditionTrue
+ } else {
+ resolvedRefsStatus = metav1.ConditionFalse
+ }
+
+ return &gwv1.HTTPRoute{
+ Status: gwv1.HTTPRouteStatus{
+ RouteStatus: gwv1.RouteStatus{
+ Parents: []gwv1.RouteParentStatus{
+ {
+ ParentRef: gwv1.ParentReference{
+ Name: gwv1.ObjectName(parentRefName),
+ Namespace: ptr.To(gwv1.Namespace(namespace)),
+ },
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.RouteConditionAccepted),
+ Status: acceptedStatus,
+ },
+ {
+ Type: string(gwv1.RouteConditionResolvedRefs),
+ Status: resolvedRefsStatus,
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+}
+
+func TestIsHTTPRouteReady(t *testing.T) {
+ gateway := &gwv1.Gateway{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-gateway", Namespace: "test-ns"},
+ }
+
+ tests := []struct {
+ httpRoute *gwv1.HTTPRoute
+ name string
+ expected bool
+ }{
+ {
+ name: "missing HTTPRoute",
+ httpRoute: nil,
+ expected: false,
+ },
+ {
+ name: "ParentRef does not match",
+ httpRoute: makeHTTPRouteWithParentRef("not-a-match", "other-test-ns", true, true),
+ expected: false,
+ },
+ {
+ name: "matching ParentRef with Accepted condition but without ResolvedRefs",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, false),
+ expected: false,
+ },
+ {
+ name: "matching ParentRef with ResolvedRefs but without Accepted",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", false, true),
+ expected: false,
+ },
+ {
+ name: "ready HTTPRoute with all required conditions",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, true),
+ expected: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.expected, IsHTTPRouteReady(gateway, tt.httpRoute))
+ })
+ }
+}
+
+func TestIsIncrementalUpgradeEnabled(t *testing.T) {
+ tests := []struct {
+ spec *rayv1.RayServiceSpec
+ name string
+ featureEnabled bool
+ expected bool
+ }{
+ {
+ name: "missing UpgradeStrategy Type",
+ spec: &rayv1.RayServiceSpec{},
+ featureEnabled: true,
+ expected: false,
+ },
+ {
+ name: "UpgradeStrategy Type is IncrementalUpgrade but feature disabled",
+ spec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ },
+ },
+ featureEnabled: false,
+ expected: false,
+ },
+ {
+ name: "UpgradeStrategy Type is IncrementalUpgrade and feature enabled",
+ spec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ },
+ },
+ featureEnabled: true,
+ expected: true,
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, tc.featureEnabled)
+ assert.Equal(t, tc.expected, IsIncrementalUpgradeEnabled(tc.spec))
+ })
+ }
+}
+
+func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) {
+ upgradeOptions := &rayv1.IncrementalUpgradeOptions{GatewayClassName: "gateway-class"}
+
+ tests := []struct {
+ rayServiceSpec *rayv1.RayServiceSpec
+ expectedOptions *rayv1.IncrementalUpgradeOptions
+ name string
+ }{
+ {
+ name: "RayServiceSpec is nil, return nil IncrementalUpgradeOptions",
+ rayServiceSpec: nil,
+ expectedOptions: nil,
+ },
+ {
+ name: "UpgradeStrategy is nil, return nil IncrementalUpgradeOptions",
+ rayServiceSpec: &rayv1.RayServiceSpec{},
+ expectedOptions: nil,
+ },
+ {
+ name: "Valid IncrementalUpgradeOptions",
+ rayServiceSpec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ IncrementalUpgradeOptions: upgradeOptions,
+ },
+ },
+ expectedOptions: upgradeOptions,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ actualOptions := GetRayServiceIncrementalUpgradeOptions(tt.rayServiceSpec)
+ assert.Equal(t, tt.expectedOptions, actualOptions)
+ })
+ }
+}
+
func TestGetContainerCommand(t *testing.T) {
tests := []struct {
name string
diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go
index 74d2b4fe0e6..a9debeac7a9 100644
--- a/ray-operator/controllers/ray/utils/validation.go
+++ b/ray-operator/controllers/ray/utils/validation.go
@@ -286,12 +286,13 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error {
return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set")
}
- // only NewCluster and None are valid upgradeType
+ // only IncrementalUpgrade, NewCluster, and None are valid upgradeType
if rayService.Spec.UpgradeStrategy != nil &&
rayService.Spec.UpgradeStrategy.Type != nil &&
*rayService.Spec.UpgradeStrategy.Type != rayv1.None &&
- *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster {
- return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewCluster, rayv1.None)
+ *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster &&
+ *rayService.Spec.UpgradeStrategy.Type != rayv1.IncrementalUpgrade {
+ return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.IncrementalUpgrade, rayv1.NewCluster, rayv1.None)
}
if rayService.Spec.RayClusterDeletionDelaySeconds != nil &&
@@ -299,5 +300,40 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error {
return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds)
}
+ // If type is IncrementalUpgrade, validate the IncrementalUpgradeOptions
+ if IsIncrementalUpgradeEnabled(&rayService.Spec) {
+ return ValidateIncrementalUpgradeOptions(rayService)
+ }
+
+ return nil
+}
+
+func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error {
+ if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) {
+ return fmt.Errorf("Ray Autoscaler is required for IncrementalUpgrade")
+ }
+
+ options := rayService.Spec.UpgradeStrategy.IncrementalUpgradeOptions
+ if options == nil {
+ return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade")
+ }
+
+	// MaxSurgePercent defaults to 100% via API defaulting; validate only when explicitly set to avoid a nil dereference.
+	if options.MaxSurgePercent != nil && (*options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100) {
+ return fmt.Errorf("maxSurgePercent must be between 0 and 100")
+ }
+
+ if options.StepSizePercent == nil || *options.StepSizePercent < 0 || *options.StepSizePercent > 100 {
+ return fmt.Errorf("stepSizePercent must be between 0 and 100")
+ }
+
+ if options.IntervalSeconds == nil || *options.IntervalSeconds <= 0 {
+ return fmt.Errorf("intervalSeconds must be greater than 0")
+ }
+
+ if options.GatewayClassName == "" {
+ return fmt.Errorf("gatewayClassName is required for IncrementalUpgrade")
+ }
+
return nil
}
diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go
index dc464424f40..7debd97e6d2 100644
--- a/ray-operator/controllers/ray/utils/validation_test.go
+++ b/ray-operator/controllers/ray/utils/validation_test.go
@@ -1229,3 +1229,112 @@ func createBasicRayClusterSpec() *rayv1.RayClusterSpec {
},
}
}
+
+func TestValidateIncrementalUpgradeOptions(t *testing.T) {
+ tests := []struct {
+ maxSurgePercent *int32
+ stepSizePercent *int32
+ intervalSeconds *int32
+ name string
+ gatewayClassName string
+ spec rayv1.RayServiceSpec
+ enableAutoscaling bool
+ expectError bool
+ }{
+ {
+ name: "valid config",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: false,
+ },
+ {
+ name: "missing autoscaler",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: false,
+ expectError: true,
+ },
+ {
+ name: "missing options",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "invalid MaxSurgePercent",
+ maxSurgePercent: ptr.To(int32(200)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "missing StepSizePercent",
+ maxSurgePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "invalid IntervalSeconds",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(0)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "missing GatewayClassName",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ var upgradeStrategy *rayv1.RayServiceUpgradeStrategy
+ if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" {
+ upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
+ MaxSurgePercent: tt.maxSurgePercent,
+ StepSizePercent: tt.stepSizePercent,
+ IntervalSeconds: tt.intervalSeconds,
+ GatewayClassName: tt.gatewayClassName,
+ },
+ }
+ } else if tt.expectError {
+ upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.IncrementalUpgrade),
+ }
+ }
+
+ rayClusterSpec := *createBasicRayClusterSpec()
+ rayClusterSpec.EnableInTreeAutoscaling = ptr.To(tt.enableAutoscaling)
+
+ rayService := &rayv1.RayService{
+ Spec: rayv1.RayServiceSpec{
+ RayClusterSpec: rayClusterSpec,
+ UpgradeStrategy: upgradeStrategy,
+ },
+ }
+
+ err := ValidateIncrementalUpgradeOptions(rayService)
+ if tt.expectError {
+ require.Error(t, err, tt.name)
+ } else {
+ require.NoError(t, err, tt.name)
+ }
+ })
+ }
+}
diff --git a/ray-operator/go.mod b/ray-operator/go.mod
index 94d155da29f..78f3870ae24 100644
--- a/ray-operator/go.mod
+++ b/ray-operator/go.mod
@@ -4,22 +4,21 @@ go 1.24.0
require (
github.com/Masterminds/semver/v3 v3.3.1
+ github.com/coder/websocket v1.8.13
github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0
- github.com/google/go-cmp v0.7.0
github.com/jarcoal/httpmock v1.4.0
github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7
github.com/orcaman/concurrent-map/v2 v2.0.1
- github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.22.0
+ github.com/spf13/pflag v1.0.6
github.com/stretchr/testify v1.10.0
go.uber.org/mock v0.5.2
go.uber.org/zap v1.27.0
gopkg.in/natefinch/lumberjack.v2 v2.2.1
k8s.io/api v0.33.1
- k8s.io/apiextensions-apiserver v0.33.1
k8s.io/apimachinery v0.33.1
k8s.io/apiserver v0.33.1
k8s.io/client-go v0.33.1
@@ -28,6 +27,7 @@ require (
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979
sigs.k8s.io/controller-runtime v0.21.0
+ sigs.k8s.io/gateway-api v1.3.0
sigs.k8s.io/scheduler-plugins v0.31.8
sigs.k8s.io/structured-merge-diff/v4 v4.7.0
sigs.k8s.io/yaml v1.4.0
@@ -38,19 +38,19 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
- github.com/coder/websocket v1.8.13 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
- github.com/emicklei/go-restful/v3 v3.11.0 // indirect
+ github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
- github.com/go-openapi/jsonreference v0.20.2 // indirect
+ github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
+ github.com/google/go-cmp v0.7.0 // indirect
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
@@ -62,11 +62,11 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
+ github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
- github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/otel v1.33.0 // indirect
@@ -74,19 +74,20 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/mod v0.24.0 // indirect
- golang.org/x/net v0.38.0 // indirect
+ golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
- golang.org/x/sync v0.12.0 // indirect
+ golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
- golang.org/x/term v0.30.0 // indirect
- golang.org/x/text v0.23.0 // indirect
+ golang.org/x/term v0.31.0 // indirect
+ golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.9.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
- google.golang.org/protobuf v1.36.5 // indirect
+ google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
+ k8s.io/apiextensions-apiserver v0.33.1 // indirect
k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
diff --git a/ray-operator/go.sum b/ray-operator/go.sum
index 6d6e0b27493..2d1825ab836 100644
--- a/ray-operator/go.sum
+++ b/ray-operator/go.sum
@@ -10,13 +10,12 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coder/websocket v1.8.13 h1:f3QZdXy7uGVz+4uCJy2nTZyM0yTBj8yANEHhqlXZ9FE=
github.com/coder/websocket v1.8.13/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs=
-github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
-github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
+github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk=
+github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U=
github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
@@ -29,12 +28,10 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
-github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
-github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
-github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
-github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
+github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
@@ -67,11 +64,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
-github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
@@ -116,17 +110,12 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
-github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
-github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
+github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
@@ -158,26 +147,26 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
+golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
+golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
+golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -192,8 +181,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
-google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM=
-google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
+google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
+google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
@@ -203,7 +192,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw=
@@ -230,6 +218,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
+sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M=
+sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
diff --git a/ray-operator/main.go b/ray-operator/main.go
index a10c27ee367..1022f8e7577 100644
--- a/ray-operator/main.go
+++ b/ray-operator/main.go
@@ -27,6 +27,7 @@ import (
k8szap "sigs.k8s.io/controller-runtime/pkg/log/zap"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
@@ -191,6 +192,10 @@ func main() {
}
features.LogFeatureGates(setupLog)
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+ utilruntime.Must(gwv1.AddToScheme(scheme))
+ }
+
// Manager options
options := ctrl.Options{
Cache: cache.Options{
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go
new file mode 100644
index 00000000000..a736a964cdb
--- /dev/null
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go
@@ -0,0 +1,50 @@
+// Code generated by applyconfiguration-gen. DO NOT EDIT.
+
+package v1
+
+// IncrementalUpgradeOptionsApplyConfiguration represents a declarative configuration of the IncrementalUpgradeOptions type for use
+// with apply.
+type IncrementalUpgradeOptionsApplyConfiguration struct {
+ MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
+ StepSizePercent *int32 `json:"stepSizePercent,omitempty"`
+ IntervalSeconds *int32 `json:"intervalSeconds,omitempty"`
+ GatewayClassName *string `json:"gatewayClassName,omitempty"`
+}
+
+// IncrementalUpgradeOptionsApplyConfiguration constructs a declarative configuration of the IncrementalUpgradeOptions type for use with
+// apply.
+func IncrementalUpgradeOptions() *IncrementalUpgradeOptionsApplyConfiguration {
+ return &IncrementalUpgradeOptionsApplyConfiguration{}
+}
+
+// WithMaxSurgePercent sets the MaxSurgePercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the MaxSurgePercent field is set to the value of the last call.
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration {
+ b.MaxSurgePercent = &value
+ return b
+}
+
+// WithStepSizePercent sets the StepSizePercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the StepSizePercent field is set to the value of the last call.
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration {
+ b.StepSizePercent = &value
+ return b
+}
+
+// WithIntervalSeconds sets the IntervalSeconds field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the IntervalSeconds field is set to the value of the last call.
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *IncrementalUpgradeOptionsApplyConfiguration {
+ b.IntervalSeconds = &value
+ return b
+}
+
+// WithGatewayClassName sets the GatewayClassName field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the GatewayClassName field is set to the value of the last call.
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *IncrementalUpgradeOptionsApplyConfiguration {
+ b.GatewayClassName = &value
+ return b
+}
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
index b0fcd8032bb..2d7f2984cef 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
@@ -2,12 +2,19 @@
package v1
+import (
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
// RayServiceStatusApplyConfiguration represents a declarative configuration of the RayServiceStatus type for use
// with apply.
type RayServiceStatusApplyConfiguration struct {
- Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"`
- RayClusterName *string `json:"rayClusterName,omitempty"`
- RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"`
+ Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"`
+ TargetCapacity *int32 `json:"targetCapacity,omitempty"`
+ TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
+ LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
+ RayClusterName *string `json:"rayClusterName,omitempty"`
+ RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"`
}
// RayServiceStatusApplyConfiguration constructs a declarative configuration of the RayServiceStatus type for use with
@@ -30,6 +37,30 @@ func (b *RayServiceStatusApplyConfiguration) WithApplications(entries map[string
return b
}
+// WithTargetCapacity sets the TargetCapacity field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the TargetCapacity field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithTargetCapacity(value int32) *RayServiceStatusApplyConfiguration {
+ b.TargetCapacity = &value
+ return b
+}
+
+// WithTrafficRoutedPercent sets the TrafficRoutedPercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the TrafficRoutedPercent field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithTrafficRoutedPercent(value int32) *RayServiceStatusApplyConfiguration {
+ b.TrafficRoutedPercent = &value
+ return b
+}
+
+// WithLastTrafficMigratedTime sets the LastTrafficMigratedTime field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the LastTrafficMigratedTime field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithLastTrafficMigratedTime(value metav1.Time) *RayServiceStatusApplyConfiguration {
+ b.LastTrafficMigratedTime = &value
+ return b
+}
+
// WithRayClusterName sets the RayClusterName field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the RayClusterName field is set to the value of the last call.
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
index 361a98f6ac9..0a190883bff 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
@@ -9,7 +9,8 @@ import (
// RayServiceUpgradeStrategyApplyConfiguration represents a declarative configuration of the RayServiceUpgradeStrategy type for use
// with apply.
type RayServiceUpgradeStrategyApplyConfiguration struct {
- Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"`
+ Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"`
+ IncrementalUpgradeOptions *IncrementalUpgradeOptionsApplyConfiguration `json:"incrementalUpgradeOptions,omitempty"`
}
// RayServiceUpgradeStrategyApplyConfiguration constructs a declarative configuration of the RayServiceUpgradeStrategy type for use with
@@ -25,3 +26,11 @@ func (b *RayServiceUpgradeStrategyApplyConfiguration) WithType(value rayv1.RaySe
b.Type = &value
return b
}
+
+// WithIncrementalUpgradeOptions sets the IncrementalUpgradeOptions field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the IncrementalUpgradeOptions field is set to the value of the last call.
+func (b *RayServiceUpgradeStrategyApplyConfiguration) WithIncrementalUpgradeOptions(value *IncrementalUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration {
+ b.IncrementalUpgradeOptions = value
+ return b
+}
diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go
index 23e455d739a..e46530b7582 100644
--- a/ray-operator/pkg/client/applyconfiguration/utils.go
+++ b/ray-operator/pkg/client/applyconfiguration/utils.go
@@ -30,6 +30,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} {
return &rayv1.HeadGroupSpecApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("HeadInfo"):
return &rayv1.HeadInfoApplyConfiguration{}
+ case v1.SchemeGroupVersion.WithKind("IncrementalUpgradeOptions"):
+ return &rayv1.IncrementalUpgradeOptionsApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("RayCluster"):
return &rayv1.RayClusterApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("RayClusterSpec"):
diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go
index 2abea2ffbbb..ce5734cee0a 100644
--- a/ray-operator/pkg/features/features.go
+++ b/ray-operator/pkg/features/features.go
@@ -24,6 +24,13 @@ const (
//
// Enables new deletion policy API in RayJob
RayJobDeletionPolicy featuregate.Feature = "RayJobDeletionPolicy"
+
+ // owner: @ryanaoleary
+ // rep: N/A
+ // alpha: v1.0
+ //
+ // Enables incremental upgrades for RayService zero-downtime upgrades.
+ RayServiceIncrementalUpgrade featuregate.Feature = "RayServiceIncrementalUpgrade"
)
func init() {
@@ -31,8 +38,9 @@ func init() {
}
var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
- RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta},
- RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha},
+ RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta},
+ RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha},
+ RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha},
}
// SetFeatureGateDuringTest is a helper method to override feature gates in tests.
diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go
new file mode 100644
index 00000000000..9ce4e87777d
--- /dev/null
+++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go
@@ -0,0 +1,320 @@
+package e2eincrementalupgrade
+
+import (
+ "fmt"
+ "strings"
+ "testing"
+
+ . "github.com/onsi/gomega"
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/errors"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/utils/ptr"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
+
+ "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+ rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
+ . "github.com/ray-project/kuberay/ray-operator/test/support"
+)
+
+// helper function to get RayCluster head service external IP to use to poll the RayService
+func GetHeadServiceExternalIP(t *testing.T, clusterName, namespace string) (string, error) {
+ test := With(t)
+
+ svc, err := test.Client().Core().CoreV1().Services(namespace).Get(test.Ctx(), clusterName+"-head-svc", metav1.GetOptions{})
+ if err != nil {
+ return "", err
+ }
+ if len(svc.Status.LoadBalancer.Ingress) == 0 {
+ return "", fmt.Errorf("no ingress for service %s", svc.Name)
+ }
+ return svc.Status.LoadBalancer.Ingress[0].IP, nil
+}
+
+func TestRayServiceIncrementalUpgrade(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ test := With(t)
+ g := NewWithT(t)
+
+ namespace := test.NewTestNamespace()
+ rayServiceName := "incremental-rayservice"
+
+ // Create a RayService with IncrementalUpgrade enabled
+ stepSize := ptr.To(int32(25))
+ interval := ptr.To(int32(10))
+ maxSurge := ptr.To(int32(50))
+
+ rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name).
+ WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge))
+ rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(rayService).NotTo(BeNil())
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium).
+ Should(WithTransform(IsRayServiceReady, BeTrue()))
+
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ // Validate Gateway and HTTPRoute objects have been created for incremental upgrade.
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway")
+ LogWithTimestamp(test.T(), "Waiting for Gateway %s/%s to be ready", rayService.Namespace, gatewayName)
+ g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium).
+ Should(WithTransform(utils.IsGatewayReady, BeTrue()))
+
+ // Get the Gateway endpoint to send requests to
+ gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway"))
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(gateway).NotTo(BeNil())
+
+ httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName)
+ LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName)
+ g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium).
+ Should(Not(BeNil()))
+
+ httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue())
+
+ // Create curl pod to test traffic routing through Gateway to RayService
+ curlPodName := "curl-pod"
+ curlContainerName := "curl-container"
+ curlPod, err := CreateCurlPod(g, test, curlPodName, curlContainerName, namespace.Name)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ LogWithTimestamp(test.T(), "Waiting for Curl Pod %s to be ready", curlPodName)
+ g.Eventually(func(g Gomega) *corev1.Pod {
+ updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred())
+ return updatedPod
+ }, TestTimeoutShort).Should(WithTransform(IsPodRunningAndReady, BeTrue()))
+
+ gatewayIP := GetGatewayIP(gateway)
+ g.Expect(gatewayIP).NotTo(BeEmpty())
+
+ LogWithTimestamp(test.T(), "Verifying RayService is serving traffic")
+ stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ g.Expect(stdout.String()).To(Equal("6"))
+ stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`)
+ g.Expect(stdout.String()).To(Equal("15 pizzas please!"))
+
+ // Trigger incremental upgrade by updating RayService serve config and RayCluster spec
+ rayService, err = GetRayService(test, namespace.Name, rayService.Name)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m")
+ serveConfig := rayService.Spec.ServeConfigV2
+ serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1)
+ serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1)
+ rayService.Spec.ServeConfigV2 = serveConfig
+ _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(
+ test.Ctx(),
+ rayService,
+ metav1.UpdateOptions{},
+ )
+ g.Expect(err).NotTo(HaveOccurred())
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue()))
+
+ LogWithTimestamp(test.T(), "Verifying temporary service creation and HTTPRoute backends")
+ upgradingRaySvc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ activeClusterName := upgradingRaySvc.Status.ActiveServiceStatus.RayClusterName
+ g.Expect(activeClusterName).NotTo(BeEmpty(), "The active cluster should be set when a RayService is ready.")
+ pendingClusterName := upgradingRaySvc.Status.PendingServiceStatus.RayClusterName
+ g.Expect(pendingClusterName).NotTo(BeEmpty(), "The controller should have created a pending cluster.")
+
+ // Validate serve service for the active cluster exists.
+ activeServeSvcName := utils.GenerateServeServiceName(activeClusterName)
+ _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), activeServeSvcName, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred(), "The serve service for the active cluster should be created.")
+
+ // Validate serve service for the pending cluster has been created for the upgrade.
+ pendingServeSvcName := utils.GenerateServeServiceName(pendingClusterName)
+ g.Eventually(func(g Gomega) {
+ _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), pendingServeSvcName, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred(), "The serve service for the pending cluster should be created.")
+ }, TestTimeoutShort).Should(Succeed())
+
+ // Verify HTTPRoute is pointing to the correct two backends.
+ g.Eventually(func(g Gomega) {
+ route, err := GetHTTPRoute(test, namespace.Name, httpRouteName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(route.Spec.Rules).To(HaveLen(1))
+ g.Expect(route.Spec.Rules[0].BackendRefs).To(HaveLen(2))
+ g.Expect(string(route.Spec.Rules[0].BackendRefs[0].Name)).To(Equal(activeServeSvcName))
+ g.Expect(string(route.Spec.Rules[0].BackendRefs[1].Name)).To(Equal(pendingServeSvcName))
+ }, TestTimeoutShort).Should(Succeed())
+
+ LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration")
+ intervalSeconds := *interval
+ var lastMigratedTime *metav1.Time
+
+ // Validate expected behavior during an IncrementalUpgrade. The following checks ensure
+ // that no requests are dropped throughout the upgrade process.
+ upgradeSteps := generateUpgradeSteps(*stepSize, *maxSurge)
+ for _, step := range upgradeSteps {
+ LogWithTimestamp(test.T(), "%s", step.name)
+ g.Eventually(func(g Gomega) int32 {
+ // Fetch updated RayService.
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ // Send a request to the RayService to validate no requests are dropped.
+ stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ g.Expect(stdout.String()).To(Or(Equal("6"), Equal("8")), "request must be served by either the old or upgraded cluster")
+
+ return step.getValue(svc)
+ }, TestTimeoutShort).Should(Equal(step.expectedValue))
+
+ if strings.Contains(step.name, "pending traffic to shift") {
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ currentMigratedTime := svc.Status.PendingServiceStatus.LastTrafficMigratedTime
+ g.Expect(currentMigratedTime).NotTo(BeNil())
+
+ // Verify IntervalSeconds have passed since last TrafficRoutedPercent update.
+ if lastMigratedTime != nil {
+ duration := currentMigratedTime.Sub(lastMigratedTime.Time)
+ g.Expect(duration.Seconds()).To(BeNumerically(">=", intervalSeconds),
+ "Time between traffic steps should be >= IntervalSeconds")
+ }
+ lastMigratedTime = currentMigratedTime
+ }
+ }
+ // Check that RayService completed upgrade
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be false", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse()))
+
+ LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes")
+ stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ g.Expect(stdout.String()).To(Equal("8"))
+}
+
+func TestRayServiceIncrementalUpgradeRollback(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ test := With(t)
+ g := NewWithT(t)
+
+ namespace := test.NewTestNamespace()
+ rayServiceName := "rollback-rayservice"
+
+ // Create a RayService with IncrementalUpgrade enabled
+ stepSize := ptr.To(int32(25))
+ interval := ptr.To(int32(10))
+ maxSurge := ptr.To(int32(50))
+
+ rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name).
+ WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge))
+ rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(rayService).NotTo(BeNil())
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium).
+ Should(WithTransform(IsRayServiceReady, BeTrue()))
+
+ // Copy original spec to use to trigger a rollback later.
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ originalSpec := rayService.Spec.DeepCopy()
+
+ // Verify Gateway and HTTPRoute are ready.
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway")
+ g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium).
+ Should(WithTransform(utils.IsGatewayReady, BeTrue()))
+
+ gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway"))
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(gateway).NotTo(BeNil())
+
+ httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName)
+ LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName)
+ g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium).
+ Should(Not(BeNil()))
+
+ httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue())
+
+ // Trigger an incremental upgrade through a change to the RayCluster spec.
+ LogWithTimestamp(test.T(), "Triggering an upgrade for RayService %s/%s", rayService.Namespace, rayService.Name)
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m")
+ _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(test.Ctx(), rayService, metav1.UpdateOptions{})
+ g.Expect(err).NotTo(HaveOccurred())
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue()))
+
+ // Wait for the upgrade to be underway with traffic partially migrated.
+ LogWithTimestamp(test.T(), "Waiting for upgrade to be partially complete")
+ g.Eventually(func(g Gomega) {
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(svc.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil())
+ g.Expect(*svc.Status.PendingServiceStatus.TrafficRoutedPercent).Should(BeNumerically(">", 0))
+ }, TestTimeoutMedium).Should(Succeed())
+
+ // Trigger a rollback by updating the spec back to the original version.
+ LogWithTimestamp(test.T(), "Triggering a rollback for RayService %s/%s", rayService.Namespace, rayService.Name)
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ rayService.Spec = *originalSpec
+ _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(test.Ctx(), rayService, metav1.UpdateOptions{})
+ g.Expect(err).NotTo(HaveOccurred())
+
+ // Verify that the controller enters the rollback state.
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s RollbackInProgress condition to be true", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceRollingBack, BeTrue()))
+
+ // Verify that traffic gradually shifts back to the active cluster.
+ LogWithTimestamp(test.T(), "Verifying traffic shifts back to the active cluster")
+ g.Eventually(func(g Gomega) {
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(svc.Status.ActiveServiceStatus.TrafficRoutedPercent).NotTo(BeNil())
+ g.Expect(*svc.Status.ActiveServiceStatus.TrafficRoutedPercent).Should(Equal(int32(100)))
+ g.Expect(svc.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil())
+ g.Expect(*svc.Status.PendingServiceStatus.TrafficRoutedPercent).Should(Equal(int32(0)))
+ }, TestTimeoutMedium).Should(Succeed())
+
+ // Verify that the rollback completes and the pending cluster is cleaned up.
+ LogWithTimestamp(test.T(), "Waiting for rollback to complete and pending cluster to be deleted")
+ g.Eventually(func(g Gomega) {
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ // Rollback is done when both conditions are false and pending status is empty.
+ g.Expect(IsRayServiceRollingBack(svc)).To(BeFalse())
+ g.Expect(IsRayServiceUpgrading(svc)).To(BeFalse())
+ g.Expect(svc.Status.PendingServiceStatus.RayClusterName).To(BeEmpty())
+ }, TestTimeoutMedium).Should(Succeed())
+
+ // Check that the pending RayCluster resource is deleted.
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ pendingClusterName := rayService.Status.PendingServiceStatus.RayClusterName
+ if pendingClusterName != "" {
+ g.Eventually(func() error {
+ _, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), pendingClusterName, metav1.GetOptions{})
+ return err
+ }, TestTimeoutShort).Should(WithTransform(errors.IsNotFound, BeTrue()))
+ }
+
+ // The HTTPRoute should now only have one backend after the rollback completes.
+ g.Eventually(HTTPRoute(test, namespace.Name, httpRouteName), TestTimeoutShort).
+ Should(WithTransform(func(route *gwv1.HTTPRoute) int {
+ if route == nil || len(route.Spec.Rules) == 0 {
+ return 0
+ }
+ return len(route.Spec.Rules[0].BackendRefs)
+ }, Equal(1)))
+}
diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go
new file mode 100644
index 00000000000..68c9e96460e
--- /dev/null
+++ b/ray-operator/test/e2eincrementalupgrade/support.go
@@ -0,0 +1,245 @@
+package e2eincrementalupgrade
+
+import (
+ "bytes"
+ "fmt"
+
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
+ "k8s.io/utils/ptr"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
+
+ rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+ rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
+ . "github.com/ray-project/kuberay/ray-operator/test/support"
+)
+
+func CurlRayServiceGateway( // CurlRayServiceGateway POSTs a JSON body to the Ray Serve app at rayServicePath through the Gateway, from inside curlPod.
+	t Test,
+	gatewayIP string,
+	curlPod *corev1.Pod,
+	curlPodContainerName,
+	rayServicePath,
+	body string,
+) (bytes.Buffer, bytes.Buffer) { // Returns the captured stdout and stderr of the curl invocation.
+	cmd := []string{
+		"curl",
+		"--max-time", "10", // Bound each request so a broken route fails fast instead of hanging the test.
+		"-X", "POST",
+		"-H", "Content-Type: application/json",
+		fmt.Sprintf("%s:80%s", gatewayIP, rayServicePath), // NOTE(review): Gateway listener port is hard-coded to 80 — keep in sync with the Gateway spec used by the tests.
+		"-d", body,
+	}
+
+	return ExecPodCmd(t, curlPod, curlPodContainerName, cmd)
+}
+
+func IncrementalUpgradeRayServiceApplyConfiguration( // Builds a RayService apply configuration that uses the IncrementalUpgrade strategy with the given options.
+	stepSizePercent, intervalSeconds, maxSurgePercent *int32, // NOTE(review): all three pointers are dereferenced below without nil checks — callers must pass non-nil values or this panics.
+) *rayv1ac.RayServiceSpecApplyConfiguration {
+	return rayv1ac.RayServiceSpec().
+		WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy().
+			WithType(rayv1.IncrementalUpgrade).
+			WithIncrementalUpgradeOptions(
+				rayv1ac.IncrementalUpgradeOptions().
+					WithGatewayClassName("istio"). // NOTE(review): GatewayClass is hard-coded to istio; parameterize if other Gateway implementations are ever tested.
+					WithStepSizePercent(*stepSizePercent).
+					WithIntervalSeconds(*intervalSeconds).
+					WithMaxSurgePercent(*maxSurgePercent),
+			)).
+		WithServeConfigV2(`applications:
+      - name: fruit_app
+        import_path: fruit.deployment_graph
+        route_prefix: /fruit
+        runtime_env:
+          working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip"
+        deployments:
+          - name: MangoStand
+            num_replicas: 1
+            user_config:
+              price: 3
+            ray_actor_options:
+              num_cpus: 0.1
+          - name: OrangeStand
+            num_replicas: 1
+            user_config:
+              price: 2
+            ray_actor_options:
+              num_cpus: 0.1
+          - name: FruitMarket
+            num_replicas: 1
+            ray_actor_options:
+              num_cpus: 0.1
+      - name: math_app
+        import_path: conditional_dag.serve_dag
+        route_prefix: /calc
+        runtime_env:
+          working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip"
+        deployments:
+          - name: Adder
+            num_replicas: 1
+            user_config:
+              increment: 3
+            ray_actor_options:
+              num_cpus: 0.1
+          - name: Multiplier
+            num_replicas: 1
+            user_config:
+              factor: 5
+            ray_actor_options:
+              num_cpus: 0.1
+          - name: Router
+            ray_actor_options:
+              num_cpus: 0.1
+            num_replicas: 1`).
+		WithRayClusterSpec(rayv1ac.RayClusterSpec().
+			WithRayVersion(GetRayVersion()).
+			WithEnableInTreeAutoscaling(true). // Run with the in-tree autoscaler enabled for this upgrade scenario.
+			WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
+				WithRayStartParams(map[string]string{"dashboard-host": "0.0.0.0"}). // Expose the dashboard beyond localhost so the operator can reach it.
+				WithTemplate(corev1ac.PodTemplateSpec().
+					WithSpec(corev1ac.PodSpec().
+						WithRestartPolicy(corev1.RestartPolicyNever).
+						WithContainers(corev1ac.Container().
+							WithName("ray-head").
+							WithImage(GetRayImage()).
+							WithEnv(corev1ac.EnvVar().WithName(utils.RAY_ENABLE_AUTOSCALER_V2).WithValue("1")). // Opt in to autoscaler v2.
+							WithPorts(
+								corev1ac.ContainerPort().WithName(utils.GcsServerPortName).WithContainerPort(utils.DefaultGcsServerPort),
+								corev1ac.ContainerPort().WithName(utils.ServingPortName).WithContainerPort(utils.DefaultServingPort),
+								corev1ac.ContainerPort().WithName(utils.DashboardPortName).WithContainerPort(utils.DefaultDashboardPort),
+								corev1ac.ContainerPort().WithName(utils.ClientPortName).WithContainerPort(utils.DefaultClientPort),
+							).
+							WithResources(corev1ac.ResourceRequirements().
+								WithRequests(corev1.ResourceList{
+									corev1.ResourceCPU:    resource.MustParse("2"),
+									corev1.ResourceMemory: resource.MustParse("3Gi"),
+								}).
+								WithLimits(corev1.ResourceList{
+									corev1.ResourceCPU:    resource.MustParse("2"),
+									corev1.ResourceMemory: resource.MustParse("3Gi"),
+								})))))).
+			WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
+				WithReplicas(1).
+				WithMinReplicas(1).
+				WithMaxReplicas(4). // Headroom for the autoscaler to add workers during the upgrade.
+				WithRayStartParams(map[string]string{"num-cpus": "1"}).
+				WithGroupName("small-group").
+				WithTemplate(corev1ac.PodTemplateSpec().
+					WithSpec(corev1ac.PodSpec().
+						WithRestartPolicy(corev1.RestartPolicyNever).
+						WithContainers(corev1ac.Container().
+							WithName("ray-worker").
+							WithImage(GetRayImage()).
+							WithResources(corev1ac.ResourceRequirements().
+								WithRequests(corev1.ResourceList{
+									corev1.ResourceCPU:    resource.MustParse("300m"),
+									corev1.ResourceMemory: resource.MustParse("1G"),
+								}).
+								WithLimits(corev1.ResourceList{
+									corev1.ResourceCPU:    resource.MustParse("500m"),
+									corev1.ResourceMemory: resource.MustParse("1G"),
+								})))))),
+		)
+}
+
+// GetGatewayIP returns the value of the first Gateway status address whose type is IPAddress (or is untyped); it returns "" for a nil Gateway or when no such address exists.
+func GetGatewayIP(gateway *gwv1.Gateway) string {
+	if gateway == nil {
+		return ""
+	}
+	for _, addr := range gateway.Status.Addresses {
+		if addr.Type == nil || *addr.Type == gwv1.IPAddressType { // Untyped addresses are treated as IPs; other types (e.g. Hostname) are skipped.
+			return addr.Value
+		}
+	}
+
+	return ""
+}
+
+func GetPendingCapacity(rs *rayv1.RayService) int32 { // GetPendingCapacity returns the pending cluster's TargetCapacity, defaulting to 0 when unset.
+	return ptr.Deref(rs.Status.PendingServiceStatus.TargetCapacity, 0)
+}
+
+func GetPendingTraffic(rs *rayv1.RayService) int32 { // GetPendingTraffic returns the pending cluster's TrafficRoutedPercent, defaulting to 0 when unset.
+	return ptr.Deref(rs.Status.PendingServiceStatus.TrafficRoutedPercent, 0)
+}
+
+func GetActiveCapacity(rs *rayv1.RayService) int32 { // GetActiveCapacity returns the active cluster's TargetCapacity, defaulting to 100 when unset (pending defaults to 0 above — the two are deliberately asymmetric).
+	return ptr.Deref(rs.Status.ActiveServiceStatus.TargetCapacity, 100)
+}
+
+func GetActiveTraffic(rs *rayv1.RayService) int32 { // GetActiveTraffic returns the active cluster's TrafficRoutedPercent, defaulting to 100 when unset.
+	return ptr.Deref(rs.Status.ActiveServiceStatus.TrafficRoutedPercent, 100)
+}
+
+func GetLastTrafficMigratedTime(rs *rayv1.RayService) *metav1.Time { // GetLastTrafficMigratedTime returns the active cluster's LastTrafficMigratedTime; may be nil.
+	return rs.Status.ActiveServiceStatus.LastTrafficMigratedTime
+}
+
+// testStep defines a validation condition to wait for during the upgrade.
+type testStep struct {
+	getValue      func(rs *rayv1.RayService) int32 // Extracts the observed value (e.g. pending traffic percent) from the RayService status.
+	name          string                           // Human-readable description logged while waiting for the step.
+	expectedValue int32                            // Value getValue must reach for the step to be satisfied.
+}
+
+// generateUpgradeSteps is a helper function for testing that the controller follows the expected
+// sequence of updates to TrafficRoutedPercent and TargetCapacity during an incremental upgrade.
+func generateUpgradeSteps(stepSize, maxSurge int32) []testStep { // NOTE(review): both arguments are assumed to be in (0, 100]; a zero stepSize or maxSurge makes the loop below never terminate.
+	var steps []testStep
+
+	pendingCapacity := int32(0) // Upgrade starts with the pending cluster at zero capacity/traffic...
+	pendingTraffic := int32(0)
+	activeCapacity := int32(100) // ...and the active cluster carrying everything.
+	activeTraffic := int32(100)
+
+	for pendingTraffic < 100 { // Done once all traffic has been shifted to the pending (upgraded) cluster.
+		// Scale up the pending cluster's TargetCapacity.
+		if pendingTraffic == pendingCapacity {
+			nextPendingCapacity := min(pendingCapacity+maxSurge, 100)
+			if nextPendingCapacity > pendingCapacity {
+				steps = append(steps, testStep{
+					name:          fmt.Sprintf("Waiting for pending capacity to scale up to %d", nextPendingCapacity),
+					getValue:      GetPendingCapacity,
+					expectedValue: nextPendingCapacity,
+				})
+				pendingCapacity = nextPendingCapacity
+			}
+		}
+
+		// Shift traffic over from the active to the pending cluster by StepSizePercent.
+		for pendingTraffic < pendingCapacity {
+			nextPendingTraffic := min(pendingTraffic+stepSize, 100) // NOTE(review): capped at 100, not at pendingCapacity, so the last step can overshoot capacity when stepSize does not divide maxSurge — confirm the controller behaves identically.
+			steps = append(steps, testStep{
+				name:          fmt.Sprintf("Waiting for pending traffic to shift to %d", nextPendingTraffic),
+				getValue:      GetPendingTraffic,
+				expectedValue: nextPendingTraffic,
+			})
+			pendingTraffic = nextPendingTraffic
+
+			nextActiveTraffic := max(activeTraffic-stepSize, 0) // Active traffic mirrors the pending shift: each step moves stepSize percent from active to pending.
+			steps = append(steps, testStep{
+				name:          fmt.Sprintf("Waiting for active traffic to shift down to %d", nextActiveTraffic),
+				getValue:      GetActiveTraffic,
+				expectedValue: nextActiveTraffic,
+			})
+			activeTraffic = nextActiveTraffic
+		}
+
+		// Scale down the active cluster's target capacity.
+		nextActiveCapacity := max(activeCapacity-maxSurge, 0)
+		if nextActiveCapacity < activeCapacity {
+			steps = append(steps, testStep{
+				name:          fmt.Sprintf("Waiting for active capacity to scale down to %d", nextActiveCapacity),
+				getValue:      GetActiveCapacity,
+				expectedValue: nextActiveCapacity,
+			})
+			activeCapacity = nextActiveCapacity
+		}
+	}
+	return steps
+}
diff --git a/ray-operator/test/support/client.go b/ray-operator/test/support/client.go
index 2e313483966..4925184d46b 100644
--- a/ray-operator/test/support/client.go
+++ b/ray-operator/test/support/client.go
@@ -8,6 +8,7 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
+ gatewayclient "sigs.k8s.io/gateway-api/pkg/client/clientset/versioned"
rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned"
)
@@ -17,6 +18,7 @@ type Client interface {
Ray() rayclient.Interface
Dynamic() dynamic.Interface
Config() rest.Config
+ Gateway() gatewayclient.Interface
}
type testClient struct {
@@ -24,6 +26,7 @@ type testClient struct {
ray rayclient.Interface
dynamic dynamic.Interface
config rest.Config
+ gateway gatewayclient.Interface
}
var _ Client = (*testClient)(nil)
@@ -44,6 +47,10 @@ func (t *testClient) Config() rest.Config {
return t.config
}
+func (t *testClient) Gateway() gatewayclient.Interface { // Gateway returns the Gateway API clientset used to inspect Gateways and HTTPRoutes in tests.
+	return t.gateway
+}
+
func newTestClient() (Client, error) {
cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
clientcmd.NewDefaultClientConfigLoadingRules(),
@@ -68,10 +75,16 @@ func newTestClient() (Client, error) {
return nil, err
}
+ gatewayClient, err := gatewayclient.NewForConfig(cfg)
+ if err != nil {
+ return nil, err
+ }
+
return &testClient{
core: kubeClient,
ray: rayClient,
dynamic: dynamicClient,
config: *cfg,
+ gateway: gatewayClient,
}, nil
}
diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go
index ffea3c75d87..162910081d6 100644
--- a/ray-operator/test/support/ray.go
+++ b/ray-operator/test/support/ray.go
@@ -9,6 +9,7 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -215,6 +216,10 @@ func IsRayServiceUpgrading(service *rayv1.RayService) bool {
return meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.UpgradeInProgress))
}
+func IsRayServiceRollingBack(service *rayv1.RayService) bool { // IsRayServiceRollingBack reports whether the RollbackInProgress status condition is currently true.
+	return meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.RollbackInProgress))
+}
+
func RayServicesNumEndPoints(service *rayv1.RayService) int32 {
return service.Status.NumServeEndpoints
}
@@ -226,3 +231,23 @@ func GetRayClusterWorkerGroupReplicaSum(cluster *rayv1.RayCluster) int32 {
}
return replicas
}
+
+func GetHTTPRoute(t Test, namespace, name string) (*gwv1.HTTPRoute, error) { // GetHTTPRoute fetches the named HTTPRoute via the Gateway API clientset.
+	return t.Client().Gateway().GatewayV1().HTTPRoutes(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+}
+
+func HTTPRoute(t Test, namespace, name string) func() (*gwv1.HTTPRoute, error) { // HTTPRoute returns a getter closure in the (T, error) shape Gomega's Eventually polls.
+	return func() (*gwv1.HTTPRoute, error) {
+		return GetHTTPRoute(t, namespace, name)
+	}
+}
+
+func GetGateway(t Test, namespace, name string) (*gwv1.Gateway, error) { // GetGateway fetches the named Gateway via the Gateway API clientset.
+	return t.Client().Gateway().GatewayV1().Gateways(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+}
+
+func Gateway(t Test, namespace, name string) func() (*gwv1.Gateway, error) { // Gateway returns a getter closure in the (T, error) shape Gomega's Eventually polls.
+	return func() (*gwv1.Gateway, error) {
+		return GetGateway(t, namespace, name)
+	}
+}