Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b7c4874
Add incremental upgrade API changes to KubeRay
ryanaoleary Mar 4, 2025
a541336
Fix some tests and create Gateway for pending cluster
ryanaoleary Jun 4, 2025
9cb7ed6
Fix merge errors
ryanaoleary Jun 4, 2025
a864f02
Manually sync rbac for gateway
ryanaoleary Jun 4, 2025
8486755
Fix bugs and e2e test
ryanaoleary Jun 4, 2025
3ddd929
Add Makefile command
ryanaoleary Jun 4, 2025
4157f03
Run 'make sync'
ryanaoleary Jun 4, 2025
29c6696
Run 'make generate'
ryanaoleary Jun 4, 2025
8397455
Fix comments
ryanaoleary Jun 4, 2025
a8ee0d5
Run 'make api-docs'
ryanaoleary Jun 4, 2025
be581d6
Fix tests after merge conflicts
ryanaoleary Sep 16, 2025
afa0bc7
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
ef59b86
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
eb57b69
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
db11042
Fix error return
ryanaoleary Sep 29, 2025
f163b82
Add RayServiceIncrementalUpgrade feature gate option to helm
ryanaoleary Sep 29, 2025
5d8cbb5
Remove unnecessary perms
ryanaoleary Sep 29, 2025
e95d0a5
Remove delete perm and run lint
ryanaoleary Sep 29, 2025
c8bbb4e
Fix helm roles
ryanaoleary Sep 29, 2025
085914c
add back required perms
ryanaoleary Sep 30, 2025
3a929f6
Update ray-operator/controllers/ray/utils/validation.go
ryanaoleary Oct 1, 2025
3dd7945
Update ray-operator/controllers/ray/utils/util.go
ryanaoleary Oct 1, 2025
93f5096
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 1, 2025
2db266c
Change controller to use two serve services during upgrade
ryanaoleary Oct 1, 2025
7457d90
Remove Gateway and HTTPRoute API fields
ryanaoleary Oct 1, 2025
689682f
Fix port errors
ryanaoleary Oct 1, 2025
bca4fcf
Fix comments and build issues
ryanaoleary Oct 2, 2025
b48b988
fix helm-chart-verify-rbac
ryanaoleary Oct 2, 2025
f5d0243
Implement rollback support
ryanaoleary Oct 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,25 @@ _Appears in:_
| `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is the Kubernetes service type of the head service. It is used by the workers to connect to the head pod. | | |


#### IncrementalUpgradeOptions







_Appears in:_
- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy)

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.<br />Defaults to 100%. | 100 | |
| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | |
| `intervalSeconds` _integer_ | The interval in seconds between transferring `stepSizePercent` of traffic from the old to the new RayCluster. | | |
| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | |




#### JobSubmissionMode
Expand Down Expand Up @@ -319,7 +338,8 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports<br />`NewCluster`, `IncrementalUpgrade`, and `None`. | | |
| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.<br />RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. | | |


#### RayServiceUpgradeType
Expand Down
11 changes: 6 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ require (
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/term v0.5.0 // indirect
Expand All @@ -95,12 +95,12 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
golang.org/x/sync v0.12.0 // indirect
golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/term v0.31.0 // indirect
golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.10.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
Expand All @@ -112,6 +112,7 @@ require (
k8s.io/component-base v0.33.1 // indirect
k8s.io/component-helpers v0.33.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/gateway-api v1.3.0 // indirect
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
sigs.k8s.io/kustomize/api v0.19.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
Expand Down
21 changes: 12 additions & 9 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions helm-chart/kuberay-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ spec:
| featureGates[0].enabled | bool | `true` | |
| featureGates[1].name | string | `"RayJobDeletionPolicy"` | |
| featureGates[1].enabled | bool | `false` | |
| featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | |
| featureGates[2].enabled | bool | `false` | |
| metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. |
| metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor |
| metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval |
Expand Down
37 changes: 37 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions helm-chart/kuberay-operator/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,17 @@ rules:
- patch
- update
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- gateways
- httproutes
verbs:
- create
- get
- list
- update
- watch
- apiGroups:
- networking.k8s.io
resources:
Expand Down
2 changes: 2 additions & 0 deletions helm-chart/kuberay-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ featureGates:
enabled: true
- name: RayJobDeletionPolicy
enabled: false
- name: RayServiceIncrementalUpgrade
enabled: false

# Configurations for KubeRay operator metrics.
metrics:
Expand Down
10 changes: 9 additions & 1 deletion ray-operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,16 @@ test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler
test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests.
go test -timeout 30m -v $(WHAT)

test-e2e-rayservice: WHAT ?= ./test/e2erayservice
test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests.
go test -timeout 30m -v $(WHAT)

test-e2e-upgrade: WHAT ?= ./test/e2eupgrade
test-e2e-upgrade: manifests fmt vet ## Run e2e tests.
test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests.
go test -timeout 30m -v $(WHAT)

test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade
test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests.
go test -timeout 30m -v $(WHAT)

test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter
Expand Down
34 changes: 31 additions & 3 deletions ray-operator/apis/ray/v1/rayservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ const (
type RayServiceUpgradeType string

const (
// During upgrade, IncrementalUpgrade strategy will create an upgraded cluster, gradually scale it,
// and migrate traffic to it using Gateway API.
IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade"
// During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready
NewCluster RayServiceUpgradeType = "NewCluster"
// No new cluster will be created while the strategy is set to None
Expand Down Expand Up @@ -57,10 +60,27 @@ var DeploymentStatusEnum = struct {
UNHEALTHY: "UNHEALTHY",
}

// IncrementalUpgradeOptions holds the settings of the IncrementalUpgrade
// strategy: the surge capacity, the traffic step size, the interval between
// steps, and the Gateway Class to use.
type IncrementalUpgradeOptions struct {
// The capacity of serve requests the upgraded cluster should scale to handle each interval.
// Defaults to 100%.
// +kubebuilder:default:=100
MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
// The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent.
StepSizePercent *int32 `json:"stepSizePercent"`
// The interval in seconds between transferring StepSizePercent of traffic from the old to the new RayCluster.
IntervalSeconds *int32 `json:"intervalSeconds"`
// The name of the Gateway Class installed by the Kubernetes Cluster admin.
GatewayClassName string `json:"gatewayClassName"`
}

// RayServiceUpgradeStrategy selects the upgrade type of a RayService and
// carries the options that apply to the IncrementalUpgrade type.
type RayServiceUpgradeStrategy struct {
// Type represents the strategy used when upgrading the RayService. Currently supports
// `NewCluster`, `IncrementalUpgrade`, and `None`.
// +optional
Type *RayServiceUpgradeType `json:"type,omitempty"`
// IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
// RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions.
IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"`
}

// RayServiceSpec defines the desired state of RayService
Expand Down Expand Up @@ -130,6 +150,12 @@ type RayServiceStatus struct {
// +optional
Applications map[string]AppStatus `json:"applicationStatuses,omitempty"`
// +optional
TargetCapacity *int32 `json:"targetCapacity,omitempty"`
// +optional
TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
// +optional
LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
// +optional
RayClusterName string `json:"rayClusterName,omitempty"`
// +optional
RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"`
Expand Down Expand Up @@ -162,6 +188,8 @@ const (
RayServiceReady RayServiceConditionType = "Ready"
// UpgradeInProgress means the RayService is currently performing a zero-downtime upgrade.
UpgradeInProgress RayServiceConditionType = "UpgradeInProgress"
// RollbackInProgress means the RayService is currently rolling back an in-progress upgrade to the original cluster state.
RollbackInProgress RayServiceConditionType = "RollbackInProgress"
)

const (
Expand All @@ -171,6 +199,7 @@ const (
BothActivePendingClustersExist RayServiceConditionReason = "BothActivePendingClustersExist"
NoPendingCluster RayServiceConditionReason = "NoPendingCluster"
NoActiveCluster RayServiceConditionReason = "NoActiveCluster"
GoalClusterChanged RayServiceConditionReason = "GoalClusterChanged"
)

// +kubebuilder:object:root=true
Expand All @@ -184,8 +213,7 @@ const (
type RayService struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec holds the RayServiceSpec of this object.
Spec RayServiceSpec `json:"spec,omitempty"`
// Status holds the RayServiceStatuses of this object.
// +optional
Status RayServiceStatuses `json:"status,omitempty"`
}
Expand Down
Loading
Loading