
Commit 678102c

fix: test
1 parent f9d1a4d commit 678102c

3 files changed: +24 −98 lines


.github/workflows/e2e_tests.yaml

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ jobs:
 kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
 kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
-kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
+kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs --resource=rayjobs/status
 kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
 kubectl config use-context sdk-user
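
The extra --resource=rayjobs/status grant matters because Kubernetes RBAC authorizes a status subresource separately from its parent resource. As a minimal sketch of how the sdk-user's access could be verified from Python, using the official kubernetes client (the ray.io API group for the RayJob CRD and the kubeconfig context are assumptions for illustration, not part of this commit):

from kubernetes import client, config

# Load the kubeconfig; the check runs as whatever identity the current
# context uses (e.g. after `kubectl config use-context sdk-user`).
config.load_kube_config()

# SelfSubjectAccessReview asks the API server: "can *I* do this?"
review = client.V1SelfSubjectAccessReview(
    spec=client.V1SelfSubjectAccessReviewSpec(
        resource_attributes=client.V1ResourceAttributes(
            group="ray.io",        # assumed API group of the KubeRay RayJob CRD
            resource="rayjobs",
            subresource="status",  # authorized separately from the rayjobs resource
            verb="get",
        )
    )
)

resp = client.AuthorizationV1Api().create_self_subject_access_review(review)
print(f"can get rayjobs/status: {resp.status.allowed}")

If only the plain rayjobs resource were granted, this check would come back false, since the /status path has to be listed explicitly in the role.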

docs/sphinx/user-docs/e2e.rst

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ instructions <https://www.substratus.ai/blog/kind-with-gpus>`__.
 kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
 kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
-kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
+kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs --resource=rayjobs/status
 kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
 kubectl config use-context sdk-user

tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 22 additions & 96 deletions
@@ -2,6 +2,7 @@

 from codeflare_sdk import Cluster, ClusterConfiguration
 from codeflare_sdk.ray.rayjobs import RayJob
+from codeflare_sdk.ray.rayjobs.status import CodeflareRayJobStatus

 import pytest

@@ -104,120 +105,45 @@ def assert_rayjob_submit_against_existing_cluster(
         ), f"Job submission failed, expected {job_name}, got {submission_result}"
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")

-        # Monitor the job status until completion using kubectl (Kind-specific workaround)
-        self.monitor_rayjob_completion_kubectl(job_name, timeout=900)
+        # Monitor the job status until completion
+        self.monitor_rayjob_completion(rayjob, timeout=900)

         print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")

-    def monitor_rayjob_completion_kubectl(self, job_name: str, timeout: int = 900):
+    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
         """
-        Monitor a RayJob until it completes or fails using kubectl directly.
-        This is a workaround for Kind clusters where the SDK status method doesn't work.
+        Monitor a RayJob until it completes or fails.

         Args:
-            job_name: The name of the RayJob to monitor
+            rayjob: The RayJob instance to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
         """
-        import subprocess
-        import time
-
-        print(f"⏳ Monitoring RayJob '{job_name}' status using kubectl...")
+        print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")

         elapsed_time = 0
         check_interval = 10  # Check every 10 seconds

         while elapsed_time < timeout:
-            try:
-                # Get RayJob status using kubectl
-                result = subprocess.run(
-                    [
-                        "kubectl",
-                        "get",
-                        "rayjobs",
-                        "-n",
-                        self.namespace,
-                        job_name,
-                        "-o",
-                        "jsonpath={.status.jobDeploymentStatus}",
-                    ],
-                    capture_output=True,
-                    text=True,
-                    timeout=10,
-                )
-
-                if result.returncode == 0:
-                    status = result.stdout.strip()
-
-                    # Also get job status for more details
-                    job_status_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "rayjobs",
-                            "-n",
-                            self.namespace,
-                            job_name,
-                            "-o",
-                            "jsonpath={.status.jobStatus}",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    job_status = (
-                        job_status_result.stdout.strip()
-                        if job_status_result.returncode == 0
-                        else "Unknown"
-                    )
-
-                    print(
-                        f"📊 RayJob '{job_name}' - Deployment Status: {status}, Job Status: {job_status}"
-                    )
-
-                    # Check completion status
-                    if status == "Complete" or job_status == "SUCCEEDED":
-                        print(f"✅ RayJob '{job_name}' completed successfully!")
-                        return
-                    elif status == "Failed" or job_status == "FAILED":
-                        # Get error details
-                        try:
-                            error_result = subprocess.run(
-                                [
-                                    "kubectl",
-                                    "get",
-                                    "rayjobs",
-                                    "-n",
-                                    self.namespace,
-                                    job_name,
-                                    "-o",
-                                    "yaml",
-                                ],
-                                capture_output=True,
-                                text=True,
-                                timeout=10,
-                            )
-                            print(
-                                f"❌ RayJob '{job_name}' failed! Details:\n{error_result.stdout}"
-                            )
-                        except:
-                            pass
-                        raise AssertionError(f"❌ RayJob '{job_name}' failed!")
-                    elif status == "Running" or job_status == "RUNNING":
-                        print(f"🏃 RayJob '{job_name}' is still running...")
-                    else:
-                        print(f"⏳ RayJob '{job_name}' status: {status}")
-
-                else:
-                    print(f"❌ Could not get RayJob status: {result.stderr}")
-
-            except Exception as e:
-                print(f"❌ Error checking RayJob status: {e}")
+            status, ready = rayjob.status(print_to_console=True)
+
+            # Check if job has completed (either successfully or failed)
+            if status == CodeflareRayJobStatus.COMPLETE:
+                print(f"✅ RayJob '{rayjob.name}' completed successfully!")
+                return
+            elif status == CodeflareRayJobStatus.FAILED:
+                raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
+            elif status == CodeflareRayJobStatus.RUNNING:
+                print(f"🏃 RayJob '{rayjob.name}' is still running...")
+            elif status == CodeflareRayJobStatus.UNKNOWN:
+                print(f"❓ RayJob '{rayjob.name}' status is unknown")

             # Wait before next check
             sleep(check_interval)
             elapsed_time += check_interval

         # If we reach here, the job has timed out
+        final_status, _ = rayjob.status(print_to_console=True)
         raise TimeoutError(
-            f"⏰ RayJob '{job_name}' did not complete within {timeout} seconds."
+            f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
+            f"Final status: {final_status}"
         )
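
The new monitor_rayjob_completion is a plain poll-until-terminal-state loop over the SDK's status API rather than a kubectl subprocess, which appears to be why the RBAC change above adds read access to rayjobs/status. Pulled out of the test class, the same loop looks like this (a minimal sketch; it assumes only what the diff shows: rayjob.name and rayjob.status(print_to_console=...) returning a (status, ready) tuple):

from time import sleep

from codeflare_sdk.ray.rayjobs import RayJob
from codeflare_sdk.ray.rayjobs.status import CodeflareRayJobStatus


def wait_for_rayjob(rayjob: RayJob, timeout: int = 900, check_interval: int = 10) -> None:
    """Poll rayjob.status() until the job completes, fails, or the timeout elapses."""
    elapsed = 0
    while elapsed < timeout:
        status, _ready = rayjob.status(print_to_console=True)
        if status == CodeflareRayJobStatus.COMPLETE:
            return
        if status == CodeflareRayJobStatus.FAILED:
            raise AssertionError(f"RayJob '{rayjob.name}' failed")
        sleep(check_interval)
        elapsed += check_interval
    raise TimeoutError(f"RayJob '{rayjob.name}' did not complete within {timeout} seconds")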
