
Commit f9d1a4d

test kind
1 parent 2784494 commit f9d1a4d

File tree

1 file changed (+95, -80 lines)


tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 95 additions & 80 deletions
@@ -2,7 +2,6 @@
 
 from codeflare_sdk import Cluster, ClusterConfiguration
 from codeflare_sdk.ray.rayjobs import RayJob
-from codeflare_sdk.ray.rayjobs.status import CodeflareRayJobStatus
 
 import pytest
 
@@ -105,104 +104,120 @@ def assert_rayjob_submit_against_existing_cluster(
         ), f"Job submission failed, expected {job_name}, got {submission_result}"
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")
 
-        # Debug: Check if RayJob resource was actually created
-        import subprocess
-        import time
-
-        print("🔍 Checking if RayJob resource exists in Kubernetes...")
-        for attempt in range(6):  # Check for 30 seconds
-            try:
-                # Check if RayJob resource exists
-                result = subprocess.run(
-                    ["kubectl", "get", "rayjobs", "-n", self.namespace, job_name],
-                    capture_output=True,
-                    text=True,
-                    timeout=10,
-                )
-                if result.returncode == 0:
-                    print(f"✅ RayJob resource '{job_name}' found in Kubernetes!")
-                    print(f"RayJob details:\n{result.stdout}")
-                    break
-                else:
-                    print(
-                        f"❌ Attempt {attempt + 1}: RayJob resource '{job_name}' not found"
-                    )
-                    if attempt < 5:
-                        time.sleep(5)
-            except Exception as e:
-                print(f"❌ Error checking RayJob: {e}")
-
-        # Also check what RayJob resources exist in the namespace
-        try:
-            result = subprocess.run(
-                ["kubectl", "get", "rayjobs", "-n", self.namespace],
-                capture_output=True,
-                text=True,
-                timeout=10,
-            )
-            print(f"📋 All RayJobs in namespace '{self.namespace}':\n{result.stdout}")
-        except Exception as e:
-            print(f"❌ Error listing RayJobs: {e}")
-
-        # Monitor the job status until completion
-        self.monitor_rayjob_completion(rayjob, timeout=900)
+        # Monitor the job status until completion using kubectl (Kind-specific workaround)
+        self.monitor_rayjob_completion_kubectl(job_name, timeout=900)
 
         print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")
 
-    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
+    def monitor_rayjob_completion_kubectl(self, job_name: str, timeout: int = 900):
         """
-        Monitor a RayJob until it completes or fails.
+        Monitor a RayJob until it completes or fails using kubectl directly.
+        This is a workaround for Kind clusters where the SDK status method doesn't work.
 
         Args:
-            rayjob: The RayJob instance to monitor
+            job_name: The name of the RayJob to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
         """
-        print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")
+        import subprocess
+        import time
+
+        print(f"⏳ Monitoring RayJob '{job_name}' status using kubectl...")
 
         elapsed_time = 0
         check_interval = 10  # Check every 10 seconds
-        job_found = False  # Track if we've seen the job at least once
 
         while elapsed_time < timeout:
-            status, ready = rayjob.status(print_to_console=True)
-
-            # Track if we've found the job (not UNKNOWN status)
-            if status != CodeflareRayJobStatus.UNKNOWN:
-                job_found = True
-
-            # Check if job has completed (either successfully or failed)
-            if status == CodeflareRayJobStatus.COMPLETE:
-                print(f"✅ RayJob '{rayjob.name}' completed successfully!")
-                return
-            elif status == CodeflareRayJobStatus.FAILED:
-                raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
-            elif status == CodeflareRayJobStatus.RUNNING:
-                print(f"🏃 RayJob '{rayjob.name}' is still running...")
-            elif status == CodeflareRayJobStatus.UNKNOWN:
-                if job_found:
-                    # If we've seen the job before but now it's unknown, that's concerning
-                    print(
-                        f"⚠️ RayJob '{rayjob.name}' status became unknown after being found"
+            try:
+                # Get RayJob status using kubectl
+                result = subprocess.run(
+                    [
+                        "kubectl",
+                        "get",
+                        "rayjobs",
+                        "-n",
+                        self.namespace,
+                        job_name,
+                        "-o",
+                        "jsonpath={.status.jobDeploymentStatus}",
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+
+                if result.returncode == 0:
+                    status = result.stdout.strip()
+
+                    # Also get job status for more details
+                    job_status_result = subprocess.run(
+                        [
+                            "kubectl",
+                            "get",
+                            "rayjobs",
+                            "-n",
+                            self.namespace,
+                            job_name,
+                            "-o",
+                            "jsonpath={.status.jobStatus}",
+                        ],
+                        capture_output=True,
+                        text=True,
+                        timeout=10,
                     )
-                else:
-                    # Job hasn't appeared yet, this is normal initially
+                    job_status = (
+                        job_status_result.stdout.strip()
+                        if job_status_result.returncode == 0
+                        else "Unknown"
+                    )
+
                     print(
-                        f"⏳ Waiting for RayJob '{rayjob.name}' to appear in Kubernetes..."
+                        f"📊 RayJob '{job_name}' - Deployment Status: {status}, Job Status: {job_status}"
                     )
 
+                    # Check completion status
+                    if status == "Complete" or job_status == "SUCCEEDED":
+                        print(f"✅ RayJob '{job_name}' completed successfully!")
+                        return
+                    elif status == "Failed" or job_status == "FAILED":
+                        # Get error details
+                        try:
+                            error_result = subprocess.run(
+                                [
+                                    "kubectl",
+                                    "get",
+                                    "rayjobs",
+                                    "-n",
+                                    self.namespace,
+                                    job_name,
+                                    "-o",
+                                    "yaml",
+                                ],
+                                capture_output=True,
+                                text=True,
+                                timeout=10,
+                            )
+                            print(
+                                f"❌ RayJob '{job_name}' failed! Details:\n{error_result.stdout}"
+                            )
+                        except:
+                            pass
+                        raise AssertionError(f"❌ RayJob '{job_name}' failed!")
+                    elif status == "Running" or job_status == "RUNNING":
+                        print(f"🏃 RayJob '{job_name}' is still running...")
+                    else:
+                        print(f"⏳ RayJob '{job_name}' status: {status}")
+
+                else:
+                    print(f"❌ Could not get RayJob status: {result.stderr}")
+
+            except Exception as e:
+                print(f"❌ Error checking RayJob status: {e}")
+
             # Wait before next check
            sleep(check_interval)
            elapsed_time += check_interval
 
         # If we reach here, the job has timed out
-        final_status, _ = rayjob.status(print_to_console=True)
-        if not job_found:
-            raise TimeoutError(
-                f"⏰ RayJob '{rayjob.name}' was never found in Kubernetes within {timeout} seconds. "
-                f"Check if the RayJob resource was created successfully."
-            )
-        else:
-            raise TimeoutError(
-                f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
-                f"Final status: {final_status}"
-            )
+        raise TimeoutError(
+            f"⏰ RayJob '{job_name}' did not complete within {timeout} seconds."
+        )
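
Note (not part of this commit): the new helper shells out to kubectl on every poll. The same status.jobDeploymentStatus and status.jobStatus fields could also be read with the Kubernetes Python client; below is a minimal sketch under the assumption that the KubeRay CRD is served as group ray.io, version v1 (adjust for your cluster), with a get_rayjob_status helper name chosen here purely for illustration.

    # Minimal sketch (not from this commit): read RayJob status via the
    # Kubernetes Python client instead of invoking kubectl in a subprocess.
    from kubernetes import client, config


    def get_rayjob_status(name: str, namespace: str) -> tuple[str, str]:
        """Return (jobDeploymentStatus, jobStatus) for a RayJob custom resource."""
        config.load_kube_config()  # assumes a local kubeconfig, as in a Kind e2e setup
        api = client.CustomObjectsApi()
        rayjob = api.get_namespaced_custom_object(
            group="ray.io",  # assumed CRD group/version for current KubeRay releases
            version="v1",
            namespace=namespace,
            plural="rayjobs",
            name=name,
        )
        status = rayjob.get("status", {})
        return (
            status.get("jobDeploymentStatus", "Unknown"),
            status.get("jobStatus", "Unknown"),
        )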
