
Commit a053981

fix: test
1 parent 525e042 commit a053981

File tree

2 files changed: +15 -241 lines changed


tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 9 additions & 210 deletions

@@ -46,11 +46,12 @@ def run_rayjob_against_existing_cluster_kind(
             num_workers=1,
             head_cpu_requests="500m",
             head_cpu_limits="500m",
-            worker_cpu_requests=2,
-            worker_cpu_limits=4,
-            worker_memory_requests=4,
-            worker_memory_limits=8,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=4,
             worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
+            image="rayproject/ray:2.47.1",
             write_to_file=True,
             verify_tls=False,
         )
@@ -95,8 +96,7 @@ def assert_rayjob_submit_against_existing_cluster(
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
                 "env_vars": get_setup_env_variables(ACCELERATOR=accelerator),
             },
-            shutdown_after_job_finishes=False,
-            # entrypoint_num_gpus=number_of_gpus if number_of_gpus > 0 else None, # Temporarily disabled to test basic functionality
+            shutdown_after_job_finishes=False, # Don't shutdown the existing cluster
         )
 
         # Submit the job
@@ -107,16 +107,13 @@ def assert_rayjob_submit_against_existing_cluster(
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")
 
         # Monitor the job status until completion
-        self.monitor_rayjob_completion(
-            rayjob, timeout=360
-        ) # 6 minutes for faster debugging
+        self.monitor_rayjob_completion(rayjob, timeout=900)
 
         print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")
 
-    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
+    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
         """
         Monitor a RayJob until it completes or fails.
-
         Args:
             rayjob: The RayJob instance to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
@@ -134,209 +131,11 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
                 print(f"✅ RayJob '{rayjob.name}' completed successfully!")
                 return
             elif status == CodeflareRayJobStatus.FAILED:
-                # Get more details about the failure
-                print(f"❌ RayJob '{rayjob.name}' failed! Investigating...")
-
-                # Try to get failure details using kubectl
-                import subprocess
-
-                try:
-                    result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "rayjobs",
-                            "-n",
-                            self.namespace,
-                            rayjob.name,
-                            "-o",
-                            "yaml",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if result.returncode == 0:
-                        print(f"📋 RayJob YAML details:\n{result.stdout}")
-
-                    # Try to get job submitter pod logs (these pods may be cleaned up quickly)
-                    pod_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "pods",
-                            "-n",
-                            self.namespace,
-                            "-l",
-                            f"ray.io/rayjob={rayjob.name}",
-                            "-o",
-                            "name",
-                            "--sort-by=.metadata.creationTimestamp",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if pod_result.returncode == 0 and pod_result.stdout.strip():
-                        pod_name = pod_result.stdout.strip().split("/")[-1]
-                        log_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "logs",
-                                "-n",
-                                self.namespace,
-                                pod_name,
-                                "--tail=50",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if log_result.returncode == 0:
-                            print(f"📝 Pod logs for {pod_name}:\n{log_result.stdout}")
-                        else:
-                            print(f"❌ Could not get pod logs: {log_result.stderr}")
-                    else:
-                        print(f"❌ Could not find pods for RayJob: {pod_result.stderr}")
-
-                    # Also try to get events related to the RayJob
-                    events_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "events",
-                            "-n",
-                            self.namespace,
-                            "--field-selector",
-                            f"involvedObject.name={rayjob.name}",
-                            "-o",
-                            "wide",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if events_result.returncode == 0 and events_result.stdout.strip():
-                        print(f"📅 Events for RayJob:\n{events_result.stdout}")
-
-                except Exception as e:
-                    print(f"❌ Error getting failure details: {e}")
-
                 raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
             elif status == CodeflareRayJobStatus.RUNNING:
                 print(f"🏃 RayJob '{rayjob.name}' is still running...")
             elif status == CodeflareRayJobStatus.UNKNOWN:
-                print(f"❓ RayJob '{rayjob.name}' status is unknown - investigating...")
-
-                # If we've been in Unknown status for too long, get debug info
-                if elapsed_time > 120: # After 2 minutes of Unknown status
-                    print(
-                        f"⚠️ Job has been in Unknown status for {elapsed_time}s - getting debug info..."
-                    )
-
-                    # Get detailed YAML to understand why status is Unknown
-                    import subprocess
-
-                    try:
-                        result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "rayjobs",
-                                "-n",
-                                self.namespace,
-                                rayjob.name,
-                                "-o",
-                                "yaml",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if result.returncode == 0:
-                            print(
-                                f"📋 RayJob YAML (Unknown status debug):\n{result.stdout}"
-                            )
-
-                        # Also check for job pods that might be stuck
-                        job_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "-l",
-                                f"ray.io/group=rayjob",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if job_pods_result.returncode == 0:
-                            print(f"🔍 RayJob-related pods:\n{job_pods_result.stdout}")
-
-                        # Check for any pending pods in the namespace
-                        pending_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "--field-selector=status.phase=Pending",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            pending_pods_result.returncode == 0
-                            and pending_pods_result.stdout.strip()
-                        ):
-                            print(
-                                f"⏸️ Pending pods in namespace:\n{pending_pods_result.stdout}"
-                            )
-
-                        # Get events for the entire namespace to see scheduling issues
-                        namespace_events_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "events",
-                                "-n",
-                                self.namespace,
-                                "--sort-by=.metadata.creationTimestamp",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            namespace_events_result.returncode == 0
-                            and namespace_events_result.stdout.strip()
-                        ):
-                            print(
-                                f"📅 Recent namespace events:\n{namespace_events_result.stdout}"
-                            )
-
-                    except Exception as e:
-                        print(f"❌ Error getting debug info: {e}")
-
-                # Break out of Unknown status loop after 4 minutes
-                if elapsed_time > 240:
-                    print(
-                        f"⏰ Breaking out of Unknown status loop after {elapsed_time}s"
-                    )
-                    raise AssertionError(
-                        f"❌ RayJob '{rayjob.name}' stuck in Unknown status for too long"
-                    )
+                print(f"❓ RayJob '{rayjob.name}' status is unknown")
 
             # Wait before next check
             sleep(check_interval)
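
The failure-investigation logic deleted above is just a series of kubectl calls, so if it is ever needed again it can live outside the monitor loop as a standalone helper. Below is a minimal sketch of that idea, using the same kubectl commands that appear in the removed lines; the function name and signature are illustrative assumptions and are not part of this commit or of the SDK, and it only assumes kubectl is on PATH:

import subprocess


def dump_rayjob_failure_details(namespace: str, rayjob_name: str) -> None:
    """Print the RayJob YAML, submitter-pod logs, and related events via kubectl (sketch)."""

    def _run(args):
        # Thin wrapper mirroring the subprocess.run settings used in the removed code.
        return subprocess.run(args, capture_output=True, text=True, timeout=10)

    try:
        result = _run(["kubectl", "get", "rayjobs", "-n", namespace, rayjob_name, "-o", "yaml"])
        if result.returncode == 0:
            print(f"📋 RayJob YAML details:\n{result.stdout}")

        # Submitter pods may be cleaned up quickly, so this can legitimately find nothing.
        pod_result = _run(
            ["kubectl", "get", "pods", "-n", namespace,
             "-l", f"ray.io/rayjob={rayjob_name}",
             "-o", "name", "--sort-by=.metadata.creationTimestamp"]
        )
        if pod_result.returncode == 0 and pod_result.stdout.strip():
            pod_name = pod_result.stdout.strip().split("/")[-1]
            log_result = _run(["kubectl", "logs", "-n", namespace, pod_name, "--tail=50"])
            if log_result.returncode == 0:
                print(f"📝 Pod logs for {pod_name}:\n{log_result.stdout}")

        # Events scoped to the RayJob object, as in the removed code.
        events_result = _run(
            ["kubectl", "get", "events", "-n", namespace,
             "--field-selector", f"involvedObject.name={rayjob_name}", "-o", "wide"]
        )
        if events_result.returncode == 0 and events_result.stdout.strip():
            print(f"📅 Events for RayJob:\n{events_result.stdout}")
    except Exception as e:
        print(f"❌ Error getting failure details: {e}")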

tests/e2e/rayjob_existing_cluster_oauth_test.py

Lines changed: 6 additions & 31 deletions

@@ -90,10 +90,6 @@ def run_rayjob_against_existing_cluster_oauth(self):
         ), f"Job submission failed, expected {job_name}, got {submission_result}"
         print(f"✅ Successfully submitted RayJob '{job_name}'")
 
-        # Wait a moment for the RayJob resource to be created in Kubernetes
-        print("⏳ Waiting for RayJob resource to be processed by KubeRay operator...")
-        sleep(5)
-
         # Monitor the job status until completion
         self.monitor_rayjob_completion(rayjob)
 
@@ -104,24 +100,18 @@ def run_rayjob_against_existing_cluster_oauth(self):
     def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
         """
         Monitor a RayJob until it completes or fails.
-
         Args:
             rayjob: The RayJob instance to monitor
-            timeout: Maximum time to wait in seconds (default: 5 minutes)
+            timeout: Maximum time to wait in seconds (default: 15 minutes)
         """
         print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")
 
         elapsed_time = 0
         check_interval = 10 # Check every 10 seconds
-        job_found = False # Track if we've seen the job at least once
 
         while elapsed_time < timeout:
            status, ready = rayjob.status(print_to_console=True)
 
-            # Track if we've found the job (not UNKNOWN status)
-            if status != CodeflareRayJobStatus.UNKNOWN:
-                job_found = True
-
             # Check if job has completed (either successfully or failed)
             if status == CodeflareRayJobStatus.COMPLETE:
                 print(f"✅ RayJob '{rayjob.name}' completed successfully!")
@@ -131,30 +121,15 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
             elif status == CodeflareRayJobStatus.RUNNING:
                 print(f"🏃 RayJob '{rayjob.name}' is still running...")
             elif status == CodeflareRayJobStatus.UNKNOWN:
-                if job_found:
-                    # If we've seen the job before but now it's unknown, that's concerning
-                    print(
-                        f"⚠️ RayJob '{rayjob.name}' status became unknown after being found"
-                    )
-                else:
-                    # Job hasn't appeared yet, this is normal initially
-                    print(
-                        f"⏳ Waiting for RayJob '{rayjob.name}' to appear in Kubernetes..."
-                    )
+                print(f"❓ RayJob '{rayjob.name}' status is unknown")
 
             # Wait before next check
             sleep(check_interval)
             elapsed_time += check_interval
 
         # If we reach here, the job has timed out
         final_status, _ = rayjob.status(print_to_console=True)
-        if not job_found:
-            raise TimeoutError(
-                f"⏰ RayJob '{rayjob.name}' was never found in Kubernetes within {timeout} seconds. "
-                f"Check if the RayJob resource was created successfully."
-            )
-        else:
-            raise TimeoutError(
-                f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
-                f"Final status: {final_status}"
-            )
+        raise TimeoutError(
+            f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
+            f"Final status: {final_status}"
+        )
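
Put together, the post-commit monitor loop in this file reads roughly as below. The hunks above cover nearly the whole method; the lines that fall between hunks (the return after COMPLETE and the FAILED branch), the class name, and the import paths are assumptions rather than content shown in this diff, and are marked as such in the comments:

from time import sleep

# Assumed import paths; the diff shows only the names RayJob and CodeflareRayJobStatus.
from codeflare_sdk import RayJob, CodeflareRayJobStatus


class TestRayJobExistingClusterOauth:  # illustrative class name, not taken from the diff

    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
        """Monitor a RayJob until it completes or fails (sketch of the post-commit shape)."""
        print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")

        elapsed_time = 0
        check_interval = 10  # Check every 10 seconds

        while elapsed_time < timeout:
            status, ready = rayjob.status(print_to_console=True)

            # Check if job has completed (either successfully or failed)
            if status == CodeflareRayJobStatus.COMPLETE:
                print(f"✅ RayJob '{rayjob.name}' completed successfully!")
                return
            elif status == CodeflareRayJobStatus.FAILED:
                # Assumed to mirror the kind-test variant; this branch sits between hunks.
                raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
            elif status == CodeflareRayJobStatus.RUNNING:
                print(f"🏃 RayJob '{rayjob.name}' is still running...")
            elif status == CodeflareRayJobStatus.UNKNOWN:
                print(f"❓ RayJob '{rayjob.name}' status is unknown")

            # Wait before next check
            sleep(check_interval)
            elapsed_time += check_interval

        # If we reach here, the job has timed out
        final_status, _ = rayjob.status(print_to_console=True)
        raise TimeoutError(
            f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
            f"Final status: {final_status}"
        )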
