@@ -46,11 +46,12 @@ def run_rayjob_against_existing_cluster_kind(
                 num_workers=1,
                 head_cpu_requests="500m",
                 head_cpu_limits="500m",
-                worker_cpu_requests=2,
-                worker_cpu_limits=4,
-                worker_memory_requests=4,
-                worker_memory_limits=8,
+                worker_cpu_requests="500m",
+                worker_cpu_limits=1,
+                worker_memory_requests=1,
+                worker_memory_limits=4,
                 worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
+                image="rayproject/ray:2.47.1",
                 write_to_file=True,
                 verify_tls=False,
             )
@@ -95,8 +96,7 @@ def assert_rayjob_submit_against_existing_cluster(
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
                 "env_vars": get_setup_env_variables(ACCELERATOR=accelerator),
             },
-            shutdown_after_job_finishes=False,
-            # entrypoint_num_gpus=number_of_gpus if number_of_gpus > 0 else None,  # Temporarily disabled to test basic functionality
+            shutdown_after_job_finishes=False,  # Don't shutdown the existing cluster
         )

         # Submit the job
@@ -107,16 +107,13 @@ def assert_rayjob_submit_against_existing_cluster(
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")

         # Monitor the job status until completion
-        self.monitor_rayjob_completion(
-            rayjob, timeout=360
-        )  # 6 minutes for faster debugging
+        self.monitor_rayjob_completion(rayjob, timeout=900)

         print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")

-    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
+    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
         """
         Monitor a RayJob until it completes or fails.
-
         Args:
             rayjob: The RayJob instance to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
@@ -134,209 +131,11 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
                 print(f"✅ RayJob '{rayjob.name}' completed successfully!")
                 return
             elif status == CodeflareRayJobStatus.FAILED:
-                # Get more details about the failure
-                print(f"❌ RayJob '{rayjob.name}' failed! Investigating...")
-
-                # Try to get failure details using kubectl
-                import subprocess
-
-                try:
-                    result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "rayjobs",
-                            "-n",
-                            self.namespace,
-                            rayjob.name,
-                            "-o",
-                            "yaml",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if result.returncode == 0:
-                        print(f"📋 RayJob YAML details:\n{result.stdout}")
-
-                    # Try to get job submitter pod logs (these pods may be cleaned up quickly)
-                    pod_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "pods",
-                            "-n",
-                            self.namespace,
-                            "-l",
-                            f"ray.io/rayjob={rayjob.name}",
-                            "-o",
-                            "name",
-                            "--sort-by=.metadata.creationTimestamp",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if pod_result.returncode == 0 and pod_result.stdout.strip():
-                        pod_name = pod_result.stdout.strip().split("/")[-1]
-                        log_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "logs",
-                                "-n",
-                                self.namespace,
-                                pod_name,
-                                "--tail=50",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if log_result.returncode == 0:
-                            print(f"📝 Pod logs for {pod_name}:\n{log_result.stdout}")
-                        else:
-                            print(f"❌ Could not get pod logs: {log_result.stderr}")
-                    else:
-                        print(f"❌ Could not find pods for RayJob: {pod_result.stderr}")
-
-                    # Also try to get events related to the RayJob
-                    events_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "events",
-                            "-n",
-                            self.namespace,
-                            "--field-selector",
-                            f"involvedObject.name={rayjob.name}",
-                            "-o",
-                            "wide",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if events_result.returncode == 0 and events_result.stdout.strip():
-                        print(f"📅 Events for RayJob:\n{events_result.stdout}")
-
-                except Exception as e:
-                    print(f"❌ Error getting failure details: {e}")
-
                 raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
             elif status == CodeflareRayJobStatus.RUNNING:
                 print(f"🏃 RayJob '{rayjob.name}' is still running...")
             elif status == CodeflareRayJobStatus.UNKNOWN:
-                print(f"❓ RayJob '{rayjob.name}' status is unknown - investigating...")
-
-                # If we've been in Unknown status for too long, get debug info
-                if elapsed_time > 120:  # After 2 minutes of Unknown status
-                    print(
-                        f"⚠️ Job has been in Unknown status for {elapsed_time}s - getting debug info..."
-                    )
-
-                    # Get detailed YAML to understand why status is Unknown
-                    import subprocess
-
-                    try:
-                        result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "rayjobs",
-                                "-n",
-                                self.namespace,
-                                rayjob.name,
-                                "-o",
-                                "yaml",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if result.returncode == 0:
-                            print(
-                                f"📋 RayJob YAML (Unknown status debug):\n{result.stdout}"
-                            )
-
-                        # Also check for job pods that might be stuck
-                        job_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "-l",
-                                f"ray.io/group=rayjob",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if job_pods_result.returncode == 0:
-                            print(f"🔍 RayJob-related pods:\n{job_pods_result.stdout}")
-
-                        # Check for any pending pods in the namespace
-                        pending_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "--field-selector=status.phase=Pending",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            pending_pods_result.returncode == 0
-                            and pending_pods_result.stdout.strip()
-                        ):
-                            print(
-                                f"⏸️ Pending pods in namespace:\n{pending_pods_result.stdout}"
-                            )
-
-                        # Get events for the entire namespace to see scheduling issues
-                        namespace_events_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "events",
-                                "-n",
-                                self.namespace,
-                                "--sort-by=.metadata.creationTimestamp",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            namespace_events_result.returncode == 0
-                            and namespace_events_result.stdout.strip()
-                        ):
-                            print(
-                                f"📅 Recent namespace events:\n{namespace_events_result.stdout}"
-                            )
-
-                    except Exception as e:
-                        print(f"❌ Error getting debug info: {e}")
-
-                    # Break out of Unknown status loop after 4 minutes
-                    if elapsed_time > 240:
-                        print(
-                            f"⏰ Breaking out of Unknown status loop after {elapsed_time}s"
-                        )
-                        raise AssertionError(
-                            f"❌ RayJob '{rayjob.name}' stuck in Unknown status for too long"
-                        )
+                print(f"❓ RayJob '{rayjob.name}' status is unknown")

             # Wait before next check
             sleep(check_interval)