@@ -90,6 +90,10 @@ def run_rayjob_against_existing_cluster_oauth(self):
90
90
), f"Job submission failed, expected { job_name } , got { submission_result } "
91
91
print (f"✅ Successfully submitted RayJob '{ job_name } '" )
92
92
93
+ # Wait a moment for the RayJob resource to be created in Kubernetes
94
+ print ("⏳ Waiting for RayJob resource to be processed by KubeRay operator..." )
95
+ sleep (5 )
96
+
93
97
# Monitor the job status until completion
94
98
self .monitor_rayjob_completion (rayjob )
95
99
@@ -103,16 +107,21 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
103
107
104
108
Args:
105
109
rayjob: The RayJob instance to monitor
106
- timeout: Maximum time to wait in seconds (default: 15 minutes)
110
+ timeout: Maximum time to wait in seconds (default: 5 minutes)
107
111
"""
108
112
print (f"⏳ Monitoring RayJob '{ rayjob .name } ' status..." )
109
113
110
114
elapsed_time = 0
111
115
check_interval = 10 # Check every 10 seconds
116
+ job_found = False # Track if we've seen the job at least once
112
117
113
118
while elapsed_time < timeout :
114
119
status , ready = rayjob .status (print_to_console = True )
115
120
121
+ # Track if we've found the job (not UNKNOWN status)
122
+ if status != CodeflareRayJobStatus .UNKNOWN :
123
+ job_found = True
124
+
116
125
# Check if job has completed (either successfully or failed)
117
126
if status == CodeflareRayJobStatus .COMPLETE :
118
127
print (f"✅ RayJob '{ rayjob .name } ' completed successfully!" )
@@ -122,15 +131,30 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
122
131
elif status == CodeflareRayJobStatus .RUNNING :
123
132
print (f"🏃 RayJob '{ rayjob .name } ' is still running..." )
124
133
elif status == CodeflareRayJobStatus .UNKNOWN :
125
- print (f"❓ RayJob '{ rayjob .name } ' status is unknown" )
134
+ if job_found :
135
+ # If we've seen the job before but now it's unknown, that's concerning
136
+ print (
137
+ f"⚠️ RayJob '{ rayjob .name } ' status became unknown after being found"
138
+ )
139
+ else :
140
+ # Job hasn't appeared yet, this is normal initially
141
+ print (
142
+ f"⏳ Waiting for RayJob '{ rayjob .name } ' to appear in Kubernetes..."
143
+ )
126
144
127
145
# Wait before next check
128
146
sleep (check_interval )
129
147
elapsed_time += check_interval
130
148
131
149
# If we reach here, the job has timed out
132
150
final_status , _ = rayjob .status (print_to_console = True )
133
- raise TimeoutError (
134
- f"⏰ RayJob '{ rayjob .name } ' did not complete within { timeout } seconds. "
135
- f"Final status: { final_status } "
136
- )
151
+ if not job_found :
152
+ raise TimeoutError (
153
+ f"⏰ RayJob '{ rayjob .name } ' was never found in Kubernetes within { timeout } seconds. "
154
+ f"Check if the RayJob resource was created successfully."
155
+ )
156
+ else :
157
+ raise TimeoutError (
158
+ f"⏰ RayJob '{ rayjob .name } ' did not complete within { timeout } seconds. "
159
+ f"Final status: { final_status } "
160
+ )
0 commit comments