Skip to content

Commit a3ab7e7

Browse files
committed
fix: test
1 parent b114a5b commit a3ab7e7

File tree

3 files changed

+62
-11
lines changed

3 files changed

+62
-11
lines changed

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.languageServer": "None"
3+
}

tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 29 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,10 @@ def assert_rayjob_submit_against_existing_cluster(
105105
), f"Job submission failed, expected {job_name}, got {submission_result}"
106106
print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")
107107

108+
# Wait a moment for the RayJob resource to be created in Kubernetes
109+
print("⏳ Waiting for RayJob resource to be processed by KubeRay operator...")
110+
sleep(5)
111+
108112
# Monitor the job status until completion
109113
self.monitor_rayjob_completion(rayjob, timeout=900)
110114

@@ -122,10 +126,15 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
122126

123127
elapsed_time = 0
124128
check_interval = 10 # Check every 10 seconds
129+
job_found = False # Track if we've seen the job at least once
125130

126131
while elapsed_time < timeout:
127132
status, ready = rayjob.status(print_to_console=True)
128133

134+
# Track if we've found the job (not UNKNOWN status)
135+
if status != CodeflareRayJobStatus.UNKNOWN:
136+
job_found = True
137+
129138
# Check if job has completed (either successfully or failed)
130139
if status == CodeflareRayJobStatus.COMPLETE:
131140
print(f"✅ RayJob '{rayjob.name}' completed successfully!")
@@ -135,15 +144,30 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
135144
elif status == CodeflareRayJobStatus.RUNNING:
136145
print(f"🏃 RayJob '{rayjob.name}' is still running...")
137146
elif status == CodeflareRayJobStatus.UNKNOWN:
138-
print(f"❓ RayJob '{rayjob.name}' status is unknown")
147+
if job_found:
148+
# If we've seen the job before but now it's unknown, that's concerning
149+
print(
150+
f"⚠️ RayJob '{rayjob.name}' status became unknown after being found"
151+
)
152+
else:
153+
# Job hasn't appeared yet, this is normal initially
154+
print(
155+
f"⏳ Waiting for RayJob '{rayjob.name}' to appear in Kubernetes..."
156+
)
139157

140158
# Wait before next check
141159
sleep(check_interval)
142160
elapsed_time += check_interval
143161

144162
# If we reach here, the job has timed out
145163
final_status, _ = rayjob.status(print_to_console=True)
146-
raise TimeoutError(
147-
f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
148-
f"Final status: {final_status}"
149-
)
164+
if not job_found:
165+
raise TimeoutError(
166+
f"⏰ RayJob '{rayjob.name}' was never found in Kubernetes within {timeout} seconds. "
167+
f"Check if the RayJob resource was created successfully."
168+
)
169+
else:
170+
raise TimeoutError(
171+
f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
172+
f"Final status: {final_status}"
173+
)

tests/e2e/rayjob_existing_cluster_oauth_test.py

Lines changed: 30 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,10 @@ def run_rayjob_against_existing_cluster_oauth(self):
9090
), f"Job submission failed, expected {job_name}, got {submission_result}"
9191
print(f"✅ Successfully submitted RayJob '{job_name}'")
9292

93+
# Wait a moment for the RayJob resource to be created in Kubernetes
94+
print("⏳ Waiting for RayJob resource to be processed by KubeRay operator...")
95+
sleep(5)
96+
9397
# Monitor the job status until completion
9498
self.monitor_rayjob_completion(rayjob)
9599

@@ -103,16 +107,21 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
103107
104108
Args:
105109
rayjob: The RayJob instance to monitor
106-
timeout: Maximum time to wait in seconds (default: 15 minutes)
110+
timeout: Maximum time to wait in seconds (default: 5 minutes)
107111
"""
108112
print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")
109113

110114
elapsed_time = 0
111115
check_interval = 10 # Check every 10 seconds
116+
job_found = False # Track if we've seen the job at least once
112117

113118
while elapsed_time < timeout:
114119
status, ready = rayjob.status(print_to_console=True)
115120

121+
# Track if we've found the job (not UNKNOWN status)
122+
if status != CodeflareRayJobStatus.UNKNOWN:
123+
job_found = True
124+
116125
# Check if job has completed (either successfully or failed)
117126
if status == CodeflareRayJobStatus.COMPLETE:
118127
print(f"✅ RayJob '{rayjob.name}' completed successfully!")
@@ -122,15 +131,30 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
122131
elif status == CodeflareRayJobStatus.RUNNING:
123132
print(f"🏃 RayJob '{rayjob.name}' is still running...")
124133
elif status == CodeflareRayJobStatus.UNKNOWN:
125-
print(f"❓ RayJob '{rayjob.name}' status is unknown")
134+
if job_found:
135+
# If we've seen the job before but now it's unknown, that's concerning
136+
print(
137+
f"⚠️ RayJob '{rayjob.name}' status became unknown after being found"
138+
)
139+
else:
140+
# Job hasn't appeared yet, this is normal initially
141+
print(
142+
f"⏳ Waiting for RayJob '{rayjob.name}' to appear in Kubernetes..."
143+
)
126144

127145
# Wait before next check
128146
sleep(check_interval)
129147
elapsed_time += check_interval
130148

131149
# If we reach here, the job has timed out
132150
final_status, _ = rayjob.status(print_to_console=True)
133-
raise TimeoutError(
134-
f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
135-
f"Final status: {final_status}"
136-
)
151+
if not job_found:
152+
raise TimeoutError(
153+
f"⏰ RayJob '{rayjob.name}' was never found in Kubernetes within {timeout} seconds. "
154+
f"Check if the RayJob resource was created successfully."
155+
)
156+
else:
157+
raise TimeoutError(
158+
f"⏰ RayJob '{rayjob.name}' did not complete within {timeout} seconds. "
159+
f"Final status: {final_status}"
160+
)

0 commit comments

Comments
 (0)