Skip to content

Commit b990433

Browse files
committed
fix: test
1 parent 3cb800f commit b990433

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,73 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
132132
print(f"✅ RayJob '{rayjob.name}' completed successfully!")
133133
return
134134
elif status == CodeflareRayJobStatus.FAILED:
135+
# Get more details about the failure
136+
print(f"❌ RayJob '{rayjob.name}' failed! Investigating...")
137+
138+
# Try to get failure details using kubectl
139+
import subprocess
140+
141+
try:
142+
result = subprocess.run(
143+
[
144+
"kubectl",
145+
"get",
146+
"rayjobs",
147+
"-n",
148+
self.namespace,
149+
rayjob.name,
150+
"-o",
151+
"yaml",
152+
],
153+
capture_output=True,
154+
text=True,
155+
timeout=10,
156+
)
157+
if result.returncode == 0:
158+
print(f"📋 RayJob YAML details:\n{result.stdout}")
159+
160+
# Also try to get pod logs
161+
pod_result = subprocess.run(
162+
[
163+
"kubectl",
164+
"get",
165+
"pods",
166+
"-n",
167+
self.namespace,
168+
"-l",
169+
f"ray.io/rayjob={rayjob.name}",
170+
"-o",
171+
"name",
172+
],
173+
capture_output=True,
174+
text=True,
175+
timeout=10,
176+
)
177+
if pod_result.returncode == 0 and pod_result.stdout.strip():
178+
pod_name = pod_result.stdout.strip().split("/")[-1]
179+
log_result = subprocess.run(
180+
[
181+
"kubectl",
182+
"logs",
183+
"-n",
184+
self.namespace,
185+
pod_name,
186+
"--tail=50",
187+
],
188+
capture_output=True,
189+
text=True,
190+
timeout=10,
191+
)
192+
if log_result.returncode == 0:
193+
print(f"📝 Pod logs for {pod_name}:\n{log_result.stdout}")
194+
else:
195+
print(f"❌ Could not get pod logs: {log_result.stderr}")
196+
else:
197+
print(f"❌ Could not find pods for RayJob: {pod_result.stderr}")
198+
199+
except Exception as e:
200+
print(f"❌ Error getting failure details: {e}")
201+
135202
raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
136203
elif status == CodeflareRayJobStatus.RUNNING:
137204
print(f"🏃 RayJob '{rayjob.name}' is still running...")

0 commit comments

Comments
 (0)