@@ -105,9 +105,38 @@ def assert_rayjob_submit_against_existing_cluster(
105
105
), f"Job submission failed, expected { job_name } , got { submission_result } "
106
106
print (f"✅ Successfully submitted RayJob '{ job_name } ' against existing cluster" )
107
107
108
- # Wait a moment for the RayJob resource to be created in Kubernetes
109
- print ("⏳ Waiting for RayJob resource to be processed by KubeRay operator..." )
110
- sleep (5 )
108
+ # Debug: Check if RayJob resource was actually created
109
+ import subprocess
110
+ import time
111
+
112
+ print ("🔍 Checking if RayJob resource exists in Kubernetes..." )
113
+ for attempt in range (6 ): # Check for 30 seconds
114
+ try :
115
+ # Check if RayJob resource exists
116
+ result = subprocess .run (
117
+ ["kubectl" , "get" , "rayjobs" , "-n" , self .namespace , job_name ],
118
+ capture_output = True , text = True , timeout = 10
119
+ )
120
+ if result .returncode == 0 :
121
+ print (f"✅ RayJob resource '{ job_name } ' found in Kubernetes!" )
122
+ print (f"RayJob details:\n { result .stdout } " )
123
+ break
124
+ else :
125
+ print (f"❌ Attempt { attempt + 1 } : RayJob resource '{ job_name } ' not found" )
126
+ if attempt < 5 :
127
+ time .sleep (5 )
128
+ except Exception as e :
129
+ print (f"❌ Error checking RayJob: { e } " )
130
+
131
+ # Also check what RayJob resources exist in the namespace
132
+ try :
133
+ result = subprocess .run (
134
+ ["kubectl" , "get" , "rayjobs" , "-n" , self .namespace ],
135
+ capture_output = True , text = True , timeout = 10
136
+ )
137
+ print (f"📋 All RayJobs in namespace '{ self .namespace } ':\n { result .stdout } " )
138
+ except Exception as e :
139
+ print (f"❌ Error listing RayJobs: { e } " )
111
140
112
141
# Monitor the job status until completion
113
142
self .monitor_rayjob_completion (rayjob , timeout = 900 )
0 commit comments