Skip to content

Commit 5ea3e31

Browse files
committed
fix: test
1 parent b990433 commit 5ea3e31

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ jobs:
124124
pip install poetry
125125
poetry install --with test,docs
126126
echo "Running e2e tests..."
127-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
127+
# poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
128+
poetry run pytest -v -s ./tests/e2e/rayjob_existing_cluster_kind_test.py::TestRayJobExistingClusterKind::test_rayjob_ray_cluster_sdk_kind_nvidia_gpu > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
128129
env:
129130
GRPC_DNS_RESOLVER: "native"
130131

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def __init__(
5656
active_deadline_seconds: Optional[int] = None,
5757
entrypoint_num_cpus: Optional[int] = None,
5858
entrypoint_num_gpus: Optional[int] = None,
59+
backoff_limit: int = 3,
5960
):
6061
"""
6162
Initialize a RayJob instance.
@@ -104,6 +105,7 @@ def __init__(
104105
self.active_deadline_seconds = active_deadline_seconds
105106
self.entrypoint_num_cpus = entrypoint_num_cpus
106107
self.entrypoint_num_gpus = entrypoint_num_gpus
108+
self.backoff_limit = backoff_limit
107109

108110
# Auto-set shutdown_after_job_finishes based on cluster_config presence
109111
# If cluster_config is provided, we want to clean up the cluster after job finishes
@@ -186,6 +188,7 @@ def _build_rayjob_cr(self) -> Dict[str, Any]:
186188
"entrypoint": self.entrypoint,
187189
"shutdownAfterJobFinishes": self.shutdown_after_job_finishes,
188190
"ttlSecondsAfterFinished": self.ttl_seconds_after_finished,
191+
"backoffLimit": self.backoff_limit,
189192
},
190193
}
191194

tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
157157
if result.returncode == 0:
158158
print(f"📋 RayJob YAML details:\n{result.stdout}")
159159

160-
# Also try to get pod logs
160+
# Try to get job submitter pod logs (these pods may be cleaned up quickly)
161161
pod_result = subprocess.run(
162162
[
163163
"kubectl",
@@ -169,6 +169,7 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
169169
f"ray.io/rayjob={rayjob.name}",
170170
"-o",
171171
"name",
172+
"--sort-by=.metadata.creationTimestamp",
172173
],
173174
capture_output=True,
174175
text=True,
@@ -196,6 +197,26 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
196197
else:
197198
print(f"❌ Could not find pods for RayJob: {pod_result.stderr}")
198199

200+
# Also try to get events related to the RayJob
201+
events_result = subprocess.run(
202+
[
203+
"kubectl",
204+
"get",
205+
"events",
206+
"-n",
207+
self.namespace,
208+
"--field-selector",
209+
f"involvedObject.name={rayjob.name}",
210+
"-o",
211+
"wide",
212+
],
213+
capture_output=True,
214+
text=True,
215+
timeout=10,
216+
)
217+
if events_result.returncode == 0 and events_result.stdout.strip():
218+
print(f"📅 Events for RayJob:\n{events_result.stdout}")
219+
199220
except Exception as e:
200221
print(f"❌ Error getting failure details: {e}")
201222

0 commit comments

Comments
 (0)