diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2
index cb66d3aa..025aa393 100644
--- a/nemo_run/run/ray/templates/ray.sub.j2
+++ b/nemo_run/run/ray/templates/ray.sub.j2
@@ -47,6 +47,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR={{ cluster_dir }}
 mkdir -p $CLUSTER_DIR
@@ -208,9 +211,16 @@ EOF
 srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub
index ceaea0cc..7bcb8af0 100644
--- a/test/core/execution/artifacts/expected_ray_cluster.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster.sub
@@ -49,6 +49,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
 mkdir -p $CLUSTER_DIR
@@ -202,9 +205,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
index 983b62fc..5d16afdd 100644
--- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
@@ -50,6 +50,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
 mkdir -p $CLUSTER_DIR
@@ -207,9 +210,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))