Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion nemo_run/run/ray/templates/ray.sub.j2
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
# Number of seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}

# Timeout in seconds for Ray head node to start (default 10 minutes)
RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}

# Directory setup
export CLUSTER_DIR={{ cluster_dir }}
mkdir -p $CLUSTER_DIR
Expand Down Expand Up @@ -208,9 +211,16 @@ EOF
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &

# Wait for the head node container to start and for Ray to be ready.
#
# Each iteration probes the head node with two srun steps:
#   1. `test -f $LOG_DIR/STARTED_RAY_HEAD` - the marker file must exist.
#   2. `ray status --address $ip_head` run inside the ray-head container
#      (its stderr is discarded so failed probes do not flood the log).
# If either probe fails, log progress, sleep 2 seconds and retry.
#
# The loop gives up once elapsed_time reaches RAY_HEAD_START_TIMEOUT: it
# touches $LOG_DIR/ENDED and exits with status 1. elapsed_time only advances
# by the 2-second sleep, so time spent inside the srun probes themselves is
# not counted against the timeout.
elapsed_time=0
while ! (srun --overlap --nodes=1 --ntasks=1 -w "$head_node" test -f "$LOG_DIR/STARTED_RAY_HEAD" && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status --address "$ip_head" 2>/dev/null); do
    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
        touch "$LOG_DIR/ENDED"
        exit 1
    fi
    # Single progress line per attempt, including how much of the budget is used.
    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
    sleep 2
    elapsed_time=$((elapsed_time + 2))
done

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
Expand Down
12 changes: 11 additions & 1 deletion test/core/execution/artifacts/expected_ray_cluster.sub
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
# Number of seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}

# Timeout in seconds for Ray head node to start (default 10 minutes)
RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}

# Directory setup
export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
mkdir -p $CLUSTER_DIR
Expand Down Expand Up @@ -202,9 +205,16 @@ EOF
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &

# Wait for the head node container to start and for Ray to be ready.
#
# Each iteration probes the head node with two srun steps:
#   1. `test -f $LOG_DIR/STARTED_RAY_HEAD` - the marker file must exist.
#   2. `ray status --address $ip_head` run inside the ray-head container
#      (its stderr is discarded so failed probes do not flood the log).
# If either probe fails, log progress, sleep 2 seconds and retry.
#
# The loop gives up once elapsed_time reaches RAY_HEAD_START_TIMEOUT: it
# touches $LOG_DIR/ENDED and exits with status 1. elapsed_time only advances
# by the 2-second sleep, so time spent inside the srun probes themselves is
# not counted against the timeout.
elapsed_time=0
while ! (srun --overlap --nodes=1 --ntasks=1 -w "$head_node" test -f "$LOG_DIR/STARTED_RAY_HEAD" && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status --address "$ip_head" 2>/dev/null); do
    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
        touch "$LOG_DIR/ENDED"
        exit 1
    fi
    # Single progress line per attempt, including how much of the budget is used.
    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
    sleep 2
    elapsed_time=$((elapsed_time + 2))
done

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
Expand Down
12 changes: 11 additions & 1 deletion test/core/execution/artifacts/expected_ray_cluster_ssh.sub
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
# Number of seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}

# Timeout in seconds for Ray head node to start (default 10 minutes)
RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}

# Directory setup
export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
mkdir -p $CLUSTER_DIR
Expand Down Expand Up @@ -207,9 +210,16 @@ EOF
srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &

# Wait for the head node container to start and for Ray to be ready.
#
# Each iteration probes the head node with two srun steps:
#   1. `test -f $LOG_DIR/STARTED_RAY_HEAD` - the marker file must exist.
#   2. `ray status --address $ip_head` run inside the ray-head container
#      (its stderr is discarded so failed probes do not flood the log).
# If either probe fails, log progress, sleep 2 seconds and retry.
#
# The loop gives up once elapsed_time reaches RAY_HEAD_START_TIMEOUT: it
# touches $LOG_DIR/ENDED and exits with status 1. elapsed_time only advances
# by the 2-second sleep, so time spent inside the srun probes themselves is
# not counted against the timeout.
elapsed_time=0
while ! (srun --overlap --nodes=1 --ntasks=1 -w "$head_node" test -f "$LOG_DIR/STARTED_RAY_HEAD" && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status --address "$ip_head" 2>/dev/null); do
    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
        touch "$LOG_DIR/ENDED"
        exit 1
    fi
    # Single progress line per attempt, including how much of the budget is used.
    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
    sleep 2
    elapsed_time=$((elapsed_time + 2))
done

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
Expand Down
Loading