diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2
index cb66d3aa..025aa393 100644
--- a/nemo_run/run/ray/templates/ray.sub.j2
+++ b/nemo_run/run/ray/templates/ray.sub.j2
@@ -47,6 +47,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR={{ cluster_dir }}
 mkdir -p $CLUSTER_DIR
@@ -208,9 +211,16 @@ EOF
 srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub
index ceaea0cc..7bcb8af0 100644
--- a/test/core/execution/artifacts/expected_ray_cluster.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster.sub
@@ -49,6 +49,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
 mkdir -p $CLUSTER_DIR
@@ -202,9 +205,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
index 983b62fc..5d16afdd 100644
--- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
@@ -50,6 +50,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
 mkdir -p $CLUSTER_DIR
@@ -207,9 +210,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+    if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+        echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+        touch $LOG_DIR/ENDED
+        exit 1
+    fi
+    echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
     sleep 2
+    elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))