Skip to content

Commit d799b0e

Browse files
authored
Update ray_enroot template (#406)
1 parent 9b60d5d commit d799b0e

2 files changed

Lines changed: 10 additions & 4 deletions

File tree

nemo_run/run/ray/templates/ray_enroot.sub.j2

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ rm -f $LOG_DIR/ENDED
9999
# Number of GPUs per node
100100
gpus_per_node=8
101101

102+
# Number of CPUs allocated per Ray node (applied to both the head and worker srun tasks below)
103+
CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}
104+
102105
num_retries={{ num_retries }}
103106

104107
# Track backgrounded srun client PIDs for head and workers
@@ -279,7 +282,7 @@ touch $LOG_DIR/ENDED
279282
exit 1
280283
EOF
281284
)
282-
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
285+
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
283286
SRUN_PIDS["ray-head"]=$!
284287

285288
# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -380,7 +383,7 @@ EOF
380383
if [[ $i -eq 0 ]]; then
381384
OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
382385
fi
383-
srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
386+
srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
384387
SRUN_PIDS["ray-worker-$i"]=$!
385388
sleep 3
386389
done

test/core/execution/artifacts/expected_ray_cluster_enroot.sub

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ rm -f $LOG_DIR/ENDED
101101
# Number of GPUs per node
102102
gpus_per_node=8
103103

104+
# Number of CPUs allocated per Ray node (applied to both the head and worker srun tasks below)
105+
CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}
106+
104107
num_retries=1
105108

106109
# Track backgrounded srun client PIDs for head and workers
@@ -273,7 +276,7 @@ touch $LOG_DIR/ENDED
273276
exit 1
274277
EOF
275278
)
276-
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
279+
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
277280
SRUN_PIDS["ray-head"]=$!
278281

279282
# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -370,7 +373,7 @@ EOF
370373
if [[ $i -eq 0 ]]; then
371374
OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
372375
fi
373-
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
376+
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
374377
SRUN_PIDS["ray-worker-$i"]=$!
375378
sleep 3
376379
done

0 commit comments

Comments
 (0)