diff --git a/src/nemo_run/core/execution/templates/slurm.sh.j2 b/src/nemo_run/core/execution/templates/slurm.sh.j2 index 5826fb4d..c47261fb 100644 --- a/src/nemo_run/core/execution/templates/slurm.sh.j2 +++ b/src/nemo_run/core/execution/templates/slurm.sh.j2 @@ -15,9 +15,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES={{max_retries}} -{%- for env_var in env_vars %} -{{env_var}} -{%- endfor %} set +e @@ -33,6 +30,10 @@ head_node=${nodes_array[0]} {%- endfor %} {% endif %} +{%- for env_var in env_vars %} +{{env_var}} +{%- endfor %} + {%- if setup_lines %} {{setup_lines}} {%- endif %} diff --git a/test/core/execution/artifacts/dummy_slurm.sh b/test/core/execution/artifacts/dummy_slurm.sh index 3c65ef02..1673f3f0 100644 --- a/test/core/execution/artifacts/dummy_slurm.sh +++ b/test/core/execution/artifacts/dummy_slurm.sh @@ -18,7 +18,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -29,6 +28,7 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +export ENV_VAR=value # Command 1 diff --git a/test/core/execution/artifacts/ft_het_slurm.sh b/test/core/execution/artifacts/ft_het_slurm.sh index e4563d42..176dd0db 100644 --- a/test/core/execution/artifacts/ft_het_slurm.sh +++ b/test/core/execution/artifacts/ft_het_slurm.sh @@ -26,7 +26,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -41,6 +40,7 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) het_group_host_0=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_0 | head -n1) het_group_host_1=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_1 | head -n1) +export ENV_VAR=value # This script uses experimental fault tolerance launcher # Fault tolerance related items export FAULT_TOL_CFG_PATH="/root/experiment/sample_job/sample_job_ft_cfg.yml" diff --git a/test/core/execution/artifacts/ft_slurm.sh b/test/core/execution/artifacts/ft_slurm.sh index d135b7b3..59b15123 100644 --- a/test/core/execution/artifacts/ft_slurm.sh +++ b/test/core/execution/artifacts/ft_slurm.sh @@ -18,7 +18,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -29,6 +28,7 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +export ENV_VAR=value # This script uses experimental fault tolerance launcher # Fault tolerance related items export FAULT_TOL_CFG_PATH="/root/sample_job/sample_job_ft_cfg.yml" diff --git a/test/core/execution/artifacts/group_resource_req_slurm.sh b/test/core/execution/artifacts/group_resource_req_slurm.sh index c90607e4..3bbde92e 100644 --- a/test/core/execution/artifacts/group_resource_req_slurm.sh +++ b/test/core/execution/artifacts/group_resource_req_slurm.sh @@ -20,7 +20,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -31,6 +30,7 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +export ENV_VAR=value # Command 1 diff --git a/test/core/execution/artifacts/group_slurm.sh b/test/core/execution/artifacts/group_slurm.sh index ceba96a0..00833abc 100644 --- a/test/core/execution/artifacts/group_slurm.sh +++ b/test/core/execution/artifacts/group_slurm.sh @@ -20,7 +20,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -31,6 +30,7 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +export ENV_VAR=value # Command 1 diff --git a/test/core/execution/artifacts/group_slurm_no_monitor.sh b/test/core/execution/artifacts/group_slurm_no_monitor.sh index eb47bde4..7e334777 100644 --- a/test/core/execution/artifacts/group_slurm_no_monitor.sh +++ b/test/core/execution/artifacts/group_slurm_no_monitor.sh @@ -20,7 +20,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -31,6 +30,7 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +export ENV_VAR=value # Command 1 diff --git a/test/core/execution/artifacts/het_slurm.sh b/test/core/execution/artifacts/het_slurm.sh index 73577ab8..9be58607 100644 --- a/test/core/execution/artifacts/het_slurm.sh +++ b/test/core/execution/artifacts/het_slurm.sh @@ -30,7 +30,6 @@ set -evx export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 export TORCHX_MAX_RETRIES=3 -export ENV_VAR=value set +e @@ -45,6 +44,7 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) het_group_host_0=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_0 | head -n1) het_group_host_1=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_1 | head -n1) +export ENV_VAR=value # Command 1