diff --git a/conf/common/test/nemo_run_llama3_8b_lora.toml b/conf/common/test/nemo_run_llama3_8b_lora.toml
new file mode 100644
index 000000000..3fa0d359e
--- /dev/null
+++ b/conf/common/test/nemo_run_llama3_8b_lora.toml
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b_lora"
+description = "nemo_run_llama3_8b_lora"
+test_template_name = "NeMoRun"
+
+extra_container_mounts = [
+  "/path/to/nemo_models:/path/to/nemo_models/nemo_models",
+]
+
+[cmd_args]
+docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
+task = "finetune"
+recipe_name = "llama3_8b"
+
+  [cmd_args.data]
+  micro_batch_size = 1
+  global_batch_size = 32
+  seq_length = 4096
+  force_redownload = true
+  packed_sequence_specs = "packed_sequence_data_lora"
+
+  [cmd_args.trainer]
+  max_steps = 100
+  val_check_interval = 100
+  num_nodes = 1
+  callbacks = "combined_callbacks_lora"
+  log_every_n_steps = 1
+
+  [cmd_args.trainer.strategy]
+  tensor_model_parallel_size = 1
+  pipeline_model_parallel_size = 1
+  context_parallel_size = 1
+
+  [cmd_args.trainer.plugins]
+  grad_reduce_in_fp32 = true
+
+
+  [cmd_args.optim]
+  config.lr = 1e-4
+  config.use_distributed_optimizer = false
+
+  [cmd_args.peft]
+  target_modules = "\"['linear_qkv']\""
+
+  [cmd_args.model.config]
+  seq_length = 4096
+
+[extra_env_vars]
+NCCL_P2P_NET_CHUNKSIZE = "2097152"
+NCCL_NVLS_ENABLE = "0"
+NVTE_DP_AMAX_REDUCE_INTERVAL = "0"
+NVTE_ASYNC_AMAX_REDUCTION = "1"
+NVTE_FUSED_ATTN = "1"
+NVTE_FLASH_ATTN = "1"
+NEMO_LOG_MEMORY_USAGE = "1"
+CUDA_DEVICE_MAX_CONNECTIONS = "1"
+NVTE_FWD_LAYERNORM_SM_MARGIN = "16"
+NVTE_BWD_LAYERNORM_SM_MARGIN = "16"
+NEMO_HOME = "/path/to/nemo/home"
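Note: the nested [cmd_args] tables above are flattened into the dotted overrides that the NeMoRun command generator appends after -y (for example trainer.strategy.tensor_model_parallel_size=1). A minimal sketch of that flattening, with a hypothetical helper name (the real logic lives in NeMoRunSlurmCommandGenStrategy):

    from typing import Any, Dict, List


    def flatten_overrides(args: Dict[str, Any], prefix: str = "") -> List[str]:
        # Illustration only: turn nested TOML tables such as
        # {"trainer": {"strategy": {...}}} into "a.b.c=value" overrides.
        parts: List[str] = []
        for key, value in args.items():
            dotted = f"{prefix}.{key}" if prefix else key
            if isinstance(value, dict):
                parts.extend(flatten_overrides(value, dotted))
            else:
                parts.append(f"{dotted}={value}")
        return parts


    print(flatten_overrides({"trainer": {"strategy": {"tensor_model_parallel_size": 1}}}))
    # ['trainer.strategy.tensor_model_parallel_size=1']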
diff --git a/conf/common/test_scenario/nemo_run_llama3_8b_lora.toml b/conf/common/test_scenario/nemo_run_llama3_8b_lora.toml
new file mode 100644
index 000000000..97496944c
--- /dev/null
+++ b/conf/common/test_scenario/nemo_run_llama3_8b_lora.toml
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b_lora"
+
+[[Tests]]
+id = "nemo_run_llama3_8b_lora"
+test_name = "nemo_run_llama3_8b_lora"
+num_nodes = "1"
+time_limit = "00:30:00"
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
index b687cf09b..9a01c9014 100644
--- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
@@ -234,7 +234,7 @@ def _gen_srun_command(
         return " ".join(srun_command_parts + nsys_command_parts + test_command_parts)
 
     def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]:
-        srun_command_parts = ["srun", f"--mpi={self.system.mpi}"]
+        srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
         if slurm_args.get("image_path"):
            srun_command_parts.append(f"--container-image={slurm_args['image_path']}")
         mounts = self.container_mounts(tr)
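Note: --export=ALL makes every generated srun pass the full sbatch environment to its tasks, so variables exported in the script preamble (including CLOUDAI_NEMO_TASK, introduced below) reach the container. A sketch of the invariant the updated reference files in tests/ref_data encode (assumed prefix shape only):

    def check_srun_prefix(parts: list[str]) -> None:
        # Every generated srun command now begins "srun --export=ALL --mpi=<impl>".
        assert parts[0] == "srun"
        assert parts[1] == "--export=ALL"  # newly inserted flag
        assert parts[2].startswith("--mpi=")


    check_srun_prefix(["srun", "--export=ALL", "--mpi=pmix"])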
diff --git a/src/cloudai/workloads/nemo_run/cloudai_nemorun.py b/src/cloudai/workloads/nemo_run/cloudai_nemorun.py
index 4f0defb0d..0a119c602 100644
--- a/src/cloudai/workloads/nemo_run/cloudai_nemorun.py
+++ b/src/cloudai/workloads/nemo_run/cloudai_nemorun.py
@@ -19,14 +19,19 @@
 import lightning.pytorch as pl
 import nemo_run as run
 import torch
-from lightning.pytorch.loggers import WandbLogger
 from megatron.core.distributed import DistributedDataParallelConfig
 from megatron.core.optimizer import OptimizerConfig
 from nemo import lightning as nl
 from nemo.collections import llm
 from nemo.collections.common.tokenizers.huggingface import AutoTokenizer
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
+from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
+from nemo.collections.llm.gpt.data.squad import SquadDataModule
+from nemo.collections.llm.gpt.model.nemotron import (
+    Nemotron4Config15B,
+    NemotronModel,
+)
+from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
 from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.lightning.pytorch.callbacks.nsys import NsysCallback
@@ -45,6 +50,12 @@ def hf_tokenizer() -> run.Config[AutoTokenizer]:
     )
 
 
+@run.cli.factory(target=MockDataModule, target_arg="tokenizer")
+@run.autoconvert
+def null_tokenizer() -> run.Config[AutoTokenizer]:
+    return run.Config(get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000)
+
+
 @run.cli.factory
 @run.autoconvert
 def timing_callback() -> run.Config[TimingCallback]:
@@ -74,27 +85,45 @@ def nsys_callbacks() -> list[pl.Callback]:
 
 @run.cli.factory
 @run.autoconvert
-def comms_overlap_callbacks() -> list[pl.Callback]:
+def comms_overlap_callbacks_lora() -> list[pl.Callback]:
     return [
         timing_callback(),
-        run.Config(
-            MegatronCommOverlapCallback,
-            overlap_param_gather_with_optimizer_step=False,
-        ),
+        run.Config(MegatronCommOverlapCallback, tp_comm_overlap=False),
+    ]
+
+
+@run.cli.factory
+@run.autoconvert
+def comms_overlap_callbacks_pretrain() -> list[pl.Callback]:
+    return [
+        timing_callback(),
+        run.Config(MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=False),
     ]
 
 
 @run.cli.factory
 @run.autoconvert
-def combined_callbacks() -> list[pl.Callback]:
+def combined_callbacks_lora() -> list[pl.Callback]:
     start_step = 5
     end_step = 10
     return [
         timing_callback(),
         run.Config(
             MegatronCommOverlapCallback,
-            overlap_param_gather_with_optimizer_step=False,
+            tp_comm_overlap=False,
         ),
+        run.Config(GarbageCollectionCallback, gc_interval_train=start_step, gc_interval_val=end_step),
+    ]
+
+
+@run.cli.factory
+@run.autoconvert
+def combined_callbacks_pretrain() -> list[pl.Callback]:
+    start_step = 5
+    end_step = 10
+    return [
+        timing_callback(),
+        run.Config(MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=False),
         run.Config(
             NsysCallback,
             start_step=start_step,
@@ -104,17 +133,24 @@ def combined_callbacks() -> list[pl.Callback]:
     ]
 
 
+@run.cli.factory(target=SquadDataModule, target_arg="packed_sequence_specs")
+@run.autoconvert
+def packed_sequence_data_lora() -> run.Config[PackedSequenceSpecs]:
+    return run.Config(PackedSequenceSpecs, pad_cu_seqlens=False, packed_sequence_size=4096)
+
+
 @run.cli.factory(target=llm.pretrain)
 @run.autoconvert
 def cloudai_recipe() -> run.Partial:
     recipe = run.Partial(
         llm.pretrain,
-        model=run.Config(LlamaModel, config=run.Config(Llama3Config8B)),
+        model=run.Config(NemotronModel, config=run.Config(Nemotron4Config15B)),
         data=run.Config(
             MockDataModule,
             seq_length=2048,
             micro_batch_size=4,
             global_batch_size=8,
+            tokenizer=null_tokenizer(),
         ),
         trainer=run.Config(
             nl.Trainer,
@@ -145,7 +181,6 @@ def cloudai_recipe() -> run.Partial:
             val_check_interval=1000,
             max_epochs=10,
         ),
-        log=nl.NeMoLogger(wandb=(WandbLogger() if "WANDB_API_KEY" in os.environ else None)),
         optim=run.Config(
             nl.MegatronOptimizerModule,
             config=run.Config(
@@ -169,4 +204,11 @@
 
 
 if __name__ == "__main__":
-    run.cli.main(fn=llm.pretrain)
+    mode = os.getenv("CLOUDAI_NEMO_TASK")
+    print(f"Running in mode {mode}")
+    if mode == "pretrain":
+        run.cli.main(fn=llm.pretrain)
+    elif mode == "finetune":
+        run.cli.main(fn=llm.finetune)
+    else:
+        raise ValueError(f"Unknown mode {mode}")
diff --git a/src/cloudai/workloads/nemo_run/nemo_run.py b/src/cloudai/workloads/nemo_run/nemo_run.py
index 8d789ad55..07b7d2600 100644
--- a/src/cloudai/workloads/nemo_run/nemo_run.py
+++ b/src/cloudai/workloads/nemo_run/nemo_run.py
@@ -73,14 +73,23 @@ class LogCkpt(BaseModel):
 
     model_config = ConfigDict(extra="allow")
 
-    save_on_train_epoch_end: bool = Field(default=False)
-    save_last: bool = Field(default=False)
+    save_on_train_epoch_end: Optional[bool] = Field(default=None)
+    save_last: Optional[bool] = Field(default=None)
+
+
+class LogTensorboard(BaseModel):
+    """Logging tensorboard configuration for NeMoRun."""
+
+    model_config = ConfigDict(extra="allow")
+    save_dir: Union[str, Path] = Field(default="logs")
+    name: Optional[str] = Field(default="default")
 
 
 class Log(BaseModel):
     """Base logging configuration for NeMoRun."""
 
-    ckpt: LogCkpt = Field(default_factory=LogCkpt)
+    ckpt: Optional[LogCkpt] = Field(default=None)
+    tensorboard: Optional[LogTensorboard] = Field(default=None)
 
     model_config = ConfigDict(extra="allow")
 
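Note: switching these fields to Optional with None defaults changes command generation: None now means "emit no override and keep the recipe default", while the old bool defaults always forced log.ckpt.* flags onto the command line (which is why the reference sbatch files and tests below drop them). A sketch of that filtering with a hypothetical helper; the real logic lives in the command generation strategy:

    from typing import Optional

    from pydantic import BaseModel


    class LogCkpt(BaseModel):
        save_on_train_epoch_end: Optional[bool] = None
        save_last: Optional[bool] = None


    def ckpt_overrides(ckpt: LogCkpt) -> list[str]:
        # Only explicitly set fields become CLI overrides.
        return [f"log.ckpt.{k}={v}" for k, v in ckpt.model_dump(exclude_none=True).items()]


    print(ckpt_overrides(LogCkpt()))                 # []
    print(ckpt_overrides(LogCkpt(save_last=False)))  # ['log.ckpt.save_last=False']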
diff --git a/src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py
index 3e4bf271b..06a3e598a 100644
--- a/src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py
@@ -35,6 +35,9 @@ def _parse_slurm_args(
         cmd_args: Dict[str, Union[str, List[str]]],
         tr: TestRun,
     ) -> Dict[str, Any]:
+        cloudai_nemo_task = cmd_args.get("task", "")
+        env_vars["CLOUDAI_NEMO_TASK"] = f"{cloudai_nemo_task}"
+
         base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)
 
         tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
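Note: end to end, the task plumbing is: the test TOML sets task = "finetune"; _parse_slurm_args above copies it into env_vars as CLOUDAI_NEMO_TASK; the sbatch preamble exports it; srun --export=ALL carries it into the container; cloudai_nemorun.py dispatches on it. A condensed sketch of the chain (wiring simplified, names as in the diff):

    import os

    cmd_args = {"task": "finetune"}  # from [cmd_args] in the test TOML
    env_vars = {"CLOUDAI_NEMO_TASK": f"{cmd_args.get('task', '')}"}

    os.environ.update(env_vars)  # stand-in for the sbatch export plus srun --export=ALL
    mode = os.getenv("CLOUDAI_NEMO_TASK")
    if mode not in ("pretrain", "finetune"):  # mirrors the dispatch in cloudai_nemorun.py
        raise ValueError(f"Unknown mode {mode}")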
-c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh -srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0) PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) if [ $PRE_TEST_SUCCESS -eq 1 ]; then @@ -27,4 +27,4 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then --container-name=cont \ --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \ /opt/paxml/workspace/run.sh -fi +fi \ No newline at end of file diff --git a/tests/ref_data/grok-no-hook.sbatch b/tests/ref_data/grok-no-hook.sbatch index 3f69ba52b..1831c8e0c 100644 --- a/tests/ref_data/grok-no-hook.sbatch +++ b/tests/ref_data/grok-no-hook.sbatch @@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true 
diff --git a/tests/ref_data/grok-no-hook.sbatch b/tests/ref_data/grok-no-hook.sbatch
index 3f69ba52b..1831c8e0c 100644
--- a/tests/ref_data/grok-no-hook.sbatch
+++ b/tests/ref_data/grok-no-hook.sbatch
@@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
 export COMBINE_THRESHOLD=1
 export PER_GPU_COMBINE_THRESHOLD=0
 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false"
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
 echo "Loading container with srun command"
 srun --mpi=none --container-image=https://docker/url --container-name=cont true
@@ -22,4 +22,4 @@ echo "Loading container with srun command"
   -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \
   --container-name=cont \
   --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
-  /opt/paxml/workspace/run.sh
+  /opt/paxml/workspace/run.sh
\ No newline at end of file
diff --git a/tests/ref_data/grok-pre-test.sbatch b/tests/ref_data/grok-pre-test.sbatch
index 72c363de1..360bce4af 100644
--- a/tests/ref_data/grok-pre-test.sbatch
+++ b/tests/ref_data/grok-pre-test.sbatch
@@ -7,11 +7,11 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
 export COMBINE_THRESHOLD=1
 export PER_GPU_COMBINE_THRESHOLD=0
 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false"
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
+srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
 SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0)
 PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )
 if [ $PRE_TEST_SUCCESS -eq 1 ]; then
@@ -27,4 +27,4 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then
   --container-name=cont \
   --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
   /opt/paxml/workspace/run.sh
-fi
+fi
\ No newline at end of file
diff --git a/tests/ref_data/megatron-run.sbatch b/tests/ref_data/megatron-run.sbatch
index 4e1249f5c..76596bf29 100644
--- a/tests/ref_data/megatron-run.sbatch
+++ b/tests/ref_data/megatron-run.sbatch
@@ -7,8 +7,8 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --recompute-activations --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__/save --load __CLOUDAI_DIR__/load --tokenizer-model __CLOUDAI_DIR__/model.m
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --recompute-activations --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__/save --load __CLOUDAI_DIR__/load --tokenizer-model __CLOUDAI_DIR__/model.m
\ No newline at end of file
diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch
index 214f234ec..691ff0f11 100644
--- a/tests/ref_data/nccl.sbatch
+++ b/tests/ref_data/nccl.sbatch
@@ -7,8 +7,8 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
\ No newline at end of file
diff --git a/tests/ref_data/nemo-run-no-hook.sbatch b/tests/ref_data/nemo-run-no-hook.sbatch
index 99b20eccd..887812bd1 100644
--- a/tests/ref_data/nemo-run-no-hook.sbatch
+++ b/tests/ref_data/nemo-run-no-hook.sbatch
@@ -6,9 +6,9 @@
 #SBATCH --partition=main
 
 export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+export CLOUDAI_NEMO_TASK=pretrain
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
-
-srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install python /cloudai_workspace/cloudai_nemorun.py --factory llama_3b -y trainer.max_steps=100 trainer.val_check_interval=1000 trainer.num_nodes=1 trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.pipeline_model_parallel_size=1 trainer.strategy.context_parallel_size=2 log.ckpt.save_on_train_epoch_end=False log.ckpt.save_last=False data.micro_batch_size=1 data.global_batch_size=1
\ No newline at end of file
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install python /cloudai_workspace/cloudai_nemorun.py --factory llama_3b -y trainer.max_steps=100 trainer.val_check_interval=1000 trainer.num_nodes=1 trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.pipeline_model_parallel_size=1 trainer.strategy.context_parallel_size=2 data.micro_batch_size=1 data.global_batch_size=1
\ No newline at end of file
diff --git a/tests/ref_data/nemo-run-pre-test.sbatch b/tests/ref_data/nemo-run-pre-test.sbatch
index 2dfc2146c..c1a859e8e 100644
--- a/tests/ref_data/nemo-run-pre-test.sbatch
+++ b/tests/ref_data/nemo-run-pre-test.sbatch
@@ -6,14 +6,14 @@
 #SBATCH --partition=main
 
 export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+export CLOUDAI_NEMO_TASK=pretrain
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
-
-srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
+srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
 SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0)
 PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )
 if [ $PRE_TEST_SUCCESS -eq 1 ]; then
-    srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install python /cloudai_workspace/cloudai_nemorun.py --factory llama_3b -y trainer.max_steps=100 trainer.val_check_interval=1000 trainer.num_nodes=1 trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.pipeline_model_parallel_size=1 trainer.strategy.context_parallel_size=2 log.ckpt.save_on_train_epoch_end=False log.ckpt.save_last=False data.micro_batch_size=1 data.global_batch_size=1
-fi
+    srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__CLOUDAI_DIR__/src/cloudai/workloads/nemo_run:/cloudai_workspace,__OUTPUT_DIR__/output/nemorun-workspace:/workspace,__OUTPUT_DIR__/install:/cloudai_install python /cloudai_workspace/cloudai_nemorun.py --factory llama_3b -y trainer.max_steps=100 trainer.val_check_interval=1000 trainer.num_nodes=1 trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.pipeline_model_parallel_size=1 trainer.strategy.context_parallel_size=2 data.micro_batch_size=1 data.global_batch_size=1
+fi
\ No newline at end of file
diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch
index 4a969bdf7..5293b6afb 100644
--- a/tests/ref_data/sleep.sbatch
+++ b/tests/ref_data/sleep.sbatch
@@ -7,8 +7,8 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 
-srun --mpi=pmix --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix sleep 5
+srun --export=ALL --mpi=pmix sleep 5
\ No newline at end of file
diff --git a/tests/ref_data/slurm_container.sbatch b/tests/ref_data/slurm_container.sbatch
index c623e4542..8d7f9e9aa 100644
--- a/tests/ref_data/slurm_container.sbatch
+++ b/tests/ref_data/slurm_container.sbatch
@@ -7,8 +7,8 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
 
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home bash -c "pwd ; ls"
+srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --no-container-mount-home bash -c "pwd ; ls"
\ No newline at end of file
diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch
index afa62e5b4..b48b3e825 100644
--- a/tests/ref_data/ucc.sbatch
+++ b/tests/ref_data/ucc.sbatch
@@ -7,8 +7,8 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
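Note: every reference file above now ends without a trailing newline (the "\ No newline at end of file" markers). A hedged sketch of a comparison that tolerates this, in case a local harness normalizes endings (hypothetical helper, not part of this PR):

    from pathlib import Path


    def matches_ref(generated: str, ref_path: Path) -> bool:
        # Compare while ignoring a single trailing-newline difference,
        # since the ref files in this diff end without one.
        return generated.rstrip("\n") == ref_path.read_text().rstrip("\n")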
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F
+srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F
\ No newline at end of file
diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py
index aa7aaddc4..78023129e 100644
--- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py
+++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py
@@ -318,7 +318,7 @@ def test_ranks_mapping_cmd(strategy_fixture: SlurmCommandGenStrategy, testrun_fi
     slurm_args = {"job_name": "test_job", "num_nodes": 2, "node_list_str": "node1,node2"}
 
     expected_command = (
-        f"srun --mpi={strategy_fixture.system.mpi} "
+        f"srun --export=ALL --mpi={strategy_fixture.system.mpi} "
         f"--output={testrun_fixture.output_path.absolute()}/mapping-stdout.txt "
         f"--error={testrun_fixture.output_path.absolute()}/mapping-stderr.txt "
         "bash -c "
diff --git a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
index bb06200c7..73a6c1626 100644
--- a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
+++ b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
@@ -25,8 +25,6 @@
 from cloudai.systems import SlurmSystem
 from cloudai.workloads.nemo_run import (
     Data,
-    Log,
-    LogCkpt,
     NeMoRunCmdArgs,
     NeMoRunSlurmCommandGenStrategy,
     NeMoRunTestDefinition,
@@ -72,7 +70,6 @@ def test_generate_test_command(self, cmd_gen_strategy: NeMoRunSlurmCommandGenStr
             trainer=Trainer(
                 strategy=TrainerStrategy(tensor_model_parallel_size=2, virtual_pipeline_model_parallel_size=None),
             ),
-            log=Log(ckpt=LogCkpt(save_last=False)),
            data=Data(micro_batch_size=1),
         )
         test_run.test.test_definition.cmd_args = cmd_args
@@ -90,7 +87,6 @@ def test_generate_test_command(self, cmd_gen_strategy: NeMoRunSlurmCommandGenStr
         assert (
             f"trainer.strategy.tensor_model_parallel_size={cmd_args.trainer.strategy.tensor_model_parallel_size}" in cmd
         )
-        assert f"log.ckpt.save_last={cmd_args.log.ckpt.save_last}" in cmd
         assert f"data.micro_batch_size={cmd_args.data.micro_batch_size}" in cmd
 
     def test_num_nodes(self, cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy, test_run: TestRun) -> None:
diff --git a/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py
index ef24bf20d..6be7aae74 100644
--- a/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py
+++ b/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py
@@ -45,11 +45,12 @@ def test_run(slurm_system: SlurmSystem) -> TestRun:
 def test_default(slurm_system: SlurmSystem, test_run: TestRun) -> None:
     cgs = SlurmContainerCommandGenStrategy(slurm_system, {})
     cmd = cgs.gen_srun_command(test_run)
-
     srun_part = (
-        f"srun --mpi={slurm_system.mpi} --container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
-        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results,{slurm_system.install_path.absolute()}:"
-        "/cloudai_install --no-container-mount-home"
+        f"srun --export=ALL --mpi={slurm_system.mpi} "
+        f"--container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
+        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results,"
+        f"{slurm_system.install_path.absolute()}:/cloudai_install "
+        f"--no-container-mount-home"
     )
 
     assert cmd == f'{srun_part} bash -c "cmd"'
@@ -62,9 +63,11 @@ def test_with_nsys(slurm_system: SlurmSystem, test_run: TestRun) -> None:
     cmd = cgs.gen_srun_command(test_run)
 
     srun_part = (
-        f"srun --mpi={slurm_system.mpi} --container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
-        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results,{slurm_system.install_path.absolute()}:"
-        "/cloudai_install --no-container-mount-home"
+        f"srun --export=ALL --mpi={slurm_system.mpi} "
+        f"--container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
+        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results,"
+        f"{slurm_system.install_path.absolute()}:/cloudai_install "
+        f"--no-container-mount-home"
     )
 
     assert cmd == f'{srun_part} bash -c "{" ".join(nsys.cmd_args)} cmd"'