Skip to content
Merged
74 changes: 74 additions & 0 deletions conf/common/test/nemo_run_llama3_8b_lora.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CloudAI test definition: LoRA fine-tune of Llama3-8B via the NeMoRun template.
name = "nemo_run_llama3_8b_lora"
description = "nemo_run_llama3_8b_lora"
test_template_name = "NeMoRun"

# Extra host:container bind mounts added on top of the template defaults.
# NOTE(review): the container-side path repeats "nemo_models" — confirm the
# intended mount target before using this as a template.
extra_container_mounts = [
"/path/to/nemo_models:/path/to/nemo_models/nemo_models",
]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
# Exported to the container as CLOUDAI_NEMO_TASK; selects the llm.finetune
# entry point in cloudai_nemorun.py.
task = "finetune"
recipe_name = "llama3_8b"

[cmd_args.data]
micro_batch_size = 1
global_batch_size = 32
seq_length = 4096
force_redownload = true
# Name of the run.cli factory that supplies the PackedSequenceSpecs.
packed_sequence_specs = "packed_sequence_data_lora"

[cmd_args.trainer]
max_steps = 100
val_check_interval = 100
num_nodes = 1
# Name of the run.cli factory that supplies the trainer callback list.
callbacks = "combined_callbacks_lora"
log_every_n_steps = 1

[cmd_args.trainer.strategy]
# No model splitting: TP, PP and CP all set to 1.
tensor_model_parallel_size = 1
pipeline_model_parallel_size = 1
context_parallel_size = 1

[cmd_args.trainer.plugins]
grad_reduce_in_fp32 = true


[cmd_args.optim]
config.lr = 1e-4
config.use_distributed_optimizer = false

[cmd_args.peft]
# Escaped inner quotes keep the list literal intact when passed through the
# generated CLI — presumably required by NeMoRun arg parsing; verify.
target_modules = "\"['linear_qkv']\""

[cmd_args.model.config]
seq_length = 4096

[extra_env_vars]
NCCL_P2P_NET_CHUNKSIZE = "2097152"
NCCL_NVLS_ENABLE = "0"
NVTE_DP_AMAX_REDUCE_INTERVAL = "0"
NVTE_ASYNC_AMAX_REDUCTION = "1"
NVTE_FUSED_ATTN = "1"
NVTE_FLASH_ATTN = "1"
NEMO_LOG_MEMORY_USAGE = "1"
CUDA_DEVICE_MAX_CONNECTIONS = "1"
NVTE_FWD_LAYERNORM_SM_MARGIN = "16"
NVTE_BWD_LAYERNORM_SM_MARGIN = "16"
# Placeholder: point at the local NeMo cache/home on the target cluster.
NEMO_HOME = "/path/to/nemo/home"
23 changes: 23 additions & 0 deletions conf/common/test_scenario/nemo_run_llama3_8b_lora.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CloudAI test scenario: a single one-node run of the llama3_8b LoRA test.
name = "nemo_run_llama3_8b_lora"

[[Tests]]
id = "nemo_run_llama3_8b_lora"
# Must match `name` in the corresponding conf/common/test/*.toml definition.
test_name = "nemo_run_llama3_8b_lora"
# NOTE(review): written as a string here — confirm the scenario schema
# accepts/coerces string num_nodes.
num_nodes = "1"
time_limit = "00:30:00"
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def _gen_srun_command(
return " ".join(srun_command_parts + nsys_command_parts + test_command_parts)

def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]:
srun_command_parts = ["srun", f"--mpi={self.system.mpi}"]
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
if slurm_args.get("image_path"):
srun_command_parts.append(f"--container-image={slurm_args['image_path']}")
mounts = self.container_mounts(tr)
Expand Down
66 changes: 54 additions & 12 deletions src/cloudai/workloads/nemo_run/cloudai_nemorun.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,19 @@
import lightning.pytorch as pl
import nemo_run as run
import torch
from lightning.pytorch.loggers import WandbLogger
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface import AutoTokenizer
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron4Config15B,
NemotronModel,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.nsys import NsysCallback
Expand All @@ -45,6 +50,12 @@ def hf_tokenizer() -> run.Config[AutoTokenizer]:
)


@run.cli.factory(target=MockDataModule, target_arg="tokenizer")
@run.autoconvert
def null_tokenizer() -> run.Config[AutoTokenizer]:
    """Factory for a NeMo "null" tokenizer config with a fixed 256k vocab.

    Registered as the ``tokenizer`` argument factory for ``MockDataModule``.
    """
    tokenizer_cfg = run.Config(
        get_nmt_tokenizer,
        library="null",
        model_name="NullTokenizer",
        vocab_size=256000,
    )
    return tokenizer_cfg


@run.cli.factory
@run.autoconvert
def timing_callback() -> run.Config[TimingCallback]:
Expand Down Expand Up @@ -74,27 +85,45 @@ def nsys_callbacks() -> list[pl.Callback]:

@run.cli.factory
@run.autoconvert
def comms_overlap_callbacks_lora() -> list[pl.Callback]:
    """Callback set for LoRA runs: step timing plus TP comm-overlap disabled."""
    overlap_cfg = run.Config(MegatronCommOverlapCallback, tp_comm_overlap=False)
    return [timing_callback(), overlap_cfg]


@run.cli.factory
@run.autoconvert
def comms_overlap_callbacks_pretrain() -> list[pl.Callback]:
    """Callback set for pretrain runs: timing plus comm-overlap with
    param-gather/optimizer-step overlap disabled."""
    overlap_cfg = run.Config(
        MegatronCommOverlapCallback,
        overlap_param_gather_with_optimizer_step=False,
    )
    return [timing_callback(), overlap_cfg]


@run.cli.factory
@run.autoconvert
def combined_callbacks_lora() -> list[pl.Callback]:
    """Combined LoRA callbacks: timing, comm-overlap tuning, and periodic GC."""
    gc_train_interval = 5
    gc_val_interval = 10
    comm_overlap = run.Config(
        MegatronCommOverlapCallback,
        overlap_param_gather_with_optimizer_step=False,
        tp_comm_overlap=False,
    )
    gc_callback = run.Config(
        GarbageCollectionCallback,
        gc_interval_train=gc_train_interval,
        gc_interval_val=gc_val_interval,
    )
    return [timing_callback(), comm_overlap, gc_callback]


@run.cli.factory
@run.autoconvert
def combined_callbacks_pretrain() -> list[pl.Callback]:
start_step = 5
end_step = 10
return [
timing_callback(),
run.Config(MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=False),
run.Config(
NsysCallback,
start_step=start_step,
Expand All @@ -104,17 +133,24 @@ def combined_callbacks() -> list[pl.Callback]:
]


@run.cli.factory(target=SquadDataModule, target_arg="packed_sequence_specs")
@run.autoconvert
def packed_sequence_data_lora() -> run.Config[PackedSequenceSpecs]:
    """Packed-sequence specs (size 4096, cu_seqlens padding off) for SQuAD LoRA."""
    return run.Config(
        PackedSequenceSpecs,
        pad_cu_seqlens=False,
        packed_sequence_size=4096,
    )


@run.cli.factory(target=llm.pretrain)
@run.autoconvert
def cloudai_recipe() -> run.Partial:
recipe = run.Partial(
llm.pretrain,
model=run.Config(LlamaModel, config=run.Config(Llama3Config8B)),
model=run.Config(NemotronModel, config=run.Config(Nemotron4Config15B)),
data=run.Config(
MockDataModule,
seq_length=2048,
micro_batch_size=4,
global_batch_size=8,
tokenizer=null_tokenizer(),
),
trainer=run.Config(
nl.Trainer,
Expand Down Expand Up @@ -145,7 +181,6 @@ def cloudai_recipe() -> run.Partial:
val_check_interval=1000,
max_epochs=10,
),
log=nl.NeMoLogger(wandb=(WandbLogger() if "WANDB_API_KEY" in os.environ else None)),
optim=run.Config(
nl.MegatronOptimizerModule,
config=run.Config(
Expand All @@ -169,4 +204,11 @@ def cloudai_recipe() -> run.Partial:


if __name__ == "__main__":
    # Select the NeMo entry point from the environment. The CloudAI Slurm
    # command-gen strategy exports CLOUDAI_NEMO_TASK from the test's
    # cmd_args.task setting.
    mode = os.getenv("CLOUDAI_NEMO_TASK")
    print(f"Running in mode {mode}")
    if mode == "pretrain":
        run.cli.main(fn=llm.pretrain)
    elif mode == "finetune":
        run.cli.main(fn=llm.finetune)
    else:
        # Name the env var so a missing/typoed setting is actionable instead
        # of an opaque "Unknown mode None".
        raise ValueError(
            f"Unknown mode {mode!r}: set CLOUDAI_NEMO_TASK to 'pretrain' or 'finetune'"
        )
15 changes: 12 additions & 3 deletions src/cloudai/workloads/nemo_run/nemo_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,23 @@ class LogCkpt(BaseModel):

model_config = ConfigDict(extra="allow")

save_on_train_epoch_end: bool = Field(default=False)
save_last: bool = Field(default=False)
save_on_train_epoch_end: Optional[bool] = Field(default=None)
save_last: Optional[bool] = Field(default=None)


class LogTensorboard(BaseModel):
    """Logging tensorboard configuration for NeMoRun."""

    # Accept unknown keys so extra TOML fields pass through to NeMoRun untouched.
    model_config = ConfigDict(extra="allow")
    # Directory the TensorBoard logger writes event files into.
    save_dir: Union[str, Path] = Field(default="logs")
    # Run name for the logger; optional so it can be nulled out from config.
    name: Optional[str] = Field(default="default")


class Log(BaseModel):
    """Base logging configuration for NeMoRun."""

    # Sub-sections are optional: when absent (None) no override is emitted,
    # rather than forcing a default config on the run.
    ckpt: Optional[LogCkpt] = Field(default=None)
    tensorboard: Optional[LogTensorboard] = Field(default=None)

    # Allow extra keys so unknown log settings pass through untouched.
    model_config = ConfigDict(extra="allow")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def _parse_slurm_args(
cmd_args: Dict[str, Union[str, List[str]]],
tr: TestRun,
) -> Dict[str, Any]:
cloudai_nemo_task = cmd_args.get("task", "")
env_vars["CLOUDAI_NEMO_TASK"] = f"{cloudai_nemo_task}"

base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)

tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
Expand Down
6 changes: 3 additions & 3 deletions tests/ref_data/gpt-no-hook.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

echo "Loading container with srun command"
srun --mpi=none --container-image=https://docker/url --container-name=cont true
Expand All @@ -22,4 +22,4 @@ echo "Loading container with srun command"
-e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
/opt/paxml/workspace/run.sh
8 changes: 4 additions & 4 deletions tests/ref_data/gpt-pre-test.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0)
PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )
if [ $PRE_TEST_SUCCESS -eq 1 ]; then
Expand All @@ -27,4 +27,4 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
fi
fi
6 changes: 3 additions & 3 deletions tests/ref_data/grok-no-hook.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

echo "Loading container with srun command"
srun --mpi=none --container-image=https://docker/url --container-name=cont true
Expand All @@ -22,4 +22,4 @@ echo "Loading container with srun command"
-e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
/opt/paxml/workspace/run.sh
Loading