Skip to content
Merged
74 changes: 74 additions & 0 deletions conf/common/test/nemo_run_llama3_8b_lora.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CloudAI test definition: LoRA fine-tune of Llama3-8B via the NeMoRun template.
name = "nemo_run_llama3_8b_lora"
description = "nemo_run_llama3_8b_lora"
test_template_name = "NeMoRun"

# Extra host:container bind mounts added on top of the template defaults.
# NOTE(review): the container-side path repeats "nemo_models" — confirm the
# intended mount target before using this as a template.
extra_container_mounts = [
"/path/to/nemo_models:/path/to/nemo_models/nemo_models",
]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
# Exported to the container as CLOUDAI_NEMO_TASK; selects the llm.finetune
# entry point in cloudai_nemorun.py.
task = "finetune"
recipe_name = "llama3_8b"

[cmd_args.data]
micro_batch_size = 1
global_batch_size = 32
seq_length = 4096
force_redownload = true
# Name of the run.cli factory that supplies the PackedSequenceSpecs.
packed_sequence_specs = "packed_sequence_data_lora"

[cmd_args.trainer]
max_steps = 100
val_check_interval = 100
num_nodes = 1
# Name of the run.cli factory that supplies the trainer callback list.
callbacks = "combined_callbacks_lora"
log_every_n_steps = 1

[cmd_args.trainer.strategy]
# No model splitting: TP, PP and CP all set to 1.
tensor_model_parallel_size = 1
pipeline_model_parallel_size = 1
context_parallel_size = 1

[cmd_args.trainer.plugins]
grad_reduce_in_fp32 = true


[cmd_args.optim]
config.lr = 1e-4
config.use_distributed_optimizer = false

[cmd_args.peft]
# Escaped inner quotes keep the list literal intact when passed through the
# generated CLI — presumably required by NeMoRun arg parsing; verify.
target_modules = "\"['linear_qkv']\""

[cmd_args.model.config]
seq_length = 4096

[extra_env_vars]
NCCL_P2P_NET_CHUNKSIZE = "2097152"
NCCL_NVLS_ENABLE = "0"
NVTE_DP_AMAX_REDUCE_INTERVAL = "0"
NVTE_ASYNC_AMAX_REDUCTION = "1"
NVTE_FUSED_ATTN = "1"
NVTE_FLASH_ATTN = "1"
NEMO_LOG_MEMORY_USAGE = "1"
CUDA_DEVICE_MAX_CONNECTIONS = "1"
NVTE_FWD_LAYERNORM_SM_MARGIN = "16"
NVTE_BWD_LAYERNORM_SM_MARGIN = "16"
# Placeholder: point at the local NeMo cache/home on the target cluster.
NEMO_HOME = "/path/to/nemo/home"
23 changes: 23 additions & 0 deletions conf/common/test_scenario/nemo_run_llama3_8b_lora.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CloudAI test scenario: a single one-node run of the llama3_8b LoRA test.
name = "nemo_run_llama3_8b_lora"

[[Tests]]
id = "nemo_run_llama3_8b_lora"
# Must match `name` in the corresponding conf/common/test/*.toml definition.
test_name = "nemo_run_llama3_8b_lora"
# NOTE(review): written as a string here — confirm the scenario schema
# accepts/coerces string num_nodes.
num_nodes = "1"
time_limit = "00:30:00"
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def _gen_srun_command(
return " ".join(srun_command_parts + nsys_command_parts + test_command_parts)

def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]:
srun_command_parts = ["srun", f"--mpi={self.system.mpi}"]
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
if slurm_args.get("image_path"):
srun_command_parts.append(f"--container-image={slurm_args['image_path']}")
mounts = self.container_mounts(tr)
Expand Down
66 changes: 54 additions & 12 deletions src/cloudai/workloads/nemo_run/cloudai_nemorun.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,19 @@
import lightning.pytorch as pl
import nemo_run as run
import torch
from lightning.pytorch.loggers import WandbLogger
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface import AutoTokenizer
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron4Config15B,
NemotronModel,
)
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.nsys import NsysCallback
Expand All @@ -45,6 +50,12 @@ def hf_tokenizer() -> run.Config[AutoTokenizer]:
)


@run.cli.factory(target=MockDataModule, target_arg="tokenizer")
@run.autoconvert
def null_tokenizer() -> run.Config[AutoTokenizer]:
    """Factory for a NeMo "null" tokenizer config with a fixed 256k vocab.

    Registered as the ``tokenizer`` argument factory for ``MockDataModule``.
    """
    tokenizer_cfg = run.Config(
        get_nmt_tokenizer,
        library="null",
        model_name="NullTokenizer",
        vocab_size=256000,
    )
    return tokenizer_cfg


@run.cli.factory
@run.autoconvert
def timing_callback() -> run.Config[TimingCallback]:
Expand Down Expand Up @@ -74,27 +85,45 @@ def nsys_callbacks() -> list[pl.Callback]:

@run.cli.factory
@run.autoconvert
def comms_overlap_callbacks_lora() -> list[pl.Callback]:
    """Callback set for LoRA runs: step timing plus TP comm-overlap disabled."""
    overlap_cfg = run.Config(MegatronCommOverlapCallback, tp_comm_overlap=False)
    return [timing_callback(), overlap_cfg]


@run.cli.factory
@run.autoconvert
def comms_overlap_callbacks_pretrain() -> list[pl.Callback]:
    """Callback set for pretrain runs: timing plus comm-overlap with
    param-gather/optimizer-step overlap disabled."""
    overlap_cfg = run.Config(
        MegatronCommOverlapCallback,
        overlap_param_gather_with_optimizer_step=False,
    )
    return [timing_callback(), overlap_cfg]


@run.cli.factory
@run.autoconvert
def combined_callbacks_lora() -> list[pl.Callback]:
    """Combined LoRA callbacks: timing, comm-overlap tuning, and periodic GC."""
    gc_train_interval = 5
    gc_val_interval = 10
    comm_overlap = run.Config(
        MegatronCommOverlapCallback,
        overlap_param_gather_with_optimizer_step=False,
        tp_comm_overlap=False,
    )
    gc_callback = run.Config(
        GarbageCollectionCallback,
        gc_interval_train=gc_train_interval,
        gc_interval_val=gc_val_interval,
    )
    return [timing_callback(), comm_overlap, gc_callback]


@run.cli.factory
@run.autoconvert
def combined_callbacks_pretrain() -> list[pl.Callback]:
start_step = 5
end_step = 10
return [
timing_callback(),
run.Config(MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=False),
run.Config(
NsysCallback,
start_step=start_step,
Expand All @@ -104,17 +133,24 @@ def combined_callbacks() -> list[pl.Callback]:
]


@run.cli.factory(target=SquadDataModule, target_arg="packed_sequence_specs")
@run.autoconvert
def packed_sequence_data_lora() -> run.Config[PackedSequenceSpecs]:
    """Packed-sequence specs (size 4096, cu_seqlens padding off) for SQuAD LoRA."""
    return run.Config(
        PackedSequenceSpecs,
        pad_cu_seqlens=False,
        packed_sequence_size=4096,
    )


@run.cli.factory(target=llm.pretrain)
@run.autoconvert
def cloudai_recipe() -> run.Partial:
recipe = run.Partial(
llm.pretrain,
model=run.Config(LlamaModel, config=run.Config(Llama3Config8B)),
model=run.Config(NemotronModel, config=run.Config(Nemotron4Config15B)),
data=run.Config(
MockDataModule,
seq_length=2048,
micro_batch_size=4,
global_batch_size=8,
tokenizer=null_tokenizer(),
),
trainer=run.Config(
nl.Trainer,
Expand Down Expand Up @@ -145,7 +181,6 @@ def cloudai_recipe() -> run.Partial:
val_check_interval=1000,
max_epochs=10,
),
log=nl.NeMoLogger(wandb=(WandbLogger() if "WANDB_API_KEY" in os.environ else None)),
optim=run.Config(
nl.MegatronOptimizerModule,
config=run.Config(
Expand All @@ -169,4 +204,11 @@ def cloudai_recipe() -> run.Partial:


if __name__ == "__main__":
    # Select the NeMo entry point from the environment. The CloudAI Slurm
    # command-gen strategy exports CLOUDAI_NEMO_TASK from the test's
    # cmd_args.task setting.
    mode = os.getenv("CLOUDAI_NEMO_TASK")
    print(f"Running in mode {mode}")
    if mode == "pretrain":
        run.cli.main(fn=llm.pretrain)
    elif mode == "finetune":
        run.cli.main(fn=llm.finetune)
    else:
        # Name the env var so a missing/typoed setting is actionable instead
        # of an opaque "Unknown mode None".
        raise ValueError(
            f"Unknown mode {mode!r}: set CLOUDAI_NEMO_TASK to 'pretrain' or 'finetune'"
        )
15 changes: 12 additions & 3 deletions src/cloudai/workloads/nemo_run/nemo_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,23 @@ class LogCkpt(BaseModel):

model_config = ConfigDict(extra="allow")

save_on_train_epoch_end: bool = Field(default=False)
save_last: bool = Field(default=False)
save_on_train_epoch_end: Optional[bool] = Field(default=None)
save_last: Optional[bool] = Field(default=None)


class LogTensorboard(BaseModel):
    """Logging tensorboard configuration for NeMoRun."""

    # Accept unknown keys so extra TOML fields pass through to NeMoRun untouched.
    model_config = ConfigDict(extra="allow")
    # Directory the TensorBoard logger writes event files into.
    save_dir: Union[str, Path] = Field(default="logs")
    # Run name for the logger; optional so it can be nulled out from config.
    name: Optional[str] = Field(default="default")


class Log(BaseModel):
    """Base logging configuration for NeMoRun."""

    # Sub-sections are optional: when absent (None) no override is emitted,
    # rather than forcing a default config on the run.
    ckpt: Optional[LogCkpt] = Field(default=None)
    tensorboard: Optional[LogTensorboard] = Field(default=None)

    # Allow extra keys so unknown log settings pass through untouched.
    model_config = ConfigDict(extra="allow")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def _parse_slurm_args(
cmd_args: Dict[str, Union[str, List[str]]],
tr: TestRun,
) -> Dict[str, Any]:
cloudai_nemo_task = cmd_args.get("task", "")
env_vars["CLOUDAI_NEMO_TASK"] = f"{cloudai_nemo_task}"

base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)

tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
Expand Down
6 changes: 3 additions & 3 deletions tests/ref_data/gpt-no-hook.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

echo "Loading container with srun command"
srun --mpi=none --container-image=https://docker/url --container-name=cont true
Expand All @@ -22,4 +22,4 @@ echo "Loading container with srun command"
-e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
/opt/paxml/workspace/run.sh
8 changes: 4 additions & 4 deletions tests/ref_data/gpt-pre-test.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output/pre_test/nccl:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0
SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0)
PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )
if [ $PRE_TEST_SUCCESS -eq 1 ]; then
Expand All @@ -27,4 +27,4 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
fi
fi
6 changes: 3 additions & 3 deletions tests/ref_data/grok-no-hook.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
export COMBINE_THRESHOLD=1
export PER_GPU_COMBINE_THRESHOLD=0
export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false"
srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

echo "Loading container with srun command"
srun --mpi=none --container-image=https://docker/url --container-name=cont true
Expand All @@ -22,4 +22,4 @@ echo "Loading container with srun command"
-e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \
--container-name=cont \
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/output:/opt/paxml/workspace/,__OUTPUT_DIR__/install:/cloudai_install \
/opt/paxml/workspace/run.sh
/opt/paxml/workspace/run.sh
Loading