Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions examples/llm_finetune/packed_parquet_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example: Fine-tuning with pre-packed Parquet SFT data.
#
# Pre-packed Parquet files use the RFC packed SFT format:
#   - input_ids: list<int32> (variable-length token IDs, already packed)
#   - loss_mask: list<uint8> (1 = compute loss, 0 = ignore)
#   - seq_start_id: list<int32> (sequence boundary positions within each pack)
#
# Files should be named *.idx.parquet and can be produced by the Nemotron
# data prep pipeline or any tool that writes this schema.
#
# To run:
#   torchrun --nproc-per-node=8 examples/llm_finetune/finetune.py \
#     --config examples/llm_finetune/packed_parquet_example.yaml
#
# Override data_path on the command line:
#   --dataset.data_path /data/packed_sft/shard_*.idx.parquet

---
step_scheduler:
  global_batch_size: 32
  local_batch_size: 4
  ckpt_every_steps: 500
  num_epochs: 2

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B

checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/

distributed:
  strategy: fsdp2
  tp_size: 1
  cp_size: 1
  pp_size: 1

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Pre-packed Parquet dataset — data is already packed, no runtime packing needed.
# packed_sequence_size must match the pack size used during data preparation.
dataset:
  _target_: nemo_automodel.components.datasets.llm.packed_parquet_dataset.PackedParquetDataset
  data_path: /path/to/packed_sft/  # single file, glob, or directory
  packed_sequence_size: 4096
  padding_idx: 0
  split: train

# packed_sequence_size > 0 tells the model to use THD attention.
# The is_pre_packed attribute on the dataset prevents re-packing.
packed_sequence:
  packed_sequence_size: 4096

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.packed_sequence_thd_collater
  shuffle: true
  num_workers: 4

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  # NOTE: must be written with a dot in the mantissa ("1.0e-5", not "1e-5").
  # YAML 1.1 resolvers such as PyYAML only recognize floats containing a ".",
  # so a bare "1e-5" would load as the string "1e-5" and break Adam's ctor.
  eps: 1.0e-5
  lr: 1.0e-4
  weight_decay: 0
2 changes: 2 additions & 0 deletions nemo_automodel/components/datasets/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
is_delta_lake_path,
)
from .nanogpt_dataset import NanogptDataset # noqa: F401
from .packed_parquet_dataset import PackedParquetDataset # noqa: F401
from .retrieval_collator import RetrievalBiencoderCollator # noqa: F401
from .retrieval_dataset import make_retrieval_dataset # noqa: F401
from .squad import make_squad_dataset # noqa: F401
Expand All @@ -36,4 +37,5 @@
"ChatDataset",
"DeltaLakeDataset",
"is_delta_lake_path",
"PackedParquetDataset",
]
Loading
Loading