Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions examples/llm_finetune/packed_parquet_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example: Fine-tuning with pre-packed Parquet SFT data.
#
# Pre-packed Parquet files use the RFC packed SFT format:
#   - input_ids: list<int32> (variable-length token IDs, already packed)
#   - loss_mask: list<uint8> (1 = compute loss, 0 = ignore)
#   - seq_start_id: list<int32> (sequence boundary positions within each pack)
#
# Files should be named *.idx.parquet and can be produced by the Nemotron
# data prep pipeline or any tool that writes this schema.
#
# To run:
#   torchrun --nproc-per-node=8 examples/llm_finetune/finetune.py \
#     --config examples/llm_finetune/packed_parquet_example.yaml
#
# Override data_path on the command line:
#   --dataset.data_path /data/packed_sft/shard_*.idx.parquet

---
step_scheduler:
  global_batch_size: 32
  local_batch_size: 4
  ckpt_every_steps: 500
  num_epochs: 2

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B

checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/

distributed:
  strategy: fsdp2
  tp_size: 1
  cp_size: 1
  pp_size: 1

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Pre-packed Parquet dataset — data is already packed, no runtime packing needed.
# packed_sequence_size must match the pack size used during data preparation.
dataset:
  _target_: nemo_automodel.components.datasets.llm.packed_parquet_dataset.PackedParquetDataset
  data_path: /path/to/packed_sft/  # single file, glob, or directory
  packed_sequence_size: 4096
  padding_idx: 0
  split: train

# packed_sequence_size > 0 tells the model to use THD attention.
# The is_pre_packed attribute on the dataset prevents re-packing.
packed_sequence:
  packed_sequence_size: 4096

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.packed_sequence_thd_collater
  shuffle: true
  num_workers: 4

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  # NOTE: must be written with a dot in the mantissa ("1.0e-5", not "1e-5").
  # YAML 1.1 resolvers such as PyYAML only recognize floats containing a ".",
  # so a bare "1e-5" would load as the string "1e-5" and break Adam's ctor.
  eps: 1.0e-5
  lr: 1.0e-4
  weight_decay: 0
2 changes: 2 additions & 0 deletions nemo_automodel/components/datasets/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
is_delta_lake_path,
)
from .nanogpt_dataset import NanogptDataset # noqa: F401
from .packed_parquet_dataset import PackedParquetDataset # noqa: F401
from .retrieval_collator import RetrievalBiencoderCollator # noqa: F401
from .retrieval_dataset import make_retrieval_dataset # noqa: F401
from .squad import make_squad_dataset # noqa: F401
Expand All @@ -36,4 +37,5 @@
"ChatDataset",
"DeltaLakeDataset",
"is_delta_lake_path",
"PackedParquetDataset",
]
Loading
Loading