42 changes: 33 additions & 9 deletions examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py
@@ -20,7 +20,7 @@
 
 from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
 from megatron.bridge.recipes.utils.nemo_run_utils import get_partial_fn
-from megatron.bridge.training.config import ConfigContainer, ProfilingConfig
+from megatron.bridge.training.config import ConfigContainer, ProfilingConfig, TokenizerConfig
 from megatron.bridge.training.gpt_step import forward_step
 from megatron.bridge.training.pretrain import pretrain
 
@@ -37,25 +37,49 @@ def main(args: argparse.Namespace) -> None:
 
     # Get the base ConfigContainer from the recipe
     cfg: ConfigContainer = pretrain_config()
 
+    cfg.dataset.sequence_length = 2048
+    cfg.model.seq_length = 2048
+    cfg.dataset.num_workers = 2
+    cfg.model.context_parallel_size = 1
+    cfg.model.num_layers = 4
+    cfg.model.num_attention_heads = 8
+    cfg.model.num_query_groups = 8
+    cfg.model.hidden_size = 768
+    cfg.model.ffn_hidden_size = 2048
+    cfg.tokenizer = TokenizerConfig(tokenizer_path="/home/data/llama/tokenizer.model")
-    # Example of applying programmatic overrides
-    cfg.train.train_iters = 20
-    cfg.train.global_batch_size = 8
-    cfg.train.micro_batch_size = 1
-    cfg.train.eval_iters = 0
+    cfg.train.train_iters = 100
+    cfg.train.eval_iters = 4
+    cfg.logger.log_interval = 1
+
+    cfg.scheduler.lr_warmup_iters = 5
+    # Example of applying programmatic overrides
+    #cfg.train.train_iters = 20
+    #cfg.train.global_batch_size = 8
+    #cfg.train.micro_batch_size = 1
+    #cfg.train.eval_iters = 0
 
-    cfg.logger.log_interval = 1
-    cfg.scheduler.lr_warmup_iters = 5
 
-    cfg.dataset.sequence_length = 4096
-    cfg.checkpoint.save = None
+    #cfg.logger.log_interval = 1
 
+    #cfg.dataset.sequence_length = 4096
+    #cfg.checkpoint.save = None
+    paths = ["/home/data/llama/my-llama_00_text_document"]
+    cfg.dataset.split = "900,95,5"
+    from megatron.core.datasets.utils import get_blend_from_list
+    paths, weights = get_blend_from_list(paths)
+    cfg.dataset.blend = [paths, weights]
+    #print(cfg.dataset)
     if cfg.profiling is None:
         cfg.profiling = ProfilingConfig()
     cfg.profiling.use_nsys_profiler = False
     cfg.profiling.use_pytorch_profiler = True
     cfg.profiling.record_shapes = True
 
+    import torch
+    cfg.model.embedding_init_method = run.Partial(torch.nn.init.normal_, mean=0.0, std=0.01)
+
     # Create a run.Partial object for the pretrain function
     fn = get_partial_fn(pretrain, cfg, forward_step)
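Note on the blend override above: get_blend_from_list (imported from megatron.core.datasets.utils, as in the diff) normalizes a flat list of dataset prefixes, optionally interleaved with weights, into the (paths, weights) pair assigned to cfg.dataset.blend. A minimal sketch of the assumed semantics, with hypothetical paths:

from megatron.core.datasets.utils import get_blend_from_list

# Plain prefixes: no weights are supplied, so weights comes back as None
# and blending falls back to the datasets' relative sizes.
paths, weights = get_blend_from_list(["/data/a_text_document", "/data/b_text_document"])
assert weights is None

# Weight-interleaved form: alternating weight/prefix entries are parsed
# into (["/data/a_text_document", "/data/b_text_document"], [30.0, 70.0]).
paths, weights = get_blend_from_list(["30", "/data/a_text_document", "70", "/data/b_text_document"])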
3 changes: 1 addition & 2 deletions src/megatron/bridge/data/builders/finetuning_dataset.py
@@ -21,7 +21,6 @@
 
 from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
 from megatron.bridge.data.datasets.sft import create_sft_dataset
-from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
 from megatron.bridge.utils.common_utils import get_rank_safe, print_rank_0
 
 
@@ -304,7 +303,7 @@ def _extract_tokenizer_model_name(self) -> str:
         """Automatically get the model name from model path."""
         if self.packed_sequence_specs and self.packed_sequence_specs.tokenizer_model_name is not None:
             return self.packed_sequence_specs.tokenizer_model_name
-        elif isinstance(self.tokenizer, _HuggingFaceTokenizer):
+        elif self.tokenizer.library == "huggingface":
             name = self.tokenizer._tokenizer.name_or_path
             if name.endswith("context/nemo_tokenizer"):
                 # NEMO_HOME/hf_org/hf_model/context/nemo_tokenizer => hf_org--hf_model
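Note on the builder change above: checking tokenizer.library == "huggingface" keys off the backend string carried by megatron.core tokenizers instead of importing the bridge-private _HuggingFaceTokenizer class. A minimal sketch of the pattern, assuming library is a plain string attribute on the tokenizer wrapper:

def is_hf_tokenizer(tokenizer) -> bool:
    # Duck-typed check: no concrete tokenizer class is imported, so callers
    # depend only on the megatron.core tokenizer interface.
    return getattr(tokenizer, "library", None) == "huggingface"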
6 changes: 3 additions & 3 deletions src/megatron/bridge/data/builders/hf_dataset.py
@@ -22,13 +22,13 @@
 from typing import Any, Callable, Optional, Protocol, TypedDict, Union, cast
 
 from datasets import Dataset, DatasetDict, load_dataset
+from megatron.core.tokenizers import MegatronTokenizerBase
 from tqdm import tqdm
 
 from megatron.bridge.data.builders.finetuning_dataset import FinetuningDatasetBuilder
 from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
 from megatron.bridge.data.datasets.sft import get_dataset_root
 from megatron.bridge.training.config import FinetuningDatasetConfig
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
 from megatron.bridge.utils.common_utils import print_rank_0
 
 
@@ -47,7 +47,7 @@ class ProcessExampleFn(Protocol):
     """Protocol defining the signature for a function that processes a single dataset example."""
 
     def __call__(
-        self, example: dict[str, Any], tokenizer: Optional[MegatronTokenizer] = None
+        self, example: dict[str, Any], tokenizer: Optional[MegatronTokenizerBase] = None
     ) -> ProcessExampleOutput: ...
 
 
@@ -95,7 +95,7 @@ def preprocess_and_split_data(
     dset: DatasetDict,
     dataset_name: str,
     dataset_root: Path,
-    tokenizer: MegatronTokenizer,
+    tokenizer: MegatronTokenizerBase,
     process_example_fn: ProcessExampleFn,
     split_val_from_train: bool = True,
     val_proportion: Optional[float] = None,
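Note: with the Protocol change above, a custom ProcessExampleFn receives a megatron.core tokenizer (or None). A minimal conforming callable; the returned field names mirror the SQuAD processor and are assumptions, not the actual ProcessExampleOutput keys:

from typing import Any, Optional

def process_my_example(example: dict[str, Any], tokenizer: Optional[Any] = None) -> dict[str, Any]:
    # Shape one raw record into prompt/answer text; the "input"/"output"
    # keys here are assumed, not taken from the TypedDict definition.
    return {
        "input": f"Context: {example['context']} Question: {example['question']} Answer:",
        "output": example["answers"]["text"][0],
    }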
8 changes: 4 additions & 4 deletions src/megatron/bridge/data/datasets/packed_sequence.py
@@ -18,23 +18,23 @@
 from typing import Optional
 
 import numpy as np
+from megatron.core.tokenizers import MegatronTokenizerBase
 
 from megatron.bridge.data.datasets.packing_utils import create_hist, create_packing_strategy, fill_packing_strategy
 from megatron.bridge.data.datasets.sft import create_sft_dataset
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
 
 
 logger = logging.getLogger(__name__)
 
 
-def tokenize_dataset(path: Path, tokenizer: MegatronTokenizer, max_seq_length: int, seed: int):
+def tokenize_dataset(path: Path, tokenizer: MegatronTokenizerBase, max_seq_length: int, seed: int):
     """
     Tokenizes a dataset from the provided path using the specified tokenizer
     and prepares it for further processing.
 
     Args:
         path (Path): Path to the dataset file.
-        tokenizer (TokenizerSpec): The tokenizer to use for tokenization.
+        tokenizer (MegatronTokenizerBase): The tokenizer to use for tokenization.
         max_seq_length (int): Maximum sequence length for the tokens.
         seed (int): Random seed for shuffling the dataset (optional).
 
@@ -56,7 +56,7 @@ def prepare_packed_sequence_data(
     output_path: Path,
     output_metadata_path: Path,
     packed_sequence_size: int,
-    tokenizer: MegatronTokenizer,
+    tokenizer: MegatronTokenizerBase,
     max_seq_length: int,
     seed: Optional[int] = 0,
     packing_algorithm: str = "first_fit_shuffle",
12 changes: 6 additions & 6 deletions src/megatron/bridge/data/datasets/sft.py
@@ -24,6 +24,7 @@
 import numpy as np
 import torch
 from datasets import load_dataset
+from megatron.core.tokenizers import MegatronTokenizerBase
 from torch.utils.data import Dataset
 
 from megatron.bridge.data.datasets.utils import (
@@ -32,7 +33,6 @@
     _OnlineSampleMapping,
     _preprocess,
 )
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
 
 
 DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo"
@@ -76,7 +76,7 @@ def get_dataset_root(name: str) -> Path:
 
 def create_sft_dataset(
     path: Path,
-    tokenizer: "MegatronTokenizer",
+    tokenizer: "MegatronTokenizerBase",
     seq_length: int = 2048,
     add_bos: bool = False,
     add_eos: bool = True,
@@ -106,7 +106,7 @@
 
     Args:
         path (Path): Path to the dataset file. For packed datasets, this should be a .npy file.
-        tokenizer (MegatronTokenizer): The tokenizer to use for tokenizing the data.
+        tokenizer (MegatronTokenizerBase): The tokenizer to use for tokenizing the data.
         seq_length (int, optional): Maximum sequence length for each example. Defaults to 2048.
         add_bos (bool, optional): Whether to add a beginning-of-sentence token. Defaults to False.
         add_eos (bool, optional): Whether to add an end-of-sentence token. Defaults to True.
@@ -182,7 +182,7 @@ class GPTSFTDataset(Dataset):
     def __init__(
         self,
         file_path: str,
-        tokenizer: MegatronTokenizer,
+        tokenizer: MegatronTokenizerBase,
         max_seq_length: int = 1024,
         min_seq_length: int = 1,
         pad_seq_length_to_mult: int = 16,
@@ -219,7 +219,7 @@ def __init__(
             Q: What did the math of artificial viscosity do?',
             'output': 'smoothed the shock transition without sacrificing basic physics'
             }
-        tokenizer: Tokenizer for the dataset. Instance of a class that inherits MegatronTokenizer (ex: SentencePiece).
+        tokenizer: Tokenizer for the dataset. Instance of a class that inherits MegatronTokenizerBase (ex: SentencePiece).
         max_seq_length (int): maximum sequence length for each dataset examples.
             Examples will either be truncated to fit this length or dropped if they cannot be truncated.
         min_seq_length (int): min length of each data example in the dataset.
@@ -724,7 +724,7 @@ class GPTSFTPackedDataset(GPTSFTDataset):
     def __init__(
         self,
         file_path: str,
-        tokenizer: MegatronTokenizer,
+        tokenizer: MegatronTokenizerBase,
         return_cu_seqlen: bool = True,
         pad_cu_seqlens: bool = False,
         pack_metadata_file_path: Optional[str] = None,
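Note: apart from the tokenizer annotation, create_sft_dataset keeps its signature. A hedged usage sketch with placeholder values, using only the parameters visible in this diff:

from pathlib import Path

from megatron.bridge.data.datasets.sft import create_sft_dataset

dataset = create_sft_dataset(
    path=Path("/data/train.jsonl"),  # placeholder path to a JSONL SFT file
    tokenizer=tokenizer,             # any megatron.core MegatronTokenizerBase instance
    seq_length=2048,
    add_bos=False,
    add_eos=True,
)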
10 changes: 5 additions & 5 deletions src/megatron/bridge/data/datasets/utils.py
@@ -25,9 +25,9 @@
 
 import numpy as np
 import torch
+from megatron.core.tokenizers import MegatronTokenizerBase
 from torch.utils.data import Dataset
 
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
 from megatron.bridge.utils.common_utils import get_rank_safe
 
 
@@ -90,7 +90,7 @@ def __init__(
         newline_int: Optional[int] = 10,
         header_lines: Optional[int] = 0,
         workers: Optional[int] = None,
-        tokenizer: Optional[Type["MegatronTokenizer"]] = None,
+        tokenizer: Optional[Type["MegatronTokenizerBase"]] = None,
         build_index_fn: Optional[Callable[[str, Optional[int]], bool]] = build_index_from_memdata,
         sort_dataset_paths: Optional[bool] = True,
         index_mapping_dir: Optional[str] = None,
@@ -314,7 +314,7 @@ def __init__(
         newline_int: Optional[int] = 10,
         header_lines: Optional[int] = 0,
         workers: Optional[int] = None,
-        tokenizer: Optional[Type["MegatronTokenizer"]] = None,
+        tokenizer: Optional[Type["MegatronTokenizerBase"]] = None,
         sort_dataset_paths: Optional[bool] = True,
         index_mapping_dir: Optional[str] = None,
     ):
@@ -736,7 +736,7 @@ def _make_indexed_dataset_compatibility(dataset):
 
 def _preprocess(
     source: dict,
-    tokenizer: MegatronTokenizer,
+    tokenizer: MegatronTokenizerBase,
     name_end_token_ids: int,
     label_start_ids: list,
     special_tokens: dict,
@@ -827,7 +827,7 @@ def _mask_targets(
         speakers (List[str]): array of speakers of each turns
         header_len (int): the system prompt length
         s_ids (List[Tensor]): array of tokenized ids of each turns
-        tokenizer (MegatronTokenizer): tokenizer object
+        tokenizer (MegatronTokenizerBase): tokenizer object
         mask_role (str): the speaker id to be masked from loss computation.
         gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT'
         name_end_token_ids (int): end of name token ids
6 changes: 3 additions & 3 deletions src/megatron/bridge/data/hf_processors/squad.py
@@ -16,12 +16,12 @@
 
 from typing import Any, Optional
 
-from megatron.bridge.data.builders.hf_dataset import ProcessExampleOutput
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
+from megatron.core.tokenizers import MegatronTokenizerBase
 
+from megatron.bridge.data.builders.hf_dataset import ProcessExampleOutput
 
 def process_squad_example(
-    example: dict[str, Any], tokenizer: Optional[MegatronTokenizer] = None
+    example: dict[str, Any], tokenizer: Optional[MegatronTokenizerBase] = None
 ) -> ProcessExampleOutput:
     """Process a single Squad example into the required format.
 

Expand Down
10 changes: 5 additions & 5 deletions src/megatron/bridge/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset, MockGPTDataset
from megatron.core.tokenizers import MegatronTokenizerBase

from megatron.bridge.data.builders.finetuning_dataset import FinetuningDatasetBuilder
from megatron.bridge.data.builders.hf_dataset import HFDatasetBuilder, HFDatasetConfig
Expand All @@ -28,7 +29,6 @@
GPTDatasetConfig,
MockGPTDatasetConfig,
)
from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
from megatron.bridge.utils.common_utils import print_rank_0


Expand Down Expand Up @@ -79,7 +79,7 @@ def pretrain_train_valid_test_datasets_provider(


def hf_train_valid_test_datasets_provider(
train_val_test_num_samples: list[int], dataset_config: HFDatasetConfig, tokenizer: MegatronTokenizer
train_val_test_num_samples: list[int], dataset_config: HFDatasetConfig, tokenizer: MegatronTokenizerBase
) -> tuple[Any, Any, Any]:
"""Build train, validation, and test datasets from a Hugging Face dataset.

Expand All @@ -89,7 +89,7 @@ def hf_train_valid_test_datasets_provider(
train_val_test_num_samples: A list containing the number of samples for
train, validation, and test datasets.
dataset_config: Configuration object for the Hugging Face dataset.
tokenizer: The MegatronTokenizer instance.
tokenizer: The MegatronTokenizerBase instance.

Returns:
A tuple containing the train, validation, and test datasets.
Expand All @@ -113,7 +113,7 @@ def hf_train_valid_test_datasets_provider(


def finetuning_train_valid_test_datasets_provider(
train_val_test_num_samples: list[int], dataset_config: FinetuningDatasetConfig, tokenizer: MegatronTokenizer
train_val_test_num_samples: list[int], dataset_config: FinetuningDatasetConfig, tokenizer: MegatronTokenizerBase
) -> tuple[Any, Any, Any]:
"""Build finetuning train, validation, and test datasets.

Expand All @@ -123,7 +123,7 @@ def finetuning_train_valid_test_datasets_provider(
train_val_test_num_samples: A list containing the number of samples for
train, validation, and test datasets.
dataset_config: Configuration object for the finetuning dataset.
tokenizer: The MegatronTokenizer instance.
tokenizer: The MegatronTokenizerBase instance.

Returns:
A tuple containing the train, validation, and test datasets.
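Note: both providers keep the (num_samples, config, tokenizer) calling convention; only the tokenizer annotation moves to megatron.core. A hedged sketch of a call site, with placeholder values:

from megatron.bridge.data.utils import finetuning_train_valid_test_datasets_provider

train_ds, valid_ds, test_ds = finetuning_train_valid_test_datasets_provider(
    train_val_test_num_samples=[1000, 100, 100],  # placeholder sample counts
    dataset_config=dataset_config,  # a FinetuningDatasetConfig instance
    tokenizer=tokenizer,            # any megatron.core MegatronTokenizerBase instance
)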
5 changes: 4 additions & 1 deletion src/megatron/bridge/recipes/llama/llama2_7b.py
@@ -200,7 +200,10 @@ def pretrain_config(
             log_interval=10,
             tensorboard_dir=tensorboard_dir,
         ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE),
+        tokenizer=TokenizerConfig(
+            metadata_path={"library": "null"},
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
+        ),
         checkpoint=CheckpointConfig(
             save_interval=2000,
             save=checkpoint_dir,
5 changes: 4 additions & 1 deletion src/megatron/bridge/recipes/llama/llama31_405b.py
@@ -212,7 +212,10 @@ def pretrain_config(
             log_interval=10,
             tensorboard_dir=tensorboard_dir,
         ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE),
+        tokenizer=TokenizerConfig(
+            metadata_path={"library": "null"},
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
+        ),
         checkpoint=CheckpointConfig(
             save_interval=2000,
             save=checkpoint_dir,
5 changes: 4 additions & 1 deletion src/megatron/bridge/recipes/llama/llama31_70b.py
@@ -200,7 +200,10 @@ def pretrain_config(
             log_interval=10,
             tensorboard_dir=tensorboard_dir,
         ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE),
+        tokenizer=TokenizerConfig(
+            metadata_path={"library": "null"},
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
+        ),
         checkpoint=CheckpointConfig(
             save_interval=2000,
             save=checkpoint_dir,
5 changes: 4 additions & 1 deletion src/megatron/bridge/recipes/llama/llama31_8b.py
@@ -197,7 +197,10 @@ def pretrain_config(
             log_interval=10,
             tensorboard_dir=tensorboard_dir,
         ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE),
+        tokenizer=TokenizerConfig(
+            metadata_path={"library": "null"},
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
+        ),
         checkpoint=CheckpointConfig(
             save_interval=2000,
             save=checkpoint_dir,
5 changes: 4 additions & 1 deletion src/megatron/bridge/recipes/llama/llama32_1b.py
@@ -199,7 +199,10 @@ def pretrain_config(
             log_interval=10,
             tensorboard_dir=tensorboard_dir,
         ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE),
+        tokenizer=TokenizerConfig(
+            metadata_path={"library": "null"},
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
+        ),
         checkpoint=CheckpointConfig(
             save_interval=2000,
             save=checkpoint_dir,
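Note: all five recipes apply the same swap, replacing the tokenizer_type string selector with a metadata_path dict keyed by library. Any downstream config still building the old form needs the equivalent change; a sketch of the before/after, assuming only the TokenizerConfig fields shown in these diffs:

# Before: TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE)
# After, as in this PR:
tokenizer = TokenizerConfig(
    metadata_path={"library": "null"},
    vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
)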