Skip to content

Commit

Permalink
added a warning
Browse files (browse the repository at this point in the history)
  • Loading branch information
soldni committed Jan 20, 2024
1 parent 62f4008 commit ce71dcf
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions python/dolma/cli/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -49,6 +49,13 @@ def __post__init__(self):
logger.warning("No pad token ID provided; using EOS token ID.")
self.pad_token_id = self.eos_token_id

if self.segment_before_tokenization:
logger.warning(
"EXPERIMENTAL FEATURE: segmenting before tokenization is enabled. "
"This option has only been tested with Llama and GPT-NeoX tokenizers. "
"USE AT YOUR OWN RISK."
)

@classmethod
def deprecated_init(cls, tokenizer_name_or_path: str) -> "TokenizerConfig":
logger = get_logger(__file__)
Expand Down

0 comments on commit ce71dcf

Please sign in to comment.