Commit 45b5eea

BOS/EOS/PAD options in tokens cli; speed up tokenization by segmenting paragraphs. (#102)

* fixes for tokenizer

* added test for corner case

* added a warning

* fix import order
soldni authored Jan 20, 2024
1 parent f294ab1 commit 45b5eea
Showing 10 changed files with 194,293 additions and 28 deletions.
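
The net effect of this change is a new tokenizer sub-config for the dolma tokens command, surfaced on the command line as --tokenizer.name_or_path, --tokenizer.eos_token_id, --tokenizer.pad_token_id (and, presumably, --tokenizer.segment_before_tokenization). As orientation, a minimal sketch of building the same configuration programmatically, assuming the dataclasses defined below behave like ordinary dataclasses; the tokenizer name and token IDs are only illustrative values taken from defaults that appear in this diff:

from dolma.cli.tokenizer import TokenizerConfig

tok_cfg = TokenizerConfig(
    name_or_path="allenai/gpt-neox-olmo-dolma-v1_5",   # example tokenizer
    bos_token_id=None,                 # no BOS prepended; __post_init__ logs a warning
    eos_token_id=50279,
    pad_token_id=1,
    segment_before_tokenization=True,  # experimental: split long docs by paragraph first
)
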
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "dolma"
version = "0.9.2"
version = "0.9.3"
edition = "2021"
license = "Apache-2.0"

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "0.9.2"
version = "0.9.3"
description = "Data filters"
license = {text = "Apache-2.0"}
readme = "README.md"
@@ -190,7 +190,7 @@ recursive = true
aggressive = 3

[tool.mypy]
python_version = 3.8
python_version = "3.8"
ignore_missing_imports = true
no_site_packages = true
allow_redefinition = false
95 changes: 89 additions & 6 deletions python/dolma/cli/tokenizer.py
@@ -11,6 +11,75 @@

@dataclass
class TokenizerConfig:
name_or_path: Optional[str] = field(
default=None,
help="Name or path of the tokenizer to use. Must be a HuggingFace-compatible tokenizer. Required.",
)
bos_token_id: Optional[int] = field(
default=None, help="The token ID corresponding to the 'beginning-of-sentence' token."
)
eos_token_id: Optional[int] = field(
default=None,
help="The token ID corresponding to the 'end-of-sentence' token.",
)
pad_token_id: Optional[int] = field(
default=None,
help="The token ID corresponding to the 'padding' token.",
)
segment_before_tokenization: bool = field(
default=False,
help=(
"Whether to segment documents by paragraph before tokenization. "
"This is useful for tokenizers like Llama that are very slow on long documents. "
"Might not be needed once this bugfix is merged https://github.com/huggingface/tokenizers/pull/1413"
),
)

def __post_init__(self):
logger = get_logger(__file__)

if self.eos_token_id is None:
logger.warning("NO EOS TOKEN PROVIDED. Are you sure this is what you want?")

if self.bos_token_id is None:
logger.warning("NO BOS TOKEN PROVIDED. Are you sure this is what you want?")

if self.pad_token_id is None:
logger.warning("No pad token ID provided; using EOS token ID.")
self.pad_token_id = self.eos_token_id

if self.segment_before_tokenization:
logger.warning(
"EXPERIMENTAL FEATURE: segmenting before tokenization is enabled. "
"This option has only been tested with Llama and GPT-NeoX tokenizers. "
"USE AT YOUR OWN RISK."
)

@classmethod
def deprecated_init(cls, tokenizer_name_or_path: str) -> "TokenizerConfig":
logger = get_logger(__file__)
logger.warning(
"The `tokenizer_name_or_path` argument is deprecated and will be removed in a future release. "
"Please use --tokenizer.name_or_path, and provide --tokenizer.eos_token_id as well "
"(and, optionally, --tokenizer.pad_token_id)."
)
from tokenizers import Tokenizer

# Before the eos_token_id and pad_token_id options were added, EOS was set to the last
# token in the vocab; we do the same here to maintain backward compatibility.
legacy_tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
old_eos_token_id = len(legacy_tokenizer.get_vocab()) - 1

return cls(
name_or_path=tokenizer_name_or_path,
eos_token_id=old_eos_token_id,
segment_before_tokenization=False,
)


@dataclass
class TokenizationConfig:
documents: List[str] = field(
default=[],
help=(
@@ -27,7 +96,11 @@ class TokenizerConfig:
)
tokenizer_name_or_path: Optional[str] = field(
default=None,
help="Name or path of the tokenizer to use. Must be a HuggingFace-compatible tokenizer. Required.",
help="Deprecated. Use --tokenizer.name_or_path instead.",
)
tokenizer: Optional[TokenizerConfig] = field(
default=None,
help="Configuration for the tokenizer.",
)
processes: int = field(
default=1,
@@ -66,11 +139,11 @@ class TokenizerConfig:


class TokenizerCli(BaseCli):
CONFIG = TokenizerConfig
CONFIG = TokenizationConfig
DESCRIPTION = "Tokenize documents using the provided tokenizer."

@classmethod
def run(cls, parsed_config: TokenizerConfig):
def run(cls, parsed_config: TokenizationConfig):
logger = get_logger("tagger")

with make_workdirs(parsed_config.work_dir) as work_dirs:
@@ -97,8 +170,15 @@ def run(cls, parsed_config: TokenizerConfig):
if parsed_config.destination is None:
raise DolmaConfigError("Destination must be provided.")

if parsed_config.tokenizer_name_or_path is None:
raise DolmaConfigError("Tokenizer ID must be provided.")
# must handle new and deprecated way to get tokenizer config
if parsed_config.tokenizer is None:
if parsed_config.tokenizer_name_or_path is None:
raise DolmaConfigError("Tokenizer configuration is missing.")
else:
parsed_config.tokenizer = TokenizerConfig.deprecated_init(parsed_config.tokenizer_name_or_path)

if parsed_config.tokenizer.name_or_path is None:
raise DolmaConfigError("Tokenizer name or path must be provided.")

tokenize_in_parallel(
sources=documents,
@@ -107,7 +187,10 @@ def run(cls, parsed_config: TokenizerConfig):
num_readers=parsed_config.files_per_process,
local_shuffle=parsed_config.batch_size,
ring_size=parsed_config.ring_size,
tokenizer_name_or_path=parsed_config.tokenizer_name_or_path,
tokenizer_name_or_path=parsed_config.tokenizer.name_or_path,
eos_token_id=parsed_config.tokenizer.eos_token_id,
pad_token_id=parsed_config.tokenizer.pad_token_id,
segment_before_tokenization=parsed_config.tokenizer.segment_before_tokenization,
seed=parsed_config.seed,
metadata_dir=work_dirs.output,
max_size=parsed_config.max_size,
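
For reference, when only the deprecated tokenizer_name_or_path flag is given, run() now falls back to TokenizerConfig.deprecated_init, which reproduces the old behavior of treating the last vocabulary entry as EOS. A hedged sketch of that path (the tokenizer name is just an example, and the call fetches the tokenizer from the HuggingFace hub):

from dolma.cli.tokenizer import TokenizerConfig

# Deprecated path: only a name/path is known, so EOS is inferred as the last vocab id,
# and __post_init__ then falls back to using the EOS id as the pad id as well.
cfg = TokenizerConfig.deprecated_init("allenai/eleuther-ai-gpt-neox-20b-pii-special")
print(cfg.eos_token_id, cfg.pad_token_id)  # both equal len(vocab) - 1 on this path
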
6 changes: 5 additions & 1 deletion python/dolma/core/utils.py
@@ -61,12 +61,16 @@ def split_paragraphs(text: str, remove_empty: bool = True) -> List[TextSlice]:
"""
Split a string into paragraphs. A paragraph is defined as a sequence of zero or more characters, followed
by a newline character, or a sequence of one or more characters, followed by the end of the string.
Args:
text (str): The text to split into paragraphs.
remove_empty (bool): Whether to remove empty paragraphs. Defaults to True.
"""
text_slices = [
TextSlice(doc=text, start=match.start(), end=match.end())
for match in re.finditer(r"([^\n]*\n|[^\n]+$)", text)
]
if remove_empty is True:
if remove_empty:
text_slices = [text_slice for text_slice in text_slices if text_slice.text.strip()]
return text_slices
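
The regex above produces one slice per line, each including its trailing newline, and remove_empty drops whitespace-only slices; presumably this is the splitting behind the new segment_before_tokenization option. A small sketch of the expected behavior:

from dolma.core.utils import split_paragraphs

text = "first paragraph\n\nsecond paragraph"
print([s.text for s in split_paragraphs(text)])
# expected: ['first paragraph\n', 'second paragraph']
print([s.text for s in split_paragraphs(text, remove_empty=False)])
# expected: ['first paragraph\n', '\n', 'second paragraph']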

27 changes: 25 additions & 2 deletions python/dolma/tokenizer/executor.py
@@ -11,6 +11,7 @@
import numpy as np
from typing_extensions import TypeAlias

from ..core.loggers import get_logger
from ..core.parallel import BaseParallelProcessor, QueueType
from ..core.paths import glob_path, join_path, mkdir_p
from .data_types import TokenizerOutput
@@ -38,6 +39,8 @@ def increment_progressbar( # type: ignore[override]

@classmethod
def process_single(cls, source_path: str, destination_path: str, queue: QueueType, **kwargs: Any):
logger = get_logger(__name__)

max_size: int = kwargs.pop("max_size", 1024 * 1024 * 1024)
dtype: np.dtype = np.dtype(kwargs.pop("dtype", "uint16"))
local_shuffle: int = kwargs.pop("local_shuffle", 10_000)
@@ -52,13 +55,27 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueType, **kwargs: Any):
if tokenizer_name_or_path is None:
raise RuntimeError("tokenizer_name_or_path not provided")

eos_token_id = kwargs.pop("eos_token_id", None)
if eos_token_id is None:
raise ValueError("eos_token_id not provided")

pad_token_id = kwargs.pop("pad_token_id", None)
if pad_token_id is None:
logger.warning("pad_token_id not provided, using eos_token_id")
pad_token_id = eos_token_id

# this is useful for making sure the queue does not grow too much
cpu_count = multiprocessing.cpu_count()

documents_cnt = tokens_cnt = 0
update_interval = 1
mm_cnt = 0

tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
tokenizer = Tokenizer.from_pretrained(
tokenizer_name_or_path,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
)
tokenizer_ring = []
for _ in range(min(ring_size, len(source_paths))):
path = source_paths.pop()
@@ -185,7 +202,10 @@ def tokenize_in_parallel(
num_readers: Optional[int] = None,
local_shuffle: int = 10_000,
ring_size: int = 8,
tokenizer_name_or_path: str = "allenai/eleuther-ai-gpt-neox-20b-pii-special",
tokenizer_name_or_path: str = "allenai/gpt-neox-olmo-dolma-v1_5",
eos_token_id: Optional[int] = 50279,
pad_token_id: Optional[int] = 1,
segment_before_tokenization: bool = False,
seed: int = 3920,
metadata_dir: Optional[str] = None,
max_size: int = 1024 * 1024 * 1024,
@@ -222,5 +242,8 @@ def tokenize_in_parallel(
ring_size=ring_size,
max_size=max_size,
dtype=dtype,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
segment_docs_before_tokenization=segment_before_tokenization,
tokenizer_name_or_path=tokenizer_name_or_path,
)
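
Putting the pieces together, a hedged sketch of calling tokenize_in_parallel directly with the new token-ID parameters. Only keyword names visible in this diff are used; the paths are hypothetical, and the destination argument name is an assumption since that part of the signature is not shown in this hunk:

from dolma.tokenizer.executor import tokenize_in_parallel

tokenize_in_parallel(
    sources=["/data/documents/*.jsonl.gz"],      # hypothetical input glob
    destination="/data/tokens",                  # assumed argument name (not shown above)
    tokenizer_name_or_path="allenai/gpt-neox-olmo-dolma-v1_5",
    eos_token_id=50279,
    pad_token_id=1,
    segment_before_tokenization=True,            # experimental paragraph segmentation
    seed=3920,
    max_size=1024 * 1024 * 1024,                 # 1 GiB per output file
)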