From 052d318af167279092e9de44147a890197800a56 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia
Date: Fri, 21 Jun 2024 15:53:40 -0700
Subject: [PATCH 1/2] instance filter test

---
 conftest.py                       |  3 +++
 tests/data/collator_test.py       | 18 ++++++++++++++++++
 tests/data/memmap_dataset_test.py | 17 +++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/conftest.py b/conftest.py
index a1ab2aed5..2eb17aa01 100644
--- a/conftest.py
+++ b/conftest.py
@@ -11,6 +11,7 @@
     SchedulerConfig,
     TokenizerConfig,
     TrainConfig,
+    InstanceFilterConfig,
 )
 
 from olmo.tokenizer import Tokenizer
@@ -72,6 +73,8 @@ def train_config(tmp_path, model_config) -> TrainConfig:
                 "test_fixtures/c4-sample.03.json.gz",
             ],
             pad_direction=PaddingDirection.right,
+            instance_filter=InstanceFilterConfig(repetition_max_period=3, repetition_min_period=1,
+                                                 repetition_max_count=3),
         ),
         tokenizer=TokenizerConfig(identifier=TEST_MODEL),
         save_folder=str(tmp_path / "checkpoints"),
diff --git a/tests/data/collator_test.py b/tests/data/collator_test.py
index e94451313..4e676d563 100644
--- a/tests/data/collator_test.py
+++ b/tests/data/collator_test.py
@@ -129,3 +129,21 @@ def test_collate_with_label_mask(train_config, pad_direction):
             [[True, False, True, True], [False, True, True, False]],
         )
     ).all()
+
+
+@pytest.mark.parametrize(
+    "pad_direction",
+    [pytest.param(PaddingDirection.right, id="pad-right"), pytest.param(PaddingDirection.left, id="pad-left")],
+)
+def test_collate_with_instance_filter(train_config, pad_direction):
+    train_config.data.pad_direction = pad_direction
+    collator = DataCollator.from_train_config(train_config)
+
+    inputs = [torch.tensor([0, 0, 2, 3]), torch.tensor([1, 1, 1])]
+    batch = collator(inputs)
+    assert batch["input_ids"].shape == (2, 4)
+    if pad_direction == "right":
+        assert batch["input_ids"][1][-1] == train_config.model.pad_token_id
+    else:
+        assert batch["input_ids"][1][0] == train_config.model.pad_token_id
+
diff --git a/tests/data/memmap_dataset_test.py b/tests/data/memmap_dataset_test.py
index e267043ee..01c9e0d5d 100644
--- a/tests/data/memmap_dataset_test.py
+++ b/tests/data/memmap_dataset_test.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 
+from olmo.config import InstanceFilterConfig
 from olmo.data.memmap_dataset import MemMapDataset
 from olmo.tokenizer import Tokenizer
 
@@ -106,3 +107,19 @@ def test_concat_mmap_datasets(tmp_path: Path):
     # Should get the same with negative index.
     assert ds[-1]["input_ids"].tolist() == [3, 4, 5]
     assert ds[-1]["metadata"]["label"] == "test2"
+
+
+def test_instance_filter(tmp_path: Path):
+    # Write some bad data to disk.
+    mmap = np.memmap(tmp_path / "bad_tokens.npy", dtype=np.uint16, mode="w+", shape=(128,))
+    mmap[:] = list(np.ones(31)) + list(range(64-31)) + list(np.ones(32)) + list(range(64-32))
+    mmap.flush()
+
+    instance_filter_config = InstanceFilterConfig(repetition_min_period=1, repetition_max_period=13, repetition_max_count=32)
+    ds = MemMapDataset(tmp_path / "bad_tokens.npy", chunk_size=64, instance_filter_config=instance_filter_config)
+
+    out = ds[0]
+    assert out["instance_mask"] is True
+
+    out = ds[1]
+    assert out["instance_mask"] is False

From 4ea61ae0eaff884639e8a6ed880828e3e145bdff Mon Sep 17 00:00:00 2001
From: Akshita Bhagia
Date: Wed, 17 Jul 2024 14:43:24 -0700
Subject: [PATCH 2/2] isort

---
 conftest.py                       | 7 ++++---
 tests/data/collator_test.py       | 1 -
 tests/data/memmap_dataset_test.py | 6 ++++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/conftest.py b/conftest.py
index 2eb17aa01..ec975c428 100644
--- a/conftest.py
+++ b/conftest.py
@@ -5,13 +5,13 @@
 from olmo.config import (
     DataConfig,
     InitFnType,
+    InstanceFilterConfig,
     ModelConfig,
     OptimizerConfig,
     PaddingDirection,
     SchedulerConfig,
     TokenizerConfig,
     TrainConfig,
-    InstanceFilterConfig,
 )
 
 from olmo.tokenizer import Tokenizer
@@ -73,8 +73,9 @@ def train_config(tmp_path, model_config) -> TrainConfig:
                 "test_fixtures/c4-sample.03.json.gz",
             ],
             pad_direction=PaddingDirection.right,
-            instance_filter=InstanceFilterConfig(repetition_max_period=3, repetition_min_period=1,
-                                                 repetition_max_count=3),
+            instance_filter=InstanceFilterConfig(
+                repetition_max_period=3, repetition_min_period=1, repetition_max_count=3
+            ),
         ),
         tokenizer=TokenizerConfig(identifier=TEST_MODEL),
         save_folder=str(tmp_path / "checkpoints"),
diff --git a/tests/data/collator_test.py b/tests/data/collator_test.py
index 4e676d563..9c3b60f29 100644
--- a/tests/data/collator_test.py
+++ b/tests/data/collator_test.py
@@ -146,4 +146,3 @@ def test_collate_with_instance_filter(train_config, pad_direction):
         assert batch["input_ids"][1][-1] == train_config.model.pad_token_id
     else:
         assert batch["input_ids"][1][0] == train_config.model.pad_token_id
-
diff --git a/tests/data/memmap_dataset_test.py b/tests/data/memmap_dataset_test.py
index 01c9e0d5d..39579cef1 100644
--- a/tests/data/memmap_dataset_test.py
+++ b/tests/data/memmap_dataset_test.py
@@ -112,10 +112,12 @@ def test_concat_mmap_datasets(tmp_path: Path):
 def test_instance_filter(tmp_path: Path):
     # Write some bad data to disk.
     mmap = np.memmap(tmp_path / "bad_tokens.npy", dtype=np.uint16, mode="w+", shape=(128,))
-    mmap[:] = list(np.ones(31)) + list(range(64-31)) + list(np.ones(32)) + list(range(64-32))
+    mmap[:] = list(np.ones(31)) + list(range(64 - 31)) + list(np.ones(32)) + list(range(64 - 32))
     mmap.flush()
 
-    instance_filter_config = InstanceFilterConfig(repetition_min_period=1, repetition_max_period=13, repetition_max_count=32)
+    instance_filter_config = InstanceFilterConfig(
+        repetition_min_period=1, repetition_max_period=13, repetition_max_count=32
+    )
     ds = MemMapDataset(tmp_path / "bad_tokens.npy", chunk_size=64, instance_filter_config=instance_filter_config)
 
     out = ds[0]