Adds mixing loader for FSL datasets #70

Merged: 61 commits, Nov 1, 2024

Commits
a8b5ec6  WIP: Generate a mixture dataset (undfined, Oct 18, 2024)
637fee9  WIP: Adds dry run (undfined, Oct 18, 2024)
346135c  Test cleanup (undfined, Oct 18, 2024)
53def38  WIP: Make it fast (undfined, Oct 19, 2024)
8649cc8  WIP: Simple benchmark (undfined, Oct 19, 2024)
e3d7011  WIP: Refactor (undfined, Oct 23, 2024)
efe766b  Launch script (undfined, Oct 23, 2024)
5dff40c  temp changes to test (undfined, Oct 23, 2024)
2703538  deps for now (undfined, Oct 23, 2024)
3ee3278  Try with session (undfined, Oct 23, 2024)
cd1c6d2  Try internal client (undfined, Oct 23, 2024)
9895b23  Try boto3 (undfined, Oct 23, 2024)
3c15f52  Fixes (undfined, Oct 24, 2024)
0c9355b  ? (undfined, Oct 24, 2024)
abb362a  Cleanup + session stuff (undfined, Oct 24, 2024)
82a1af9  Use environ (undfined, Oct 24, 2024)
d0a80ba  JUST use env vars please boto (undfined, Oct 24, 2024)
e621f8e  No unions of containers (undfined, Oct 24, 2024)
0689c42  prepare first (undfined, Oct 24, 2024)
8ab2e99  Loader handles prepare (undfined, Oct 24, 2024)
dcfda67  Try recording torch exceptions (undfined, Oct 24, 2024)
23a0806  Don't need overrides (undfined, Oct 24, 2024)
8cfa282  Figure out why config/creds are missing (undfined, Oct 24, 2024)
fd1a508  fmt (undfined, Oct 24, 2024)
01a40ea  Env not ready yet (undfined, Oct 24, 2024)
8bde2b3  print beaker user (undfined, Oct 24, 2024)
ae208f6  uncomment eval file (undfined, Oct 24, 2024)
ce9d06f  replicate CommonComponents setup (undfined, Oct 24, 2024)
d1eb4df  Some class init stuff (undfined, Oct 24, 2024)
dbce279  Some more config logging (undfined, Oct 24, 2024)
5daa274  Conflict in CHANGELOG (undfined, Oct 24, 2024)
980e05a  checks cleanup (undfined, Oct 24, 2024)
f27bd73  Fixes for duplicate paths in mixture (undfined, Oct 25, 2024)
c69b228  In case there a ton of files (undfined, Oct 25, 2024)
18efafd  Maybe fix trainer launch (undfined, Oct 25, 2024)
5ceed46  Match other example (undfined, Oct 25, 2024)
4c7513e  More tests (undfined, Oct 25, 2024)
8401580  Try diff gpus (undfined, Oct 25, 2024)
0d77422  keep fsdp (undfined, Oct 25, 2024)
d22ed10  checks (undfined, Oct 25, 2024)
68a4d28  Less tokens (undfined, Oct 25, 2024)
c35514c  Exclude ai2/allennlp-elanding-a100-40g temp (undfined, Oct 25, 2024)
02cb49b  Merge branch 'main' of github.com:allenai/OLMo-core into undfined/mix… (undfined, Oct 28, 2024)
c453e65  Feedback (undfined, Oct 28, 2024)
a288d9e  Drop examples (undfined, Oct 28, 2024)
9c49f25  A bit more cleanup (undfined, Oct 28, 2024)
89504bc  Outdated changelog (undfined, Oct 28, 2024)
3aa5c35  Unused deps (undfined, Oct 28, 2024)
8f729dd  One more dep (undfined, Oct 28, 2024)
a848195  uncomment test assertions (undfined, Oct 28, 2024)
5322bf1  Drop todo (undfined, Oct 28, 2024)
5c22665  0 is an invalid token (undfined, Oct 28, 2024)
87e9168  More feedback (undfined, Oct 29, 2024)
fe50a32  Randomly sample instances when segmenting (undfined, Oct 29, 2024)
cffcba3  Memray + limit marker (undfined, Oct 30, 2024)
293be02  Add dep (undfined, Oct 30, 2024)
e45d2c3  Lint (undfined, Oct 30, 2024)
ffe7660  Bigger array is more informative (undfined, Oct 30, 2024)
19db2a9  Merge branch 'main' into undfined/mixing-loader (undfined, Oct 30, 2024)
1fdb995  Feedback (undfined, Oct 30, 2024)
74d4c5a  Merge branch 'main' into undfined/mixing-loader (undfined, Nov 1, 2024)
Files changed (diff shown from 1 commit)
src/olmo_core/data/numpy_dataset.py (5 changes: 4 additions & 1 deletion)
@@ -515,6 +515,7 @@ def __init__(
         self,
         *paths: PathOrStr,
         path_offset_index: Dict[Tuple[str, int], int],
+        seed: int,
         sequence_length: int,
         pad_token_id: int,
         eos_token_id: int,
@@ -565,6 +566,7 @@ def __init__(
         self._instances_per_bucket: Optional[Tuple[Tuple[int, int], ...]] = None
         self._path_offset_index = path_offset_index
         self._bust_index_cache = bust_index_cache
+        self._seed = seed

     def prepare(self):
         if self.fs_local_rank == 0:
@@ -612,7 +614,7 @@ def _write_document_indices(self):
                    eos_token_id=self.eos_token_id,
                    dtype=self.dtype,
                    indices_dtype=self.dtype,
-                   max_instances=max_instances,
+                   sample=(max_instances, self._seed),
                )
                futures.append(future)

@@ -1694,6 +1696,7 @@ def build(self) -> NumpyDatasetBase:
         mixture = self.source_mixture_config.build()
         return NumpyFSLDatasetMixture(
             *mixture.to_paths(),
+            seed=mixture.seed,
             sequence_length=self.sequence_length,
             max_target_sequence_length=self.max_target_sequence_length,
             pad_token_id=self.tokenizer.pad_token_id,
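
Note: the change above threads a seed from the built mixture into the FSL dataset and forwards it, together with the per-path instance cap, as a sample=(max_instances, seed) tuple when document indices are written. The following is a minimal standalone sketch of that pattern; MixtureDatasetSketch and write_indices are hypothetical names for illustration, not OLMo-core APIs.

    # Hypothetical sketch of the seed-threading pattern shown in the diff above.
    # The names here (MixtureDatasetSketch, write_indices) are illustrative only.
    from typing import Optional, Tuple


    def write_indices(path: str, sample: Optional[Tuple[int, int]] = None) -> None:
        # Stand-in for the index-writing worker: receives (max_instances, seed).
        print(f"writing instance indices for {path}, sample={sample}")


    class MixtureDatasetSketch:
        def __init__(self, *paths: str, seed: int, max_instances: int) -> None:
            self._paths = list(paths)
            self._seed = seed  # stored once at construction time...
            self._max_instances = max_instances

        def prepare(self) -> None:
            for path in self._paths:
                # ...and forwarded as a (max_instances, seed) tuple downstream.
                write_indices(path, sample=(self._max_instances, self._seed))


    MixtureDatasetSketch("a.npy", "b.npy", seed=42, max_instances=100).prepare()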
src/olmo_core/data/source_mixture.py (3 changes: 2 additions & 1 deletion)
@@ -91,6 +91,7 @@ class SourceMixtureDataset:
     A dataset consisting of a fractionalized mixture of data sources.
     """

+    seed: int
     sources: List[SourceMixtureOutcome]

     def to_index(self) -> Dict[Tuple[str, int], int]:
@@ -196,7 +197,7 @@ def build(self) -> SourceMixtureDataset:
            for item in outcome.path_tokens:
                log.info(f"Selected {item.tokens} tokens from {outcome.name} at {item.path}")

-        return SourceMixtureDataset(completed)
+        return SourceMixtureDataset(seed=self.seed, sources=completed)

     def get_paths_and_tokens_for_source(
         self, source_config: SourceMixtureConfig, token_details: SourceTokenDetails
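
Note: SourceMixtureDataset now carries the seed alongside its sources, and build() constructs it with explicit keyword arguments. A tiny self-contained approximation follows, with the source list simplified to plain strings instead of SourceMixtureOutcome objects.

    # Simplified stand-in for the dataclass change above; sources are plain
    # strings here rather than SourceMixtureOutcome objects.
    from dataclasses import dataclass
    from typing import List


    @dataclass
    class SourceMixtureDatasetSketch:
        """A dataset consisting of a fractionalized mixture of data sources."""

        seed: int
        sources: List[str]


    # The builder passes the seed through explicitly via keyword arguments.
    dataset = SourceMixtureDatasetSketch(seed=42, sources=["source_a", "source_b"])
    print(dataset.seed, dataset.sources)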
src/olmo_core/data/utils.py (13 changes: 10 additions & 3 deletions)
@@ -412,25 +412,32 @@ def segment_documents_into_instances(
     indices_dtype: Union[
         Type[np.uint8], Type[np.uint16], Type[np.uint32], Type[np.uint64]
     ] = np.uint32,
-    max_instances: Optional[int] = None,
+    sample: Optional[Tuple[int, int]] = None,
 ) -> Tuple[int, int]:
     """
     Segment documents into instances of at most ``sequence_length`` tokens.
     Saving the indices of the instances to ``target``.

+    Sample a subset of the instances if ``sample`` is provided as a tuple of ``(max_instances, seed)``.
+
     Returns the number of original documents and the number of resulting instances documents.
     """
     total_og_docs = 0
     indices: List[int] = []
     for start_idx, end_idx in iter_document_indices(path, eos_token_id=eos_token_id, dtype=dtype):
-        if max_instances is not None and len(indices) // 2 >= max_instances:
-            break
         total_og_docs += 1
         length = end_idx - start_idx
         indices.append(start_idx)
         indices.append(start_idx + min(length, max_sequence_length))
         start_idx += length

+    if sample is not None:
+        max_instances, seed = sample
+        rng = get_rng(seed)
+        indices = (
+            rng.choice(np.array(indices).reshape(-1, 2).tolist(), size=max_instances).flatten()
+        ).tolist()
+
     with memmap_to_write(target, dtype=indices_dtype, shape=(len(indices),)) as indices_mmap:
         indices_mmap[:] = indices

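
Note: the new sample branch reduces the instance list to at most max_instances randomly chosen (start, end) pairs using a seeded RNG. Below is a minimal, self-contained sketch of that idea, not the OLMo-core implementation; sample_instance_indices is a hypothetical name and np.random.default_rng stands in for get_rng.

    # Standalone sketch: sample max_instances (start, end) pairs with a seeded
    # NumPy Generator, then flatten back to a flat index list.
    from typing import List

    import numpy as np


    def sample_instance_indices(indices: List[int], max_instances: int, seed: int) -> List[int]:
        pairs = np.array(indices).reshape(-1, 2)  # one row per (start, end) instance
        rng = np.random.default_rng(seed)         # stand-in for get_rng(seed)
        # Generator.choice samples rows (axis=0) of a 2-D array; replace=True is NumPy's default.
        sampled = rng.choice(pairs, size=max_instances)
        return sampled.flatten().tolist()


    flat = [0, 10, 10, 25, 25, 31, 31, 50]  # four instances as (start, end) pairs
    print(sample_instance_indices(flat, max_instances=2, seed=42))
    # The same seed yields the same sample, keeping index writing deterministic.
    print(sample_instance_indices(flat, max_instances=2, seed=42))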