Adds mixing loader for FSL datasets #70

Merged: 61 commits, Nov 1, 2024

Commits
a8b5ec6  WIP: Generate a mixture dataset (undfined, Oct 18, 2024)
637fee9  WIP: Adds dry run (undfined, Oct 18, 2024)
346135c  Test cleanup (undfined, Oct 18, 2024)
53def38  WIP: Make it fast (undfined, Oct 19, 2024)
8649cc8  WIP: Simple benchmark (undfined, Oct 19, 2024)
e3d7011  WIP: Refactor (undfined, Oct 23, 2024)
efe766b  Launch script (undfined, Oct 23, 2024)
5dff40c  temp changes to test (undfined, Oct 23, 2024)
2703538  deps for now (undfined, Oct 23, 2024)
3ee3278  Try with session (undfined, Oct 23, 2024)
cd1c6d2  Try internal client (undfined, Oct 23, 2024)
9895b23  Try boto3 (undfined, Oct 23, 2024)
3c15f52  Fixes (undfined, Oct 24, 2024)
0c9355b  ? (undfined, Oct 24, 2024)
abb362a  Cleanup + session stuff (undfined, Oct 24, 2024)
82a1af9  Use environ (undfined, Oct 24, 2024)
d0a80ba  JUST use env vars please boto (undfined, Oct 24, 2024)
e621f8e  No unions of containers (undfined, Oct 24, 2024)
0689c42  prepare first (undfined, Oct 24, 2024)
8ab2e99  Loader handles prepare (undfined, Oct 24, 2024)
dcfda67  Try recording torch exceptions (undfined, Oct 24, 2024)
23a0806  Don't need overrides (undfined, Oct 24, 2024)
8cfa282  Figure out why config/creds are missing (undfined, Oct 24, 2024)
fd1a508  fmt (undfined, Oct 24, 2024)
01a40ea  Env not ready yet (undfined, Oct 24, 2024)
8bde2b3  print beaker user (undfined, Oct 24, 2024)
ae208f6  uncomment eval file (undfined, Oct 24, 2024)
ce9d06f  replicate CommonComponents setup (undfined, Oct 24, 2024)
d1eb4df  Some class init stuff (undfined, Oct 24, 2024)
dbce279  Some more config logging (undfined, Oct 24, 2024)
5daa274  Conflict in CHANGELOG (undfined, Oct 24, 2024)
980e05a  checks cleanup (undfined, Oct 24, 2024)
f27bd73  Fixes for duplicate paths in mixture (undfined, Oct 25, 2024)
c69b228  In case there a ton of files (undfined, Oct 25, 2024)
18efafd  Maybe fix trainer launch (undfined, Oct 25, 2024)
5ceed46  Match other example (undfined, Oct 25, 2024)
4c7513e  More tests (undfined, Oct 25, 2024)
8401580  Try diff gpus (undfined, Oct 25, 2024)
0d77422  keep fsdp (undfined, Oct 25, 2024)
d22ed10  checks (undfined, Oct 25, 2024)
68a4d28  Less tokens (undfined, Oct 25, 2024)
c35514c  Exclude ai2/allennlp-elanding-a100-40g temp (undfined, Oct 25, 2024)
02cb49b  Merge branch 'main' of github.com:allenai/OLMo-core into undfined/mix… (undfined, Oct 28, 2024)
c453e65  Feedback (undfined, Oct 28, 2024)
a288d9e  Drop examples (undfined, Oct 28, 2024)
9c49f25  A bit more cleanup (undfined, Oct 28, 2024)
89504bc  Outdated changelog (undfined, Oct 28, 2024)
3aa5c35  Unused deps (undfined, Oct 28, 2024)
8f729dd  One more dep (undfined, Oct 28, 2024)
a848195  uncomment test assertions (undfined, Oct 28, 2024)
5322bf1  Drop todo (undfined, Oct 28, 2024)
5c22665  0 is an invalid token (undfined, Oct 28, 2024)
87e9168  More feedback (undfined, Oct 29, 2024)
fe50a32  Randomly sample instances when segmenting (undfined, Oct 29, 2024)
cffcba3  Memray + limit marker (undfined, Oct 30, 2024)
293be02  Add dep (undfined, Oct 30, 2024)
e45d2c3  Lint (undfined, Oct 30, 2024)
ffe7660  Bigger array is more informative (undfined, Oct 30, 2024)
19db2a9  Merge branch 'main' into undfined/mixing-loader (undfined, Oct 30, 2024)
1fdb995  Feedback (undfined, Oct 30, 2024)
74d4c5a  Merge branch 'main' into undfined/mixing-loader (undfined, Nov 1, 2024)
Files changed (diff shown from 1 commit)
src/olmo_core/data/numpy_dataset.py (5 changes: 4 additions & 1 deletion)
@@ -515,6 +515,7 @@ def __init__(
         self,
         *paths: PathOrStr,
         path_offset_index: Dict[Tuple[str, int], int],
+        seed: int,
         sequence_length: int,
         pad_token_id: int,
         eos_token_id: int,
@@ -565,6 +566,7 @@ def __init__(
         self._instances_per_bucket: Optional[Tuple[Tuple[int, int], ...]] = None
         self._path_offset_index = path_offset_index
         self._bust_index_cache = bust_index_cache
+        self._seed = seed

     def prepare(self):
         if self.fs_local_rank == 0:
@@ -612,7 +614,7 @@ def _write_document_indices(self):
                    eos_token_id=self.eos_token_id,
                    dtype=self.dtype,
                    indices_dtype=self.dtype,
-                   max_instances=max_instances,
+                   sample=(max_instances, self._seed),
                )
                futures.append(future)

@@ -1694,6 +1696,7 @@ def build(self) -> NumpyDatasetBase:
         mixture = self.source_mixture_config.build()
         return NumpyFSLDatasetMixture(
             *mixture.to_paths(),
+            seed=mixture.seed,
             sequence_length=self.sequence_length,
             max_target_sequence_length=self.max_target_sequence_length,
             pad_token_id=self.tokenizer.pad_token_id,
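
Note: the change above threads a seed from the built mixture into the FSL dataset and forwards it, together with the per-path instance cap, as a sample=(max_instances, seed) tuple when document indices are written. The following is a minimal standalone sketch of that pattern; MixtureDatasetSketch and write_indices are hypothetical names for illustration, not OLMo-core APIs.

    # Hypothetical sketch of the seed-threading pattern shown in the diff above.
    # The names here (MixtureDatasetSketch, write_indices) are illustrative only.
    from typing import Optional, Tuple


    def write_indices(path: str, sample: Optional[Tuple[int, int]] = None) -> None:
        # Stand-in for the index-writing worker: receives (max_instances, seed).
        print(f"writing instance indices for {path}, sample={sample}")


    class MixtureDatasetSketch:
        def __init__(self, *paths: str, seed: int, max_instances: int) -> None:
            self._paths = list(paths)
            self._seed = seed  # stored once at construction time...
            self._max_instances = max_instances

        def prepare(self) -> None:
            for path in self._paths:
                # ...and forwarded as a (max_instances, seed) tuple downstream.
                write_indices(path, sample=(self._max_instances, self._seed))


    MixtureDatasetSketch("a.npy", "b.npy", seed=42, max_instances=100).prepare()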
src/olmo_core/data/source_mixture.py (3 changes: 2 additions & 1 deletion)
@@ -91,6 +91,7 @@ class SourceMixtureDataset:
     A dataset consisting of a fractionalized mixture of data sources.
     """

+    seed: int
     sources: List[SourceMixtureOutcome]

     def to_index(self) -> Dict[Tuple[str, int], int]:
@@ -196,7 +197,7 @@ def build(self) -> SourceMixtureDataset:
            for item in outcome.path_tokens:
                log.info(f"Selected {item.tokens} tokens from {outcome.name} at {item.path}")

-        return SourceMixtureDataset(completed)
+        return SourceMixtureDataset(seed=self.seed, sources=completed)

     def get_paths_and_tokens_for_source(
         self, source_config: SourceMixtureConfig, token_details: SourceTokenDetails
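
Note: SourceMixtureDataset now carries the seed alongside its sources, and build() constructs it with explicit keyword arguments. A tiny self-contained approximation follows, with the source list simplified to plain strings instead of SourceMixtureOutcome objects.

    # Simplified stand-in for the dataclass change above; sources are plain
    # strings here rather than SourceMixtureOutcome objects.
    from dataclasses import dataclass
    from typing import List


    @dataclass
    class SourceMixtureDatasetSketch:
        """A dataset consisting of a fractionalized mixture of data sources."""

        seed: int
        sources: List[str]


    # The builder passes the seed through explicitly via keyword arguments.
    dataset = SourceMixtureDatasetSketch(seed=42, sources=["source_a", "source_b"])
    print(dataset.seed, dataset.sources)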
src/olmo_core/data/utils.py (13 changes: 10 additions & 3 deletions)
@@ -412,25 +412,32 @@ def segment_documents_into_instances(
     indices_dtype: Union[
         Type[np.uint8], Type[np.uint16], Type[np.uint32], Type[np.uint64]
     ] = np.uint32,
-    max_instances: Optional[int] = None,
+    sample: Optional[Tuple[int, int]] = None,
 ) -> Tuple[int, int]:
     """
     Segment documents into instances of at most ``sequence_length`` tokens.
     Saving the indices of the instances to ``target``.

+    Sample a subset of the instances if ``sample`` is provided as a tuple of ``(max_instances, seed)``.
+
     Returns the number of original documents and the number of resulting instances documents.
     """
     total_og_docs = 0
     indices: List[int] = []
     for start_idx, end_idx in iter_document_indices(path, eos_token_id=eos_token_id, dtype=dtype):
-        if max_instances is not None and len(indices) // 2 >= max_instances:
-            break
         total_og_docs += 1
         length = end_idx - start_idx
         indices.append(start_idx)
         indices.append(start_idx + min(length, max_sequence_length))
         start_idx += length

+    if sample is not None:
+        max_instances, seed = sample
+        rng = get_rng(seed)
+        indices = (
+            rng.choice(np.array(indices).reshape(-1, 2).tolist(), size=max_instances).flatten()
+        ).tolist()
+
     with memmap_to_write(target, dtype=indices_dtype, shape=(len(indices),)) as indices_mmap:
         indices_mmap[:] = indices

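
Note: the new sample branch reduces the instance list to at most max_instances randomly chosen (start, end) pairs using a seeded RNG. Below is a minimal, self-contained sketch of that idea, not the OLMo-core implementation; sample_instance_indices is a hypothetical name and np.random.default_rng stands in for get_rng.

    # Standalone sketch: sample max_instances (start, end) pairs with a seeded
    # NumPy Generator, then flatten back to a flat index list.
    from typing import List

    import numpy as np


    def sample_instance_indices(indices: List[int], max_instances: int, seed: int) -> List[int]:
        pairs = np.array(indices).reshape(-1, 2)  # one row per (start, end) instance
        rng = np.random.default_rng(seed)         # stand-in for get_rng(seed)
        # Generator.choice samples rows (axis=0) of a 2-D array; replace=True is NumPy's default.
        sampled = rng.choice(pairs, size=max_instances)
        return sampled.flatten().tolist()


    flat = [0, 10, 10, 25, 25, 31, 31, 50]  # four instances as (start, end) pairs
    print(sample_instance_indices(flat, max_instances=2, seed=42))
    # The same seed yields the same sample, keeping index writing deterministic.
    print(sample_instance_indices(flat, max_instances=2, seed=42))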