@@ -308,84 +308,84 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
308
308
shuffle_idx_filename = _filename + '_shuffle_idx.npy'
309
309
310
310
# Build the indexed mapping if not exist.
311
- if torch .distributed .get_rank () == 0 :
312
- if (not os .path .isfile (doc_idx_filename )) or \
313
- ( not os .path .isfile (sample_idx_filename )) or \
314
- ( not os .path .isfile (shuffle_idx_filename )):
311
+ if torch .distributed .get_rank () == 0 and \
312
+ (not os .path .isfile (doc_idx_filename ) or
313
+ not os .path .isfile (sample_idx_filename ) or
314
+ not os .path .isfile (shuffle_idx_filename )):
315
315
316
- print_rank_0 (' > WARNING: could not find index map files, building '
317
- 'the indices on rank 0 ...' )
316
+ print_rank_0 (' > WARNING: could not find index map files, building '
317
+ 'the indices on rank 0 ...' )
318
318
319
- # For the last epoch, decide whether include the entire epoch
320
- # in the global shuffle or not.
319
+ # For the last epoch, decide whether to include the entire epoch
320
+ # in the global shuffle or not.
321
321
322
- # If we need only one epoch, then separating last epoch does
323
- # not mean anything.
324
- if num_epochs == 1 :
325
- separate_last_epoch = False
326
- print (' > only one epoch required, setting '
327
- 'separate_last_epoch to False' , flush = True )
322
+ # If we need only one epoch, then separating last epoch does
323
+ # not mean anything.
324
+ if num_epochs == 1 :
325
+ separate_last_epoch = False
326
+ print (' > only one epoch required, setting '
327
+ 'separate_last_epoch to False' , flush = True )
328
328
329
- else :
330
- # Get the number of samples for the last epoch
331
- num_samples_from_epochs_minus_one = (
332
- (num_epochs - 1 ) * tokens_per_epoch - 1 ) // seq_length
333
- last_epoch_num_samples = num_samples - \
334
- num_samples_from_epochs_minus_one
335
- assert last_epoch_num_samples >= 0 , \
336
- 'last epoch number of samples should be non-negative.'
337
- num_samples_per_epoch = (tokens_per_epoch - 1 ) // seq_length
338
- assert last_epoch_num_samples < (num_samples_per_epoch + 1 ), \
339
- 'last epoch number of samples exceeded max value.'
340
- # If we have less than 80% of the samples for the last epoch,
341
- # seperate out the epoch and treat it differently.
342
- # Note: the 80% number is just based on common sense and can
343
- # be adjusted if needed.
344
- separate_last_epoch = (last_epoch_num_samples <
345
- int (0.80 * num_samples_per_epoch ))
346
- if separate_last_epoch :
347
- string = ' > last epoch number of samples ({}) is smaller ' \
348
- 'than 80% of number of samples per epoch ({}), ' \
349
- 'setting separate_last_epoch to True'
350
- else :
351
- string = ' > last epoch number of samples ({}) is larger ' \
352
- 'than 80% of number of samples per epoch ({}), ' \
353
- 'setting separate_last_epoch to False'
354
- print (string .format (last_epoch_num_samples ,
355
- num_samples_per_epoch ), flush = True )
356
-
357
- # doc-idx.
358
- start_time = time .time ()
359
- doc_idx = _build_doc_idx (documents , num_epochs , np_rng ,
360
- separate_last_epoch )
361
- np .save (doc_idx_filename , doc_idx , allow_pickle = True )
362
- print_rank_0 (' > elasped time to build and save doc-idx mapping '
363
- '(seconds): {:4f}' .format (time .time () - start_time ))
364
- # sample-idx.
365
- start_time = time .time ()
366
- # Use C++ implementation for speed.
367
- # First compile and then import.
368
- from megatron .data import helpers
369
- assert doc_idx .dtype == np .int32
370
- assert sizes .dtype == np .int32
371
- sample_idx = helpers .build_sample_idx (sizes , doc_idx , seq_length ,
372
- num_epochs , tokens_per_epoch )
373
- np .save (sample_idx_filename , sample_idx , allow_pickle = True )
374
- print_rank_0 (' > elasped time to build and save sample-idx mapping '
375
- '(seconds): {:4f}' .format (time .time () - start_time ))
376
- # shuffle-idx.
377
- start_time = time .time ()
378
- # -1 is due to data structure used to retieve the index:
379
- # sample i --> [sample_idx[i], sample_idx[i+1])
329
+ else :
330
+ # Get the number of samples for the last epoch
331
+ num_samples_from_epochs_minus_one = (
332
+ (num_epochs - 1 ) * tokens_per_epoch - 1 ) // seq_length
333
+ last_epoch_num_samples = num_samples - \
334
+ num_samples_from_epochs_minus_one
335
+ assert last_epoch_num_samples >= 0 , \
336
+ 'last epoch number of samples should be non-negative.'
337
+ num_samples_per_epoch = (tokens_per_epoch - 1 ) // seq_length
338
+ assert last_epoch_num_samples < (num_samples_per_epoch + 1 ), \
339
+ 'last epoch number of samples exceeded max value.'
340
+ # If we have less than 80% of the samples for the last epoch,
341
+ # separate out the epoch and treat it differently.
342
+ # Note: the 80% number is just based on common sense and can
343
+ # be adjusted if needed.
344
+ separate_last_epoch = (last_epoch_num_samples <
345
+ int (0.80 * num_samples_per_epoch ))
380
346
if separate_last_epoch :
381
- num_samples_ = num_samples_from_epochs_minus_one
347
+ string = ' > last epoch number of samples ({}) is smaller ' \
348
+ 'than 80% of number of samples per epoch ({}), ' \
349
+ 'setting separate_last_epoch to True'
382
350
else :
383
- num_samples_ = sample_idx .shape [0 ] - 1
384
- shuffle_idx = _build_shuffle_idx (num_samples_ ,
385
- sample_idx .shape [0 ] - 1 , np_rng )
386
- np .save (shuffle_idx_filename , shuffle_idx , allow_pickle = True )
387
- print_rank_0 (' > elasped time to build and save shuffle-idx mapping'
388
- ' (seconds): {:4f}' .format (time .time () - start_time ))
351
+ string = ' > last epoch number of samples ({}) is larger ' \
352
+ 'than 80% of number of samples per epoch ({}), ' \
353
+ 'setting separate_last_epoch to False'
354
+ print (string .format (last_epoch_num_samples ,
355
+ num_samples_per_epoch ), flush = True )
356
+
357
+ # doc-idx.
358
+ start_time = time .time ()
359
+ doc_idx = _build_doc_idx (documents , num_epochs , np_rng ,
360
+ separate_last_epoch )
361
+ np .save (doc_idx_filename , doc_idx , allow_pickle = True )
362
+ print_rank_0 (' > elasped time to build and save doc-idx mapping '
363
+ '(seconds): {:4f}' .format (time .time () - start_time ))
364
+ # sample-idx.
365
+ start_time = time .time ()
366
+ # Use C++ implementation for speed.
367
+ # First compile and then import.
368
+ from megatron .data import helpers
369
+ assert doc_idx .dtype == np .int32
370
+ assert sizes .dtype == np .int32
371
+ sample_idx = helpers .build_sample_idx (sizes , doc_idx , seq_length ,
372
+ num_epochs , tokens_per_epoch )
373
+ np .save (sample_idx_filename , sample_idx , allow_pickle = True )
374
+ print_rank_0 (' > elasped time to build and save sample-idx mapping '
375
+ '(seconds): {:4f}' .format (time .time () - start_time ))
376
+ # shuffle-idx.
377
+ start_time = time .time ()
378
+ # -1 is due to data structure used to retrieve the index:
379
+ # sample i --> [sample_idx[i], sample_idx[i+1])
380
+ if separate_last_epoch :
381
+ num_samples_ = num_samples_from_epochs_minus_one
382
+ else :
383
+ num_samples_ = sample_idx .shape [0 ] - 1
384
+ shuffle_idx = _build_shuffle_idx (num_samples_ ,
385
+ sample_idx .shape [0 ] - 1 , np_rng )
386
+ np .save (shuffle_idx_filename , shuffle_idx , allow_pickle = True )
387
+ print_rank_0 (' > elasped time to build and save shuffle-idx mapping'
388
+ ' (seconds): {:4f}' .format (time .time () - start_time ))
389
389
390
390
# This should be a barrier but nccl barrier assumes
391
391
# device_index=rank which is not the case for model
0 commit comments