Commit 6dba84b

lmcafee-nvidia authored and jaredcasper committed
Retro updates.
1 parent cd2537d commit 6dba84b

36 files changed (+1282 -2653 lines)

.gitignore (+2)

@@ -4,3 +4,5 @@ build
 .coverage_*
 *.egg-info
 *~
+slurm*
+logs

megatron/core/enums.py (+3 -1)

@@ -1,7 +1,9 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 import enum
 
 class ModelType(enum.Enum):
     encoder_or_decoder = 1
     encoder_and_decoder = 2
+    retro_encoder = 3
+    retro_decoder = 4
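
The two new ModelType members give Retro models their own identity at the core level. A minimal sketch of how calling code might branch on them (the pick_model_type helper and its flags are illustrative, not part of this commit):

from megatron.core.enums import ModelType

def pick_model_type(use_retro: bool, encoder_only: bool) -> ModelType:
    # Hypothetical helper: return a Retro-specific member when retrieval
    # is enabled, otherwise fall back to the plain decoder setting.
    if use_retro:
        return ModelType.retro_encoder if encoder_only else ModelType.retro_decoder
    return ModelType.encoder_or_decoder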

megatron/data/gpt_dataset.py (+72 -72)

@@ -308,84 +308,84 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     shuffle_idx_filename = _filename + '_shuffle_idx.npy'
 
     # Build the indexed mapping if not exist.
-    if torch.distributed.get_rank() == 0:
-        if (not os.path.isfile(doc_idx_filename)) or \
-           (not os.path.isfile(sample_idx_filename)) or \
-           (not os.path.isfile(shuffle_idx_filename)):
+    if torch.distributed.get_rank() == 0 and \
+       (not os.path.isfile(doc_idx_filename) or
+        not os.path.isfile(sample_idx_filename) or
+        not os.path.isfile(shuffle_idx_filename)):
 
-            print_rank_0(' > WARNING: could not find index map files, building '
-                         'the indices on rank 0 ...')
+        print_rank_0(' > WARNING: could not find index map files, building '
+                     'the indices on rank 0 ...')
 
-            # For the last epoch, decide whether include the entire epoch
-            # in the global shuffle or not.
+        # For the last epoch, decide whether include the entire epoch
+        # in the global shuffle or not.
 
-            # If we need only one epoch, then separating last epoch does
-            # not mean anything.
-            if num_epochs == 1:
-                separate_last_epoch = False
-                print(' > only one epoch required, setting '
-                      'separate_last_epoch to False', flush=True)
+        # If we need only one epoch, then separating last epoch does
+        # not mean anything.
+        if num_epochs == 1:
+            separate_last_epoch = False
+            print(' > only one epoch required, setting '
+                  'separate_last_epoch to False', flush=True)
 
-            else:
-                # Get the number of samples for the last epoch
-                num_samples_from_epochs_minus_one = (
-                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
-                last_epoch_num_samples = num_samples - \
-                                         num_samples_from_epochs_minus_one
-                assert last_epoch_num_samples >= 0, \
-                    'last epoch number of samples should be non-negative.'
-                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
-                assert last_epoch_num_samples < (num_samples_per_epoch + 1), \
-                    'last epoch number of samples exceeded max value.'
-                # If we have less than 80% of the samples for the last epoch,
-                # seperate out the epoch and treat it differently.
-                # Note: the 80% number is just based on common sense and can
-                # be adjusted if needed.
-                separate_last_epoch = (last_epoch_num_samples <
-                                       int(0.80 * num_samples_per_epoch))
-                if separate_last_epoch:
-                    string = ' > last epoch number of samples ({}) is smaller '\
-                             'than 80% of number of samples per epoch ({}), '\
-                             'setting separate_last_epoch to True'
-                else:
-                    string = ' > last epoch number of samples ({}) is larger '\
-                             'than 80% of number of samples per epoch ({}), '\
-                             'setting separate_last_epoch to False'
-                print(string.format(last_epoch_num_samples,
-                                    num_samples_per_epoch), flush=True)
-
-            # doc-idx.
-            start_time = time.time()
-            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
-                                     separate_last_epoch)
-            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save doc-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
-            # sample-idx.
-            start_time = time.time()
-            # Use C++ implementation for speed.
-            # First compile and then import.
-            from megatron.data import helpers
-            assert doc_idx.dtype == np.int32
-            assert sizes.dtype == np.int32
-            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
-                                                  num_epochs, tokens_per_epoch)
-            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save sample-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
-            # shuffle-idx.
-            start_time = time.time()
-            # -1 is due to data structure used to retieve the index:
-            # sample i --> [sample_idx[i], sample_idx[i+1])
-            if separate_last_epoch:
-                num_samples_ = num_samples_from_epochs_minus_one
-            else:
-                num_samples_ = sample_idx.shape[0] - 1
-            shuffle_idx = _build_shuffle_idx(num_samples_,
-                                             sample_idx.shape[0] - 1, np_rng)
-            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save shuffle-idx mapping'
-                         ' (seconds): {:4f}'.format(time.time() - start_time))
+        else:
+            # Get the number of samples for the last epoch
+            num_samples_from_epochs_minus_one = (
+                (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
+            last_epoch_num_samples = num_samples - \
+                                     num_samples_from_epochs_minus_one
+            assert last_epoch_num_samples >= 0, \
+                'last epoch number of samples should be non-negative.'
+            num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
+            assert last_epoch_num_samples < (num_samples_per_epoch + 1), \
+                'last epoch number of samples exceeded max value.'
+            # If we have less than 80% of the samples for the last epoch,
+            # seperate out the epoch and treat it differently.
+            # Note: the 80% number is just based on common sense and can
+            # be adjusted if needed.
+            separate_last_epoch = (last_epoch_num_samples <
+                                   int(0.80 * num_samples_per_epoch))
+            if separate_last_epoch:
+                string = ' > last epoch number of samples ({}) is smaller '\
+                         'than 80% of number of samples per epoch ({}), '\
+                         'setting separate_last_epoch to True'
+            else:
+                string = ' > last epoch number of samples ({}) is larger '\
+                         'than 80% of number of samples per epoch ({}), '\
+                         'setting separate_last_epoch to False'
+            print(string.format(last_epoch_num_samples,
+                                num_samples_per_epoch), flush=True)
+
+        # doc-idx.
+        start_time = time.time()
+        doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
+                                 separate_last_epoch)
+        np.save(doc_idx_filename, doc_idx, allow_pickle=True)
+        print_rank_0(' > elasped time to build and save doc-idx mapping '
+                     '(seconds): {:4f}'.format(time.time() - start_time))
+        # sample-idx.
+        start_time = time.time()
+        # Use C++ implementation for speed.
+        # First compile and then import.
+        from megatron.data import helpers
+        assert doc_idx.dtype == np.int32
+        assert sizes.dtype == np.int32
+        sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
+                                              num_epochs, tokens_per_epoch)
+        np.save(sample_idx_filename, sample_idx, allow_pickle=True)
+        print_rank_0(' > elasped time to build and save sample-idx mapping '
+                     '(seconds): {:4f}'.format(time.time() - start_time))
+        # shuffle-idx.
+        start_time = time.time()
+        # -1 is due to data structure used to retieve the index:
+        # sample i --> [sample_idx[i], sample_idx[i+1])
+        if separate_last_epoch:
+            num_samples_ = num_samples_from_epochs_minus_one
+        else:
+            num_samples_ = sample_idx.shape[0] - 1
+        shuffle_idx = _build_shuffle_idx(num_samples_,
+                                         sample_idx.shape[0] - 1, np_rng)
+        np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
+        print_rank_0(' > elasped time to build and save shuffle-idx mapping'
+                     ' (seconds): {:4f}'.format(time.time() - start_time))
 
     # This should be a barrier but nccl barrier assumes
     # device_index=rank which is not the case for model
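
The net effect of this hunk is that the rank-0 check and the missing-file check are folded into one guard, so the whole index-building body moves out one indentation level. A condensed sketch of the resulting control flow (build_all_indices is a hypothetical stand-in for the doc-idx/sample-idx/shuffle-idx code above):

import os
import torch

def maybe_build_indices(doc_idx_filename, sample_idx_filename,
                        shuffle_idx_filename, build_all_indices):
    # Only rank 0 builds the cached index files, and only when at least
    # one of them is missing -- a single condition instead of two nested ifs.
    if torch.distributed.get_rank() == 0 and \
       (not os.path.isfile(doc_idx_filename) or
        not os.path.isfile(sample_idx_filename) or
        not os.path.isfile(shuffle_idx_filename)):
        build_all_indices()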

megatron/data/indexed_dataset.py (+1 -1)

@@ -95,7 +95,7 @@ def write_longs(f, a):
     3: np.int16,
     4: np.int32,
     5: np.int64,
-    6: np.float,
+    6: np.float32,
     7: np.double,
     8: np.uint16
 }
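
For context: np.float was only an alias for Python's 64-bit float, was deprecated in NumPy 1.20 and removed in 1.24, so dtype code 6 now names an explicit 32-bit type. A quick check of what that implies for element size (the code_to_dtype name is illustrative, not from the file):

import numpy as np

# np.float resolved to Python's float (64-bit); np.float32 is 4 bytes wide.
code_to_dtype = {6: np.float32, 7: np.double}   # mirrors the mapping above
assert np.dtype(code_to_dtype[6]).itemsize == 4
assert np.dtype(code_to_dtype[7]).itemsize == 8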

megatron/model/enums.py (+4 -1)

@@ -1,10 +1,13 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 import enum
 
 class LayerType(enum.Enum):
     encoder = 1
     decoder = 2
+    retro_encoder = 3
+    retro_decoder = 4
+    retro_decoder_with_retriever = 5
 
 class AttnType(enum.Enum):
     self_attn = 1
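
One plausible use of the new LayerType members, not shown in this diff, is tagging which decoder layers carry retrieval cross-attention. A hedged sketch under that assumption (assign_layer_types and retro_layer_numbers are hypothetical names):

from megatron.model.enums import LayerType

def assign_layer_types(num_layers, retro_layer_numbers):
    # Hypothetical: layers listed in retro_layer_numbers get a Retro
    # decoder variant; the first of them also hosts the retriever.
    layer_types = []
    for n in range(1, num_layers + 1):
        if n not in retro_layer_numbers:
            layer_types.append(LayerType.decoder)
        elif n == retro_layer_numbers[0]:
            layer_types.append(LayerType.retro_decoder_with_retriever)
        else:
            layer_types.append(LayerType.retro_decoder)
    return layer_types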

megatron/model/gpt_model.py (+7 -5)

@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 """GPT-2 model."""
 
@@ -77,16 +77,18 @@ def set_input_tensor(self, input_tensor):
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, input_ids, position_ids, attention_mask,
-                ret_input_ids=None, ret_position_ids=None, ret_attn_mask=None,
+                retriever_input_ids=None,
+                retriever_position_ids=None,
+                retriever_attn_mask=None,
                 labels=None, tokentype_ids=None, inference_params=None):
 
         lm_output = self.language_model(
             input_ids,
             position_ids,
             attention_mask,
-            ret_input_ids=ret_input_ids,
-            ret_position_ids=ret_position_ids,
-            ret_attn_mask=ret_attn_mask,
+            retriever_input_ids=retriever_input_ids,
+            retriever_position_ids=retriever_position_ids,
+            retriever_attn_mask=retriever_attn_mask,
            inference_params=inference_params)
 
         if self.post_process:
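
Any call site that used the old ret_* keywords has to adopt the new names. A self-contained sketch of the rename (TinyStub only mirrors the updated signature; it is not the real GPTModel):

class TinyStub:
    # Mirrors the renamed keyword arguments of the updated forward().
    def forward(self, input_ids, position_ids, attention_mask,
                retriever_input_ids=None, retriever_position_ids=None,
                retriever_attn_mask=None, labels=None, tokentype_ids=None,
                inference_params=None):
        return retriever_input_ids is not None

model = TinyStub()
# was: ret_input_ids / ret_position_ids / ret_attn_mask
uses_retrieval = model.forward(
    "input_ids", "position_ids", "attention_mask",
    retriever_input_ids="neighbor_tokens",
    retriever_position_ids="neighbor_positions",
    retriever_attn_mask="neighbor_mask")
print(uses_retrieval)  # True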

megatron/model/language_model.py (+31 -53)

@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 """Transformer based language model."""
 
@@ -7,10 +7,10 @@
 
 from megatron import get_args
 from megatron.core import mpu, tensor_parallel
+from megatron.core.enums import ModelType
 
-from .enums import LayerType, AttnMaskType
+from .enums import AttnMaskType, LayerType
 from .module import MegatronModule
-from .retro_transformer import ParallelRetroEncoder, ParallelRetroTransformer
 from .rotary_pos_embedding import apply_rotary_pos_emb, RotaryEmbedding
 from .transformer import ParallelTransformer
 from .utils import get_linear_layer
@@ -352,6 +352,7 @@ def __init__(self,
         self.decoder_attn_mask_type = decoder_attn_mask_type
         self.add_pooler = add_pooler
         self.encoder_hidden_state = None
+        self.add_retriever = args.retro_add_retriever
         self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
 
         # Embeddings.
@@ -380,39 +381,18 @@ def __init__(self,
             # https://github.com/kingoflolz/mesh-transformer-jax/
             self.rotary_pos_emb = RotaryEmbedding(rotary_dim)
 
-        # Retriever (bi-directional transformer with cross attention)
-        if args.retro_add_retriever:
-            self.retriever = ParallelRetroEncoder(
-                self.init_method,
-                output_layer_init_method,
-                self_attn_mask_type=AttnMaskType.padding,
-                pre_process=self.pre_process,
-                post_process=False,
-            )
-            self._retriever_key = 'retriever'
-        else:
-            self.retriever = None
-
-        # Encoder (usually set to True, False if part of an encoder-decoder
-        # architecture and in encoder-only stage).
-        if self.add_encoder:
-            if args.retro_add_retriever:
-                self.encoder = ParallelRetroTransformer(
-                    self.init_method,
-                    output_layer_init_method,
-                    self_attn_mask_type=self.encoder_attn_mask_type,
-                    pre_process=self.pre_process,
-                    post_process=self.post_process,
-                    retriever=self.retriever,
-                )
-            else:
-                self.encoder = ParallelTransformer(
-                    self.init_method,
-                    output_layer_init_method,
-                    self_attn_mask_type=self.encoder_attn_mask_type,
-                    pre_process=self.pre_process,
-                    post_process=self.post_process,
-                )
+        # Encoder (usually set to True, False if part of an encoder-decoder
+        # architecture and in encoder-only stage).
+        if self.add_encoder:
+            self.encoder = ParallelTransformer(
+                self.init_method,
+                output_layer_init_method,
+                model_type=args.model_type if not args.retro_add_retriever \
+                    else ModelType.retro_decoder,
+                self_attn_mask_type=self.encoder_attn_mask_type,
+                pre_process=self.pre_process,
+                post_process=self.post_process,
+            )
             self._encoder_key = 'encoder'
         else:
             self.encoder = None
@@ -423,6 +403,7 @@ def __init__(self,
             self.decoder = ParallelTransformer(
                 self.init_method,
                 output_layer_init_method,
+                model_type=args.model_type,
                 layer_type=LayerType.decoder,
                 self_attn_mask_type=self.decoder_attn_mask_type,
                 pre_process=self.pre_process,
@@ -477,26 +458,29 @@ def set_input_tensor(self, input_tensor):
 
     def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
                 dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
-                ret_input_ids=None, ret_position_ids=None, ret_attn_mask=None,
+                retriever_input_ids=None,
+                retriever_position_ids=None,
+                retriever_attn_mask=None,
                 enc_dec_attn_mask=None, tokentype_ids=None,
                 inference_params=None,
                 pooling_sequence_index=0,
                 enc_hidden_states=None, output_enc_hidden=False):
 
-        # Retriever embedding.
-        if self.retriever and self.pre_process:
-            retriever_input = self.embedding(ret_input_ids, ret_position_ids,
-                                             tokentype_ids=tokentype_ids)
-        else:
-            retriever_input = None
-
         # Encoder embedding.
         if self.pre_process:
             encoder_input = self.embedding(enc_input_ids, enc_position_ids,
                                            tokentype_ids=tokentype_ids)
         else:
            encoder_input = None
 
+        # Retriever embedding.
+        if self.add_retriever and self.pre_process:
+            retriever_input = self.embedding(retriever_input_ids,
+                                             retriever_position_ids,
+                                             tokentype_ids=tokentype_ids)
+        else:
+            retriever_input = None
+
         # Rotary positional embeddings
         rotary_pos_emb = None
         if self.use_rotary_position_embeddings:
@@ -509,19 +493,13 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
         # Run encoder.
         if enc_hidden_states is None:
             if self.encoder is not None:
-                if self.retriever:
-                    encoder_output = self.encoder(
-                        encoder_input,
-                        enc_attn_mask,
-                        retriever_output=retriever_input,
-                        retriever_attn_mask=ret_attn_mask,
-                        inference_params=inference_params)
-                else:
-                    encoder_output = self.encoder(
-                        encoder_input,
-                        enc_attn_mask,
-                        inference_params=inference_params,
-                        rotary_pos_emb=rotary_pos_emb)
+                encoder_output = self.encoder(
+                    encoder_input,
+                    enc_attn_mask,
+                    retriever_input=retriever_input,
+                    retriever_attn_mask=retriever_attn_mask,
+                    inference_params=inference_params,
+                    rotary_pos_emb=rotary_pos_emb)
             else:
                 encoder_output = self.encoder_hidden_state
         else:
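
With ParallelRetroEncoder and ParallelRetroTransformer gone, there is a single encoder call path, and Retro behaviour is selected by the arguments alone: retriever_input is simply None when args.retro_add_retriever is off. A toy illustration of that dispatch pattern, not the real ParallelTransformer:

class UnifiedEncoder:
    # Toy stand-in: one forward path that ignores the retriever arguments
    # when they are None, mirroring how the unified call above behaves.
    def __call__(self, hidden_states, attn_mask, retriever_input=None,
                 retriever_attn_mask=None, inference_params=None,
                 rotary_pos_emb=None):
        if retriever_input is None:
            return ("gpt", hidden_states)
        return ("retro", hidden_states, retriever_input)

encoder = UnifiedEncoder()
print(encoder("tokens", "mask"))                                  # plain GPT path
print(encoder("tokens", "mask", retriever_input="neighbors",
              retriever_attn_mask="neighbor_mask"))               # Retro path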
