
Commit 0d840ee

Lukasz Kaiser authored and Copybara-Service committed
Make TransformerLM train reasonably well in trax. Add loss and metric masking and refactor dropout in Transformer.
PiperOrigin-RevId: 239692595
1 parent eedd6d7 commit 0d840ee

5 files changed: +60 −39 lines changed

tensor2tensor/trax/configs/resnet50_imagenet_8gb.gin

+1-1
@@ -42,5 +42,5 @@ train.eval_steps = 20
 train.inputs = @trax.inputs.inputs
 train.model = @trax.models.Resnet50
 train.optimizer = @trax.optimizers.momentum
-train.train_steps = 500000
+train.train_steps = 1000000
 train.lr_schedule = @learning_rate.EvalAdjustingSchedule

tensor2tensor/trax/configs/transformer_lm1b_8gb.gin

+11-7
@@ -5,28 +5,32 @@ import tensor2tensor.trax.trax

 # Parameters for batch_fun:
 # ==============================================================================
-batch_fun.batch_size = 32
-batch_fun.eval_batch_size = 32
+batch_fun.batch_size = 128
+batch_fun.eval_batch_size = 128

 # Parameters for inputs:
 # ==============================================================================
 inputs.data_dir = None
 inputs.dataset_name = 't2t_languagemodel_lm1b32k'

+# Parameters for mask:
+# ==============================================================================
+mask.mask_id = 0
+
 # Parameters for MultifactorSchedule:
 # ==============================================================================
-MultifactorSchedule.constant = 0.05
+MultifactorSchedule.constant = 0.1
 MultifactorSchedule.factors = 'constant * linear_warmup * rsqrt_decay'
 MultifactorSchedule.warmup_steps = 8000

 # Parameters for preprocess_fun:
 # ==============================================================================
-preprocess_fun.max_target_length = 256
+preprocess_fun.max_target_length = 512

 # Parameters for train:
 # ==============================================================================
 train.eval_frequency = 1000
-train.eval_steps = 1
+train.eval_steps = 5
 train.inputs = @trax.inputs.inputs
 train.model = @trax.models.TransformerLM
 train.run_debug_step = False
@@ -38,10 +42,10 @@ train_and_eval_batches.input_name = 'targets'

 # Parameters for TransformerLM:
 # ==============================================================================
-TransformerLM.dropout = 0.1
+TransformerLM.dropout = 0.2
 TransformerLM.feature_depth = 512
 TransformerLM.feedforward_depth = 2048
-TransformerLM.max_len = 256
+TransformerLM.max_len = 512
 TransformerLM.mode = 'train'
 TransformerLM.num_heads = 8
 TransformerLM.num_layers = 6
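
For context, a minimal sketch of how bindings like the ones above reach Python code: a function decorated with @gin.configurable picks up any parameter bound in the .gin file once the config is parsed. The demo_mask function and the binding string below are hypothetical, used only to illustrate the mechanism; which trax function the real mask.mask_id binding configures is defined elsewhere in the source.

import gin

@gin.configurable
def demo_mask(values, mask_id=None):
  # Hypothetical stand-in for a trax configurable; drops entries equal to mask_id.
  return [v for v in values if v != mask_id]

gin.parse_config("demo_mask.mask_id = 0")  # same binding style as the .gin file above
print(demo_mask([0, 3, 0, 7]))  # -> [3, 7]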

tensor2tensor/trax/inputs.py

+4-2
@@ -191,10 +191,12 @@ def batch_fun(dataset, training, shapes, target_names,
   if variable_target_shapes:
     bucket_boundaries = [bucket_length // 4, bucket_length // 2,
                          bucket_length, bucket_length * 2,
-                         bucket_length * 4, bucket_length * 8]
+                         bucket_length * 4, bucket_length * 8,
+                         bucket_length * 16]
     bucket_batch_sizes = [cur_batch_size * 4, cur_batch_size * 2,
                           cur_batch_size, cur_batch_size // 2,
-                          cur_batch_size // 4, cur_batch_size // 8, 1]
+                          cur_batch_size // 4, cur_batch_size // 8,
+                          max(1, cur_batch_size // 16), 1]
     buckets = (bucket_boundaries, bucket_batch_sizes)

   if buckets:
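
A rough standalone illustration of the bucketing scheme above (the bucket_length and cur_batch_size values are made up; in batch_fun they come from the caller): each boundary gets its own batch size, plus one final entry for sequences longer than the last boundary, so the batch-size list must be exactly one element longer than the boundary list.

# Hypothetical values; batch_fun derives the real ones from its arguments.
bucket_length, cur_batch_size = 32, 128
bucket_boundaries = [bucket_length // 4, bucket_length // 2,
                     bucket_length, bucket_length * 2,
                     bucket_length * 4, bucket_length * 8,
                     bucket_length * 16]
bucket_batch_sizes = [cur_batch_size * 4, cur_batch_size * 2,
                      cur_batch_size, cur_batch_size // 2,
                      cur_batch_size // 4, cur_batch_size // 8,
                      max(1, cur_batch_size // 16), 1]
# One batch size per bucket, including the overflow bucket past the last boundary.
assert len(bucket_batch_sizes) == len(bucket_boundaries) + 1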

tensor2tensor/trax/models/transformer.py

+28-25
@@ -29,7 +29,7 @@ def TransformerEncoder(mode='train',  # pylint: disable=invalid-name
                        feature_depth=512,
                        feedforward_depth=2048,
                        num_heads=8,
-                       dropout=0.9):
+                       dropout=0.1):
   """Transformer Encoder Stack.

   Args:
@@ -38,20 +38,22 @@ def TransformerEncoder(mode='train',  # pylint: disable=invalid-name
     feature_depth: int: depth of embedding
     feedforward_depth: int: depth of feed-forward layer
     num_heads: int: number of attention heads
-    dropout: float: dropout rate - Stax follows TF's KEEP probability convention
+    dropout: float: dropout rate (how much to drop out; note that stax follows
+      Tensorflow's keep_rate convention, so we use 1 - dropout in calls below)

   Returns:
     A staxlayer for implementing a raw Transformer encoder stack. No embedding
     or positional signals are added by this layer.
   """
+  keep_rate = 1.0 - dropout
   # Multi-headed Attention and Feed-forward layers
   multi_attention = stax.MultiHeadedAttention(
-      feature_depth, num_heads=num_heads, dropout=dropout, mode=mode)
+      feature_depth, num_heads=num_heads, dropout=keep_rate, mode=mode)

   feed_forward = stax.serial(
       stax.Dense(feedforward_depth, W_init=stax.xavier_uniform()),
       stax.Relu,
-      stax.Dropout(dropout, mode=mode),
+      stax.Dropout(keep_rate, mode=mode),
       stax.Dense(feature_depth, W_init=stax.xavier_uniform())
   )

@@ -74,11 +76,11 @@ def encoder(embedded_source, source_mask):
                                     stax.Identity,  # value
                                     source_mask),  # attention mask
                       multi_attention,
-                      stax.Dropout(dropout, mode=mode)),
+                      stax.Dropout(keep_rate, mode=mode)),
         # feed-forward
         stax.residual(stax.LayerNorm(feature_depth),
                       feed_forward,
-                      stax.Dropout(dropout, mode=mode))
+                      stax.Dropout(keep_rate, mode=mode))
     )
     return stax.serial(
         embedded_source,
@@ -95,8 +97,8 @@ def TransformerLM(vocab_size,  # pylint: disable=invalid-name
                   feature_depth=512,
                   feedforward_depth=2048,
                   num_heads=8,
-                  dropout=0.9,
-                  max_len=256):
+                  dropout=0.1,
+                  max_len=512):
   """Transformer language model (only uses the decoder part of Transformer).

   Args:
@@ -106,20 +108,21 @@ def TransformerLM(vocab_size,  # pylint: disable=invalid-name
     feature_depth: int: depth of embedding
     feedforward_depth: int: depth of feed-forward layer
     num_heads: int: number of attention heads
-    dropout: float: dropout rate - Stax follows TF's KEEP probability convention
+    dropout: float: dropout rate (how much to drop out)
     max_len: int: maximum symbol length for positional encoding

   Returns:
     init and apply.
   """
+  keep_rate = 1.0 - dropout
   # Multi-headed Attention and Feed-forward layers
   multi_attention = stax.MultiHeadedAttention(
-      feature_depth, num_heads=num_heads, dropout=dropout, mode=mode)
+      feature_depth, num_heads=num_heads, dropout=keep_rate, mode=mode)

   feed_forward = stax.serial(
       stax.Dense(feedforward_depth, W_init=stax.xavier_uniform()),
       stax.Relu,
-      stax.Dropout(dropout, mode=mode),
+      stax.Dropout(keep_rate, mode=mode),
       stax.Dense(feature_depth, W_init=stax.xavier_uniform())
   )

@@ -132,18 +135,18 @@ def TransformerLM(vocab_size,  # pylint: disable=invalid-name
                                     stax.Identity,  # value
                                     stax.CausalMask(axis=-2)),  # attention mask
                       multi_attention,
-                      stax.Dropout(dropout, mode=mode)),
+                      stax.Dropout(keep_rate, mode=mode)),
       # feed-forward
       stax.residual(stax.LayerNorm(feature_depth),
                     feed_forward,
-                    stax.Dropout(dropout, mode=mode))
+                    stax.Dropout(keep_rate, mode=mode))
   )

   return stax.serial(
       stax.ShiftRight(),
       stax.Embedding(feature_depth, vocab_size),
       stax.PositionalEncoding(feature_depth, max_len=max_len),
-      stax.Dropout(dropout, mode=mode),
+      stax.Dropout(keep_rate, mode=mode),
       stax.repeat(decoder_layer, num_layers),
       stax.LayerNorm(feature_depth),
       stax.Dense(vocab_size, W_init=stax.xavier_uniform()),
@@ -158,7 +161,7 @@ def Transformer(source_vocab_size,  # pylint: disable=invalid-name
                 feature_depth=512,
                 feedforward_depth=2048,
                 num_heads=8,
-                dropout=0.9,
+                dropout=0.1,
                 shared_embedding=True,
                 max_len=200,
                 return_evals=False):
@@ -172,7 +175,7 @@ def Transformer(source_vocab_size,  # pylint: disable=invalid-name
     feature_depth: int: depth of embedding
     feedforward_depth: int: depth of feed-forward layer
     num_heads: int: number of attention heads
-    dropout: float: dropout rate - Stax follows TF's KEEP probability convention
+    dropout: float: dropout rate (how much to drop out)
     shared_embedding: bool: specify whether source/target embeddings are tied.
     max_len: int: maximum symbol length for positional encoding
     return_evals: bool: whether to generate decode-time evaluation functions
@@ -182,11 +185,11 @@ def Transformer(source_vocab_size,  # pylint: disable=invalid-name
     the 'evals' functions that itself returns a namedtuple containing evaluation
     functions for the trained encoder, decoder, and generator substax.
   """
-
+  keep_rate = 1.0 - dropout
   # Input embedding and positional encoding
   inject_position = stax.serial(
       stax.PositionalEncoding(feature_depth, max_len=max_len),
-      stax.Dropout(dropout, mode=mode)
+      stax.Dropout(keep_rate, mode=mode)
   )
   if shared_embedding:
     assert source_vocab_size == target_vocab_size
@@ -202,12 +205,12 @@ def Transformer(source_vocab_size,  # pylint: disable=invalid-name

   # Multi-headed Attention and Feed-forward layers
   multi_attention = stax.MultiHeadedAttention(
-      feature_depth, num_heads=num_heads, dropout=dropout, mode=mode)
+      feature_depth, num_heads=num_heads, dropout=keep_rate, mode=mode)

   feed_forward = stax.serial(
       stax.Dense(feedforward_depth, W_init=stax.xavier_uniform()),
       stax.Relu,
-      stax.Dropout(dropout, mode=mode),
+      stax.Dropout(keep_rate, mode=mode),
       stax.Dense(feature_depth, W_init=stax.xavier_uniform())
   )

@@ -231,11 +234,11 @@ def encoder(source, source_mask):
                                     stax.Identity,  # value
                                     source_mask),  # attention mask
                       multi_attention,
-                      stax.Dropout(dropout, mode=mode)),
+                      stax.Dropout(keep_rate, mode=mode)),
         # feed-forward
         stax.residual(stax.LayerNorm(feature_depth),
                       feed_forward,
-                      stax.Dropout(dropout, mode=mode))
+                      stax.Dropout(keep_rate, mode=mode))
     )
     return stax.serial(
         source,
@@ -266,19 +269,19 @@ def decoder(memory, target, target_mask, memory_mask):
                                     stax.Identity,  # value
                                     target_mask),  # attention mask
                       multi_attention,
-                      stax.Dropout(dropout, mode=mode)),
+                      stax.Dropout(keep_rate, mode=mode)),
         # target attends to encoded source
         stax.residual(stax.LayerNorm(feature_depth),
                       stax.multiplex(stax.Identity,  # query
                                      memory,  # key
                                      memory,  # value
                                      memory_mask),  # attention mask
                       multi_attention,
-                      stax.Dropout(dropout, mode=mode)),
+                      stax.Dropout(keep_rate, mode=mode)),
         # feed-forward
         stax.residual(stax.LayerNorm(feature_depth),
                       feed_forward,
-                      stax.Dropout(dropout, mode=mode))
+                      stax.Dropout(keep_rate, mode=mode))
     )
     return stax.serial(
         target,
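
A small NumPy sketch of the convention change above, assuming (as the updated docstring states) that the stax layers treat their rate argument as a keep probability: callers now pass a drop probability and each model converts it once via keep_rate = 1.0 - dropout. The helper below is illustrative only, not the stax implementation.

import numpy as np

rng = np.random.default_rng(0)

def dropout_keep(x, keep_rate):
  # Keep each entry with probability keep_rate; scale survivors by 1/keep_rate.
  keep = rng.random(x.shape) < keep_rate
  return np.where(keep, x / keep_rate, 0.0)

dropout = 0.1              # user-facing rate: how much to drop
keep_rate = 1.0 - dropout  # what the stax layers expect
print(dropout_keep(np.ones((2, 4)), keep_rate))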

tensor2tensor/trax/trax.py

+16-4
@@ -46,26 +46,38 @@
 from tensorflow.io import gfile


+@gin.configurable(blacklist=["inputs", "targets"])
+def masked_mean(inputs, targets, mask_id=None):
+  """Mean of the inputs but counting only those where targets != mask_id."""
+  x = inputs.astype(np.float32)
+  if mask_id is None:
+    return np.mean(x)
+  unmask = 1.0 - np.equal(targets, mask_id).astype(np.float32)
+  return np.sum(x * unmask) / np.sum(unmask)
+
+
 def accuracy(batch, model_predictions):
   """Calculate accuracy."""
   _, targets = batch
   predicted_class = np.argmax(model_predictions, axis=-1)
-  return np.mean(predicted_class == targets)
+  correct = np.equal(predicted_class, targets)
+  return masked_mean(correct, targets)


 def neg_log_perplexity(batch, model_predictions):
   """Calculate negative log perplexity."""
   _, targets = batch
   hot_targets = stax.one_hot(targets, model_predictions.shape[-1])
-  return np.mean(np.sum(model_predictions * hot_targets, axis=-1))
+  xent = np.sum(model_predictions * hot_targets, axis=-1)
+  return masked_mean(xent, targets)


 def loss(params, batch, model_predict):
   """Calculate loss."""
   inputs, targets = batch
   preds = model_predict(params, inputs)
-  return - np.mean(np.sum(preds * stax.one_hot(targets, preds.shape[-1]),
-                          axis=-1))
+  xent = np.sum(preds * stax.one_hot(targets, preds.shape[-1]), axis=-1)
+  return - masked_mean(xent, targets)


 def log(s, stdout=True):
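
A standalone NumPy re-implementation of the new masked_mean, with a made-up padded batch, to show why the metrics change: without masking, padding positions (targets == mask_id) inflate accuracy.

import numpy as np

def masked_mean(inputs, targets, mask_id=None):
  # Mirrors the function added above.
  x = inputs.astype(np.float32)
  if mask_id is None:
    return np.mean(x)
  unmask = 1.0 - np.equal(targets, mask_id).astype(np.float32)
  return np.sum(x * unmask) / np.sum(unmask)

targets = np.array([5, 7, 0, 0])         # 0 is padding (mask_id)
predicted = np.array([5, 2, 0, 0])       # hypothetical argmax predictions
correct = np.equal(predicted, targets)
print(np.mean(correct))                  # 0.75 -- padding counted as correct
print(masked_mean(correct, targets, 0))  # 0.5  -- only real tokens counted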
