diff --git a/benchmarks/sizing/README.md b/benchmarks/sizing/README.md new file mode 100644 index 0000000..1f706f9 --- /dev/null +++ b/benchmarks/sizing/README.md @@ -0,0 +1,132 @@ +# Transformer Sizing Guidelines + +The intent of these benchmarks is to measure the throughput of Generalized Matrix Multiplications (GEMMs) and Batched Matrix Multiplications (BMM) found in transformer models on modern GPU architectures. With these benchmarks, users can easily study: +- The performance characteristics of GEMMs and BMMs on their GPU architecture. +- How these GEMMs and BMMs form transformer layers. + +There are three scripts within `benchmarks/sizing` that can be run: + +## GEMM Benchmarks +`mm_flops.py` measures throughput of GEMMs of shape $(m, n) \times (n, k)$. +``` +Example for mm_flops.py: python mm_flops.py -m 1024 -k 1024 -n 1024 2048 +Example for mm_flops.py with range option: python mm_flops.py -m 1024 -k 1024 --n_range 1024 2048 256 +usage: mm_flops.py [-h] (-m M [M ...] | --m_range M_RANGE [M_RANGE ...]) (-n [N ...] | --n_range N_RANGE [N_RANGE ...])(-k [K ...] | --k_range K_RANGE [K_RANGE ...]) [--num_iterations NUM_ITERATIONS] +[--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE] + +options: + -h, --help show this help message and exit + -m M [M ...] The first dimension of the GEMM, enter any number of arguments + --m_range M_RANGE [M_RANGE ...] + The first dimension of the GEMM, [start,stop,step] + -n [N ...] The shared dimension of the GEMM, enter any number of arguments + --n_range N_RANGE [N_RANGE ...] + The shared dimension of the GEMM, [start,stop,step] + -k [K ...] The last dimension of the GEMM, enter any number of arguments + --k_range K_RANGE [K_RANGE ...] + The last dimension of the GEMM, [start,stop,step] + --num_iterations NUM_ITERATIONS + The number of iterations used to benchmark each GEMM + --num_warmup_iterations NUM_WARMUP_ITERATIONS + The number of warmup iterations + --cuda_device CUDA_DEVICE + The cuda device to run the benchmark on + --output_file OUTPUT_FILE +``` + +## BMM Benchmarks +`bmm_flops.py` measures throughput of batched matrix multiplications $(b,m,n)\times (b,n,k)$. +``` +Example for bmm_flops.py: python bmm_flops.py -m 1024 -k 1024 -n 1024 2048 -b 128 +usage: bmm_flops.py [-h] (-b B [B ...] | --b_range B_RANGE [B_RANGE ...]) (-m M [M ...] | --m_range M_RANGE [M_RANGE ...])(-n [N ...] | --n_range N_RANGE [N_RANGE ...]) (-k [K ...] | --k_range K_RANGE [K_RANGE ...]) +[--num_iterations NUM_ITERATIONS] [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE][--output_file OUTPUT_FILE] + +options: + -h, --help show this help message and exit + -b B [B ...] The batched dimension of the BMM, enter any number of arguments + --b_range B_RANGE [B_RANGE ...] + The batched dimension of the BMM, [start,stop,step] + -m M [M ...] The first dimension of the BMM, enter any number of arguments + --m_range M_RANGE [M_RANGE ...] + The first dimension of the BMM, [start,stop,step] + -n [N ...] The shared dimension of the BMM, enter any number of arguments + --n_range N_RANGE [N_RANGE ...] + The shared dimension of the BMM, [start,stop,step] + -k [K ...] The last dimension of the BMM, enter any number of arguments + --k_range K_RANGE [K_RANGE ...] 
+ The last dimension of the BMM, [start,stop,step] + --num_iterations NUM_ITERATIONS + The number of iterations used to benchmark each BMM + --num_warmup_iterations NUM_WARMUP_ITERATIONS + The number of warmup iterations + --cuda_device CUDA_DEVICE + The cuda device to run the benchmark on + --output_file OUTPUT_FILE +``` + +## Transformer Layer Benchmarks +`transformer_flops.py` measures throughput of a transformer layer or of each block of a transformer layer. +``` +Example for transformer_flops.py: python transformer_flops.py --hidden_size 4096 --num_attention_heads 16 --microbatch_size 4 --seq_length 2048 --vocab_size 51200 --global_batch_size 256 --tensor_mp_size 1 --num_iterations 10 --num_warmup_iterations 5 +usage: transformer_flops.py [-h] + (--hidden_size HIDDEN_SIZE [HIDDEN_SIZE ...] | --hidden_size_range HIDDEN_SIZE_RANGE [HIDDEN_SIZE_RANGE ...]) + (--num_attention_heads NUM_ATTENTION_HEADS [NUM_ATTENTION_HEADS ...] | --num_attention_heads_range NUM_ATTENTION_HEADS_RANGE [NUM_ATTENTION_HEADS_RANGE ...]) + (--vocab_size VOCAB_SIZE [VOCAB_SIZE ...] | --vocab_size_range VOCAB_SIZE_RANGE [VOCAB_SIZE_RANGE ...]) + (--seq_length SEQ_LENGTH [SEQ_LENGTH ...] | --seq_length_range SEQ_LENGTH_RANGE [SEQ_LENGTH_RANGE ...]) + (--microbatch_size MICROBATCH_SIZE [MICROBATCH_SIZE ...] | --microbatch_size_range MICROBATCH_SIZE_RANGE [MICROBATCH_SIZE_RANGE ...]) + (--global_batch_size GLOBAL_BATCH_SIZE [GLOBAL_BATCH_SIZE ...] | --global_batch_size_range GLOBAL_BATCH_SIZE_RANGE [GLOBAL_BATCH_SIZE_RANGE ...]) + (--tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] | --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...]) + [--blocks BLOCKS [BLOCKS ...]] [--use_flash] [--num_iterations NUM_ITERATIONS] + [--num_warmup_iterations NUM_WARMUP_ITERATIONS] [--cuda_device CUDA_DEVICE] [--output_file OUTPUT_FILE] + +options: + -h, --help show this help message and exit + --hidden_size HIDDEN_SIZE [HIDDEN_SIZE ...] + The hidden dimension, enter any number of arguments + --hidden_size_range HIDDEN_SIZE_RANGE [HIDDEN_SIZE_RANGE ...] + The hidden dimension, [start,stop,step] + --num_attention_heads NUM_ATTENTION_HEADS [NUM_ATTENTION_HEADS ...] + The number of attention heads, enter any number of arguments + --num_attention_heads_range NUM_ATTENTION_HEADS_RANGE [NUM_ATTENTION_HEADS_RANGE ...] + The number of attention heads, [start,stop,step] + --vocab_size VOCAB_SIZE [VOCAB_SIZE ...] + The vocabulary size, enter any number of arguments + --vocab_size_range VOCAB_SIZE_RANGE [VOCAB_SIZE_RANGE ...] + The vocabulary size, [start,stop,step] + --seq_length SEQ_LENGTH [SEQ_LENGTH ...] + The sequence length, enter any number of arguments + --seq_length_range SEQ_LENGTH_RANGE [SEQ_LENGTH_RANGE ...] + The sequence length, [start,stop,step] + --microbatch_size MICROBATCH_SIZE [MICROBATCH_SIZE ...] + The microbatch size, enter any number of arguments + --microbatch_size_range MICROBATCH_SIZE_RANGE [MICROBATCH_SIZE_RANGE ...] + The microbatch size, [start,stop,step] + --global_batch_size GLOBAL_BATCH_SIZE [GLOBAL_BATCH_SIZE ...] + The global batch size, enter any number of arguments + --global_batch_size_range GLOBAL_BATCH_SIZE_RANGE [GLOBAL_BATCH_SIZE_RANGE ...] + The global batch size, [start,stop,step] + --tensor_mp_size TENSOR_MP_SIZE [TENSOR_MP_SIZE ...] + The tensor parallel size, enter any number of arguments + --tensor_mp_size_range TENSOR_MP_SIZE_RANGE [TENSOR_MP_SIZE_RANGE ...] + The tensor parallel size, [start,stop,step] + --blocks BLOCKS [BLOCKS ...] 
+ The transformer blocks to benchmark, enter "all" or any number of [qkv_transform, attention_score, + attention_over_value, attention_linear_projection, mlp_h_to_4h, mlp_4h_to_h, logit_block, layer_norm, dropout, + add_bias_dropout, softmax, gelu] + --use_flash Use flash attention + --num_iterations NUM_ITERATIONS + The number of iterations used to benchmark each BMM + --num_warmup_iterations NUM_WARMUP_ITERATIONS + The number of warmup iterations + --cuda_device CUDA_DEVICE + The cuda device to run the benchmark on + --output_file OUTPUT_FILE +``` + +## Output Files +The output files will be in a text based format, and can be read into a `Pandas.dataframe`. An example of this is found in `plotting/transformer_figures.ipynb`. Alternatively, users can convert this output file into a csv using the `plotting/convert_to_csv` script. +Example: +``` +python convert_to_csv.py --file_name ../results/bmm.out --output_file ../results/bmm.csv +``` + diff --git a/benchmarks/sizing/bmm_flops.py b/benchmarks/sizing/bmm_flops.py new file mode 100644 index 0000000..d391db9 --- /dev/null +++ b/benchmarks/sizing/bmm_flops.py @@ -0,0 +1,62 @@ +import time +import torch +import numpy as np +import sys +import argparse + +from utils import benchmark_bmm + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + b_group = parser.add_mutually_exclusive_group(required=True) + b_group.add_argument("-b", nargs="+", type=int, help='The batched dimension of the BMM, enter any number of arguments') + b_group.add_argument("--b_range", nargs='+', type=int, help="The batched dimension of the BMM, [start,stop,step]") + + m_group = parser.add_mutually_exclusive_group(required=True) + m_group.add_argument("-m", nargs="+", type=int, help='The first dimension of the BMM, enter any number of arguments') + m_group.add_argument("--m_range", nargs='+', type=int, help="The first dimension of the BMM, [start,stop,step]") + + n_group = parser.add_mutually_exclusive_group(required=True) + n_group.add_argument("-n", nargs="*", type=int, help='The shared dimension of the BMM, enter any number of arguments') + n_group.add_argument("--n_range", nargs='+', type=int, help="The shared dimension of the BMM, [start,stop,step]") + + k_group = parser.add_mutually_exclusive_group(required=True) + k_group.add_argument("-k", nargs="*", type=int, help='The last dimension of the BMM, enter any number of arguments') + k_group.add_argument("--k_range", nargs='+', type=int, help="The last dimension of the BMM, [start,stop,step]") + + parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM') + parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') + parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") + parser.add_argument("--output_file", type=str, default="../results/bmm.out") + args = parser.parse_args() + + b = args.b + m = args.m + n = args.n + k = args.k + + if b is None: + start,stop,step = args.b_range + b = np.arange(start,stop,step) + if m is None: + start,stop,step = args.m_range + m = np.arange(start,stop,step) + if n is None: + start,stop,step = args.n_range + n = np.arange(start,stop,step) + if k is None: + start,stop,step = args.k_range + k = np.arange(start,stop,step) + + # set cuda device + torch.cuda.set_device(f"cuda:{args.cuda_device}") + + # loop through all sizes to benchmark + with open(args.output_file, 'w') as sys.stdout: + for B in b: + for M in 
m: + for N in n: + for K in k: + benchmark_bmm(B, M, N, K, "bmm", args.num_iterations, args.num_warmup_iterations) + print("-" * 80) diff --git a/benchmarks/sizing/megatron/__init__.py b/benchmarks/sizing/megatron/__init__.py new file mode 100644 index 0000000..4a9f98a --- /dev/null +++ b/benchmarks/sizing/megatron/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + + +def print_rank_0(*message): + """If distributed is initialized print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(*message, flush=True) + else: + print(*message, flush=True) + + +from .initialize import initialize_megatron +from .neox_arguments import NeoXArgs diff --git a/benchmarks/sizing/megatron/checkpointing.py b/benchmarks/sizing/megatron/checkpointing.py new file mode 100644 index 0000000..8ad2a88 --- /dev/null +++ b/benchmarks/sizing/megatron/checkpointing.py @@ -0,0 +1,328 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Input/output checkpointing.""" + +import json +import os +import re +import shutil +import random +import sys +import numpy as np + +import torch +from glob import glob + +from megatron import mpu +from megatron import print_rank_0 +from megatron.utils import natural_sort +from megatron.text_generation_utils import get_batch, forward_model +from pathlib import Path +from pprint import pformat + + +def check_checkpoint_args(neox_args, checkpoint_args): + """Ensure fixed arguments for a model are the same for the input + arguments and the one retrieved from checkpoint.""" + + assert isinstance(checkpoint_args, dict), "args stored in checkpoint is a dict" + for checkpoint_arg_name, checkpoint_arg_value in checkpoint_args.items(): + args_value = getattr(neox_args, checkpoint_arg_name) + error_message = "{} value from checkpoint ({}) is not equal to the currently set argument value ({}).".format( + checkpoint_arg_name, checkpoint_arg_value, args_value + ) + assert checkpoint_arg_value == args_value, error_message + + +def do_forward_pass(neox_args, model, inference=False): + + # set to eval mode + model_was_in_train = model.training + model.eval() + + # get context tokens + # always forward full batch size + context_tokens_tensor = ( + torch.arange(neox_args.seq_length + 1) + .repeat((neox_args.train_micro_batch_size_per_gpu, 1)) + .cuda() + ) + + # forward + if inference: + tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens_tensor[:, : neox_args.seq_length] + ) + model_inputs = ( + tokens, + position_ids, + attention_mask, + torch.Tensor(), + ) + logits, _ = forward_model(neox_args, model, model_inputs) + elif neox_args.is_pipe_parallel: + data_iterator = iter([{"text": context_tokens_tensor}]) + _, logits = model.eval_batch(data_iter=data_iterator, return_logits=True) + else: + tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens_tensor[:, : neox_args.seq_length] + ) + logits = model((tokens, position_ids, attention_mask)) + + # reset to train mode, if model was in training before + if model_was_in_train: + model.train() + + if logits is not None: + logits = logits.detach().cpu()[ + 0 + ] # just return first batch item (they are all equal) + + return logits + + +def check_forward_pass(neox_args, model, checkpoint_logits, inference): + # do forward pass with loaded checkpoint + logits = do_forward_pass(neox_args=neox_args, model=model, inference=inference) + + # check + if ( + logits is not None and checkpoint_logits is not None + ): # this could be the case for non-final pipeline stages + if not (logits == checkpoint_logits).all().item(): + if mpu.get_data_parallel_rank() == 0: + print( + " > WARNING: validate_checkpoint_forward() forward after load of checkpoint does not yield exactly same result" + ) + assert ( + torch.isclose(logits, checkpoint_logits).all().item() + ), "validate_checkpoint_forward() forward after load of checkpoint does not yield a close result" + + +def ensure_directory_exists(filename): + """Build filename's path if it does not already exists.""" + dirname = os.path.dirname(filename) + if not os.path.exists(dirname): + os.makedirs(dirname) + + +def get_checkpoint_name(checkpoints_path, iteration, release=False, mp_rank=None): + """A unified checkpoint name.""" + if release: + directory = "release" + else: + directory = "iter_{:07d}".format(iteration) + return os.path.join( + checkpoints_path, + directory, + "mp_rank_{:02d}".format( + mpu.get_model_parallel_rank() if mp_rank is None else mp_rank + ), + 
"model_optim_rng.pt", + ) + + +def delete_old_checkpoints(save_dir, n_to_keep): + if torch.distributed.get_rank() == 0: + ckpt_dir_regex = r"global_step[\d]*" + if save_dir.endswith("/"): + save_dir = save_dir.strip("/") + all_ckpts = natural_sort( + [ + i + for i in glob(f"{save_dir}/*") + if os.path.isdir(i) and re.search(ckpt_dir_regex, i) + ] + ) + n_to_delete = len(all_ckpts) - n_to_keep + if n_to_delete > 0: + to_delete = all_ckpts[:n_to_delete] + print(f"WARNING: Deleting old checkpoints: \n\t{', '.join(to_delete)}") + for ckpt in to_delete: + try: + shutil.rmtree(ckpt) + except FileNotFoundError: + pass + + +def save_ds_checkpoint(iteration, model, neox_args): + """Save a model checkpoint.""" + sd = { + "iteration": iteration, + "args": { + "num_layers": neox_args.num_layers, + "hidden_size": neox_args.hidden_size, + "num_attention_heads": neox_args.num_attention_heads, + "max_position_embeddings": neox_args.max_position_embeddings, + "make_vocab_size_divisible_by": neox_args.make_vocab_size_divisible_by, + "padded_vocab_size": neox_args.padded_vocab_size, + "tokenizer_type": neox_args.tokenizer_type, + "model_parallel_size": neox_args.model_parallel_size, + }, + } + # rng states. + if not neox_args.no_save_rng: + sd["random_rng_state"] = random.getstate() + sd["np_rng_state"] = np.random.get_state() + sd["torch_rng_state"] = torch.get_rng_state() + sd["cuda_rng_state"] = torch.cuda.get_rng_state() + sd["rng_tracker_states"] = mpu.get_cuda_rng_tracker().get_states() + + if neox_args.checkpoint_validation_with_forward_pass: + logits = do_forward_pass(neox_args=neox_args, model=model) + sd["checkpoint_validation_logits"] = logits + + # checkpoint folder name + tag = f"global_step{iteration}" + + # save checkpoint + model.save_checkpoint(neox_args.save, tag=tag, client_state=sd) + + # save config files + if torch.distributed.get_rank() == 0 and neox_args.config_files is not None: + configs_directory = os.path.join(neox_args.save, tag, "configs") + os.makedirs(configs_directory, exist_ok=True) + for config_filename, config_data in neox_args.config_files.items(): + with open(os.path.join(configs_directory, config_filename), "w") as f: + if isinstance(config_data, str): + f.write(config_data) + else: + json.dump(config_data, f) + + +def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): + """Save a model checkpoint.""" + + if neox_args.deepspeed: + save_ds_checkpoint(iteration, model, neox_args) + else: + raise ValueError("Must be using deepspeed to use neox") + + # Wait so everyone is done (necessary) + torch.distributed.barrier() + if neox_args.keep_last_n_checkpoints is not None: + delete_old_checkpoints(neox_args.save, neox_args.keep_last_n_checkpoints) + + # Wait so everyone is done (not necessary) + torch.distributed.barrier() + + +def load_checkpoint( + neox_args, model, optimizer, lr_scheduler, inference=False, iteration=None +): + """Load a model checkpoint and return the iteration.""" + if neox_args.deepspeed: + load_optim_and_scheduler = ( + not neox_args.no_load_optim + ) # TODO: These should be configured by separate args + if neox_args.finetune: + load_optim_and_scheduler = False + if iteration is not None: + tag = f"global_step{iteration}" + else: + tag = None + checkpoint_name, state_dict = model.load_checkpoint( + neox_args.load, + load_optimizer_states=load_optim_and_scheduler, + load_lr_scheduler_states=load_optim_and_scheduler, + load_module_only=not load_optim_and_scheduler, + tag=tag, + ) + + if checkpoint_name is None: + # if an iteration is 
specified, we want to raise an error here rather than + # continuing silently, since we are trying to load a specific checkpoint + if iteration is not None: + available_checkpoints = sorted( + [ + int(i.name.replace("global_step", "")) + for i in Path(neox_args.load).glob("global_step*") + ] + ) + raise ValueError( + f"Unable to load checkpoint for iteration {iteration}. \nAvailable iterations: {pformat(available_checkpoints)}" + ) + if mpu.get_data_parallel_rank() == 0: + print("Unable to load checkpoint.") + + return 0 # iteration 0, if not checkpoint loaded + else: + raise ValueError("Must be using deepspeed to use neox") + + # Set iteration. + if neox_args.finetune: + iteration = 0 + else: + iteration = state_dict.get("iteration") or state_dict.get( + "total_iters" + ) # total_iters backward compatible with older checkpoints + if iteration is None: + raise ValueError( + f"Unable to load iteration from checkpoint {checkpoint_name} with keys {state_dict.keys()}, exiting" + ) + + # Check arguments. + if "args" in state_dict: + checkpoint_args = state_dict["args"] + check_checkpoint_args(neox_args=neox_args, checkpoint_args=checkpoint_args) + print_rank_0( + " > validated currently set args with arguments in the checkpoint ..." + ) + else: + print_rank_0(" > could not find arguments in the checkpoint for validation...") + + # Check loaded checkpoint with forward pass + if neox_args.checkpoint_validation_with_forward_pass: + if "checkpoint_validation_logits" in state_dict: + check_forward_pass( + neox_args=neox_args, + model=model, + checkpoint_logits=state_dict["checkpoint_validation_logits"], + inference=inference, + ) + print_rank_0(" > validated loaded checkpoint with forward pass ...") + else: + if mpu.get_data_parallel_rank() == 0: + print( + " > WARNING: checkpoint_validation_with_forward_pass is configured but no checkpoint validation data available in checkpoint {}".format( + checkpoint_name + ) + ) + + # rng states. + if not neox_args.finetune and not neox_args.no_load_rng: + try: + random.setstate(state_dict["random_rng_state"]) + np.random.set_state(state_dict["np_rng_state"]) + torch.set_rng_state(state_dict["torch_rng_state"]) + torch.cuda.set_rng_state(state_dict["cuda_rng_state"]) + mpu.get_cuda_rng_tracker().set_states(state_dict["rng_tracker_states"]) + except KeyError: + print_rank_0( + "Unable to load optimizer from checkpoint {}. " + "Specify --no-load-rng or --finetune to prevent " + "attempting to load the optimizer state, " + "exiting ...".format(checkpoint_name) + ) + sys.exit() + + torch.distributed.barrier() + if mpu.get_data_parallel_rank() == 0: + print(" successfully loaded {}".format(checkpoint_name)) + + return iteration diff --git a/benchmarks/sizing/megatron/data/Makefile b/benchmarks/sizing/megatron/data/Makefile new file mode 100644 index 0000000..8f9db76 --- /dev/null +++ b/benchmarks/sizing/megatron/data/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/benchmarks/sizing/megatron/data/__init__.py b/benchmarks/sizing/megatron/data/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/benchmarks/sizing/megatron/data/__init__.py @@ -0,0 +1 @@ +from . 
import * diff --git a/benchmarks/sizing/megatron/data/blendable_dataset.py b/benchmarks/sizing/megatron/data/blendable_dataset.py new file mode 100644 index 0000000..e05c584 --- /dev/null +++ b/benchmarks/sizing/megatron/data/blendable_dataset.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Blendable dataset.""" + +import time + +import numpy as np +import torch + +from megatron import print_rank_0 +from megatron import mpu + + +class BlendableDataset(torch.utils.data.Dataset): + def __init__(self, datasets, weights): + self.datasets = datasets + num_datasets = len(datasets) + assert num_datasets == len(weights) + + self.size = 0 + for dataset in self.datasets: + self.size += len(dataset) + + # Normalize weights. + weights = np.array(weights, dtype=np.float64) + sum_weights = np.sum(weights) + assert sum_weights > 0.0 + weights /= sum_weights + + # Build indices. + start_time = time.time() + assert num_datasets < 255 + self.dataset_index = np.zeros(self.size, dtype=np.uint8) + self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) + + from megatron.data import helpers + + helpers.build_blending_indices( + self.dataset_index, + self.dataset_sample_index, + weights, + num_datasets, + self.size, + torch.distributed.get_rank() == 0, + ) + + print( + "> RANK {} elapsed time for building blendable dataset indices: " + "{:.2f} (sec)".format( + torch.distributed.get_rank(), time.time() - start_time + ) + ) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + try: + dataset_idx = self.dataset_index[idx] + sample_idx = self.dataset_sample_index[idx] + return self.datasets[dataset_idx][sample_idx] + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] diff --git a/benchmarks/sizing/megatron/data/data_utils.py b/benchmarks/sizing/megatron/data/data_utils.py new file mode 100644 index 0000000..fba34e5 --- /dev/null +++ b/benchmarks/sizing/megatron/data/data_utils.py @@ -0,0 +1,495 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import torch +import numpy as np +from typing import List, Tuple +from itertools import zip_longest +from functools import partial + +from megatron import mpu, print_rank_0 +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.gpt2_dataset import GPT2Dataset +from megatron.data.samplers import DistributedBatchSampler + + +def make_data_loader(dataset, neox_args): + """Build dataloader given an input dataset.""" + if dataset is None: + return None + # Data parallel arguments. + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + global_batch_size = neox_args.batch_size * world_size + num_workers = neox_args.num_workers + + # Use a simple sampler with distributed batch sampler. + sampler = torch.utils.data.SequentialSampler(dataset) + batch_sampler = DistributedBatchSampler( + sampler=sampler, + batch_size=global_batch_size, + drop_last=True, + rank=rank, + world_size=world_size, + ) + # Torch dataloader. + return torch.utils.data.DataLoader( + dataset, batch_sampler=batch_sampler, num_workers=num_workers, pin_memory=True + ) + + +def build_the_dataset( + data_prefix, + name, + data_impl, + num_samples, + seq_length, + seed, + skip_warmup, + build_index_mappings=True, +): + """Build train/valid/test datasets.""" + + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + print_rank_0(" {}:".format(name)) + print_rank_0(" no. of documents:{}".format(total_num_of_documents)) + dataset = None + documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + build_index_mappings=build_index_mappings, + ) + return dataset + + +def build_train_valid_test_datasets( + data_prefix, + use_shared_fs, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, +): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. 
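+    # Note (added comment): `splits` is a list of four cumulative document
+    # boundaries, [0, train_end, valid_end, total_num_of_documents], so
+    # split i covers documents in [splits[i], splits[i + 1]).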
+ print_rank_0(" > dataset split:") + + def print_split_stats(name, index): + print_rank_0(" {}:".format(name)) + print_rank_0( + " document indices in [{}, {}) total of {} " + "documents".format( + splits[index], splits[index + 1], splits[index + 1] - splits[index] + ) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange( + start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 + ) + + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + train_valid_test_num_samples[index], + seq_length, + seed, + use_shared_fs=use_shared_fs, + ) + return dataset + + train_dataset = build_dataset(0, "train") + valid_dataset = build_dataset(1, "valid") + test_dataset = build_dataset(2, "test") + + return train_dataset, valid_dataset, test_dataset + + +def get_train_valid_test_split_(splits_string, size): + """Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(",") != -1: + splits = [float(s) for s in splits_string.split(",")] + elif splits_string.find("/") != -1: + splits = [float(s) for s in splits_string.split("/")] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.0) + splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + + +def get_normalized_weights_and_num_samples( + weights: List[float], num_samples: int +) -> Tuple[List[float], List[int]]: + # Normalize weights + weight_sum = sum(weights) + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + # Add 0.5% (the 1.005 factor) so in case the blending dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. 
+ weighted_num_samples = [] + for weight in weights: + weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) + return weights, weighted_num_samples + + +def build_weighted_datasets( + neox_args, + train_num_samples, + valid_num_samples, + test_num_samples, + train_weights, + valid_weights, + test_weights, + build_index_mappings=True, +): + # build individual datasets + train_datasets, valid_datasets, test_datasets = [], [], [] + for i, (train_path, valid_path, test_path) in enumerate( + zip_longest( + neox_args.train_data_paths, + neox_args.valid_data_paths, + neox_args.test_data_paths, + ) + ): + if train_path: + train_datasets.append( + build_the_dataset( + data_prefix=train_path, + name=f"train_{i}", + data_impl=neox_args.data_impl, + num_samples=train_num_samples[i], + seq_length=neox_args.seq_length, + seed=neox_args.seed, + skip_warmup=(not neox_args.mmap_warmup), + build_index_mappings=build_index_mappings, + ) + ) + + if valid_path: + valid_datasets.append( + build_the_dataset( + data_prefix=valid_path, + name=f"valid_{i}", + data_impl=neox_args.data_impl, + num_samples=valid_num_samples[i], + seq_length=neox_args.seq_length, + seed=neox_args.seed, + skip_warmup=(not neox_args.mmap_warmup), + build_index_mappings=build_index_mappings, + ) + ) + + if test_path: + test_datasets.append( + build_the_dataset( + data_prefix=test_path, + name=f"test_{i}", + data_impl=neox_args.data_impl, + num_samples=test_num_samples[i], + seq_length=neox_args.seq_length, + seed=neox_args.seed, + skip_warmup=(not neox_args.mmap_warmup), + build_index_mappings=build_index_mappings, + ) + ) + return train_datasets, valid_datasets, test_datasets + + +def weights_by_num_docs(l: list, alpha=0.3): + """ + Builds weights from a multinomial distribution over groups of data according to the number of + samples in each group. + + We sample from a group according to the probability p(L) ∝ |L| ** α, + where p(L) is the probability of sampling from a given group, + |L| is the number of examples in that datapoint, + and α is a coefficient that acts to upsample data from underrepresented groups + + Hence α (`alpha`) allows us to control how much to 'boost' the probability of training on low-resource groups. + + See https://arxiv.org/abs/1911.02116 for more details + """ + if len(l) == 1: + return [1.0] + + total_n_docs = sum(l) + unbiased_sample_probs = [i / total_n_docs for i in l] + + probs = [i**alpha for i in unbiased_sample_probs] + + # normalize + total = sum(probs) + probs = [i / total for i in probs] + + # weights should be the inverse of the number of samples + unbiased_sample_probs_inverse = [1 - p for p in unbiased_sample_probs] + weights = [p * p2 for p, p2 in zip(probs, unbiased_sample_probs_inverse)] + + # normalize + total = sum(weights) + weights = [i / total for i in weights] + + return weights + + +def build_train_valid_test_data_iterators(neox_args): + """XXX""" + + (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + + print_rank_0("> building train, validation, and test datasets ...") + + # Ensure only the first/last pipeline stages have data loaders + if neox_args.is_pipe_parallel: + is_first_stage = mpu.get_pipe_parallel_rank() == 0 + is_last_stage = ( + mpu.get_pipe_parallel_rank() == mpu.get_pipe_parallel_world_size() - 1 + ) + pipe_load = is_first_stage or is_last_stage + else: + pipe_load = True + + # Data loader only on rank 0 of each model parallel group. 
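+    # The remaining ranks skip dataset construction and learn whether
+    # training/validation/testing will run from the `flags` broadcast below.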
+ if mpu.get_model_parallel_rank() == 0 and pipe_load: + # Number of train/valid/test samples. + train_iters = neox_args.train_iters + eval_iters = (train_iters // neox_args.eval_interval + 1) * neox_args.eval_iters + test_iters = neox_args.eval_iters + train_val_test_num_samples = [ + train_iters * neox_args.train_batch_size, + eval_iters * neox_args.train_batch_size, + test_iters * neox_args.train_batch_size, + ] + + if neox_args.train_data_paths: + # when individual train / valid / test data paths are provided + # normalize weight values and get num samples for each dataset + train_weights, train_num_samples = get_normalized_weights_and_num_samples( + neox_args.train_data_weights, train_val_test_num_samples[0] + ) + valid_weights, valid_num_samples = get_normalized_weights_and_num_samples( + neox_args.valid_data_weights, train_val_test_num_samples[1] + ) + test_weights, test_num_samples = get_normalized_weights_and_num_samples( + neox_args.test_data_weights, train_val_test_num_samples[2] + ) + + # build individual datasets + train_datasets, valid_datasets, test_datasets = build_weighted_datasets( + neox_args, + train_num_samples, + valid_num_samples, + test_num_samples, + train_weights, + valid_weights, + test_weights, + build_index_mappings=not neox_args.weight_by_num_documents, + ) + + if neox_args.weight_by_num_documents: + + # gets the number of documents in each datapath + get_num_docs_list = lambda datasets: [ + dataset.indexed_dataset.sizes.shape[0] for dataset in datasets + ] + train_num_docs, valid_num_docs, test_num_docs = ( + get_num_docs_list(train_datasets), + get_num_docs_list(valid_datasets), + get_num_docs_list(test_datasets), + ) + + # builds weights according to alpha + the number of docs + fn = partial( + weights_by_num_docs, alpha=neox_args.weighted_sampler_alpha + ) + train_weights, valid_weights, test_weights = ( + fn(train_num_docs), + fn(valid_num_docs), + fn(test_num_docs), + ) + ( + train_weights, + train_num_samples, + ) = get_normalized_weights_and_num_samples( + train_weights, train_val_test_num_samples[0] + ) + ( + valid_weights, + valid_num_samples, + ) = get_normalized_weights_and_num_samples( + valid_weights, train_val_test_num_samples[1] + ) + test_weights, test_num_samples = get_normalized_weights_and_num_samples( + test_weights, train_val_test_num_samples[2] + ) + + # rebuild datasets weighted according to new weights + train_datasets, valid_datasets, test_datasets = build_weighted_datasets( + neox_args, + train_num_samples, + valid_num_samples, + test_num_samples, + train_weights, + valid_weights, + test_weights, + ) + + if train_datasets: + train_ds = BlendableDataset(train_datasets, train_weights) + if valid_datasets: + valid_ds = BlendableDataset(valid_datasets, valid_weights) + if test_datasets: + test_ds = BlendableDataset(test_datasets, test_weights) + else: + # when just data_path is provided + # split dataset into train, valid and test from data_path + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=neox_args.data_path, + use_shared_fs=neox_args.use_shared_fs, + data_impl=neox_args.data_impl, + splits_string=neox_args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=neox_args.seq_length, + seed=neox_args.seed, + skip_warmup=(not neox_args.mmap_warmup), + ) + + # Build dataloders. 
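+        # make_data_loader returns None for a missing dataset, which in turn
+        # clears the corresponding do_train/do_valid/do_test flag below.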
+ train_dataloader = make_data_loader(train_ds, neox_args=neox_args) + valid_dataloader = make_data_loader(valid_ds, neox_args=neox_args) + test_dataloader = make_data_loader(test_ds, neox_args=neox_args) + + # Flags to know if we need to do training/validation/testing. + do_train = train_dataloader is not None and neox_args.train_iters > 0 + do_valid = valid_dataloader is not None and neox_args.eval_iters > 0 + do_test = test_dataloader is not None and neox_args.eval_iters > 0 + # Need to broadcast num_tokens and num_type_tokens. + flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)]) + else: + flags = torch.cuda.LongTensor([0, 0, 0]) + + # Broadcast num tokens. + if neox_args.is_pipe_parallel: + # Only first/last pipeline stages have data loaders, so pipeline parallelism should + # broadcast globally instead of just the model parallel group. + torch.distributed.broadcast(flags, src=0) + else: + torch.distributed.broadcast( + flags, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + neox_args.do_train = flags[0].item() + neox_args.do_valid = flags[1].item() + neox_args.do_test = flags[2].item() + + # Shift the start iterations. + if train_dataloader is not None: + train_dataloader.batch_sampler.start_iter = ( + neox_args.iteration * neox_args.gradient_accumulation_steps + ) % len(train_dataloader) + print_rank_0( + "setting training data start iteration to {}".format( + train_dataloader.batch_sampler.start_iter + ) + ) + if valid_dataloader is not None: + start_iter_val = ( + (neox_args.iteration * neox_args.gradient_accumulation_steps) + // neox_args.eval_interval + ) * neox_args.eval_iters + valid_dataloader.batch_sampler.start_iter = start_iter_val % len( + valid_dataloader + ) + print_rank_0( + "setting validation data start iteration to {}".format( + valid_dataloader.batch_sampler.start_iter + ) + ) + + # Build iterators. + if train_dataloader is not None: + train_data_iterator = iter(train_dataloader) + else: + train_data_iterator = None + + if valid_dataloader is not None: + valid_data_iterator = iter(valid_dataloader) + else: + valid_data_iterator = None + + if test_dataloader is not None: + test_data_iterator = iter(test_dataloader) + else: + test_data_iterator = None + + return train_data_iterator, valid_data_iterator, test_data_iterator + + +def compile_helper(): + """Compile helper function at runtime. Make sure this + is invoked on a single process.""" + import os + import subprocess + + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(["make", "-C", path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + + sys.exit(1) diff --git a/benchmarks/sizing/megatron/data/gpt2_dataset.py b/benchmarks/sizing/megatron/data/gpt2_dataset.py new file mode 100644 index 0000000..cd6cf86 --- /dev/null +++ b/benchmarks/sizing/megatron/data/gpt2_dataset.py @@ -0,0 +1,315 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT2 style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 + + +class GPT2Dataset(torch.utils.data.Dataset): + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + build_index_mappings=True, + use_shared_fs=True, + ): + + self.name = name + self.indexed_dataset = indexed_dataset + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + if build_index_mappings: + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + num_samples, + seq_length, + seed, + use_shared_fs=use_shared_fs, + ) + self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 + self.sample_idx_len = self.sample_idx.shape[0] - 1 + + if self.shuffle_idx_len != self.sample_idx_len: + print( + f"WARNING: shuffle index length ({self.shuffle_idx_len}) is not equal to sample index length ({self.sample_idx_len})" + ) + + def __len__(self): + return min(self.shuffle_idx_len, self.sample_idx_len) + + def __getitem__(self, idx): + try: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) + else: + # Otherwise, get the rest of the initial document. + sample_list = [ + self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append( + self.indexed_dataset.get( + self.doc_idx[doc_index_l], length=offset_l + 1 + ) + ) + sample = np.concatenate(sample_list) + + return {"text": np.array(sample, dtype=np.int64)} + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] + + +def _build_index_mappings( + name, + data_prefix, + documents, + sizes, + num_samples, + seq_length, + seed, + use_shared_fs=True, +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. 
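+    # Each sample consumes seq_length + 1 tokens (inputs plus shifted labels),
+    # which is why _num_epochs and the sample-index builders below work with
+    # seq_length + 1 and carry the trailing -1 adjustments.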
+ tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = data_prefix + _filename += "_{}_indexmap".format(name) + _filename += "_{}ns".format(num_samples) + _filename += "_{}sl".format(seq_length) + _filename += "_{}s".format(seed) + doc_idx_filename = _filename + "_doc_idx.npy" + sample_idx_filename = _filename + "_sample_idx.npy" + shuffle_idx_filename = _filename + "_shuffle_idx.npy" + + if not use_shared_fs: + should_process_dataset = int(os.environ["LOCAL_RANK"]) == 0 + else: + should_process_dataset = torch.distributed.get_rank() == 0 + + # Build the indexed mapping if not exist. + if should_process_dataset: + if ( + (not os.path.isfile(doc_idx_filename)) + or (not os.path.isfile(sample_idx_filename)) + or (not os.path.isfile(shuffle_idx_filename)) + ): + print_rank_0( + " > WARNING: could not find index map files, building " + "the indices on rank 0 ..." + ) + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + from megatron.data import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + + num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length + if 2 * (num_samples + 1) < np.iinfo(np.int32).max: + sample_idx = helpers.build_sample_idx_int32( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + else: + sample_idx = helpers.build_sample_idx_int64( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retrieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_io_parallel_group() + ) + + # Load mappings. 
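+    # The index files are opened with mmap_mode="r", so every rank reads them
+    # lazily from disk instead of materializing the full arrays in memory.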
+ start_time = time.time() + print_rank_0(" > loading doc-idx mapping from {}".format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading sample-idx mapping from {}".format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + print_rank_0(" total number of samples: {}".format(sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence length, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng): + """Build an array with length = number-of-epochs * number-of-documents. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Beginning offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the beginning of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. 
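+        # Illustrative entry (made-up values): [3, 128] means the next sample
+        # starts 128 tokens into the document at doc_idx[3].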
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(size, np_rng): + """Build the range [0, size) and shuffle.""" + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return shuffle_idx diff --git a/benchmarks/sizing/megatron/data/helpers.cpp b/benchmarks/sizing/megatron/data/helpers.cpp new file mode 100644 index 0000000..830326c --- /dev/null +++ b/benchmarks/sizing/megatron/data/helpers.cpp @@ -0,0 +1,756 @@ +/* + coding=utf-8 + Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + +void build_blending_indices(py::array_t& dataset_index, + py::array_t& dataset_sample_index, + const py::array_t& weights, + const int32_t num_datasets, + const int64_t size, + const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those weights.*/ + + if (verbose) { std::cout << "> building indices for blendable datasets ..." << std::endl; } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) { current_samples[i] = 0; } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) { + // Determine where the max error in sampling is happening. + double sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = + weights_ptr[0] * sample_idx_double - static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. 
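+        // Greedy choice: each step picks the dataset whose achieved count lags
+        // its target (weight * samples_so_far) the most; e.g. with weights
+        // {0.7, 0.3} the first picks are datasets 0, 1, 0, 0, 1 (illustrative).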
+ current_samples[max_error_index] += 1; + } + + // print info + if (verbose) { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) { + auto ratio = + static_cast(current_samples[dataset_idx]) / static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] + << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx_int32(const py::array_t& sizes_, + const py::array_t& doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int32_t* sample_idx = new int32_t[2 * (num_samples + 1)]; + + cout << " using:" << endl << std::flush; + cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl << std::flush; + cout << " sequence length: " << seq_length << endl << std::flush; + cout << " total number of samples: " << num_samples << endl << std::flush; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Beginning offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } else { + // Otherwise, start from the beginning of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void* mem_) { + int32_t* mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. 
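+    // Ownership of sample_idx passes to numpy via the capsule above, so the
+    // buffer is returned without a copy and freed when the array is collected.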
+ const auto byte_size = sizeof(int32_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +py::array build_sample_idx_int64(const py::array_t& sizes_, + const py::array_t& doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int64_t* sample_idx = new int64_t[2 * (num_samples + 1)]; + + cout << " using:" << endl << std::flush; + cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl << std::flush; + cout << " sequence length: " << seq_length << endl << std::flush; + cout << " total number of samples: " << num_samples << endl << std::flush; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Beginning offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } else { + // Otherwise, start from the beginning of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void* mem_) { + int64_t* mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(int64_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937& rand32_gen) +{ + /* Training sample length. 
*/ + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) { return 2 + random_number % (max_length - 1); } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t& docs_, + const py::array_t& sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob > 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. + const auto short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + + if (verbose) { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; + cout << " sentences range: [" << sent_start_index << ", " + << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl << std::flush; + cout << " number of epochs: " << num_epochs << endl << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl << std::flush; + cout << " seed: " << seed << endl << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx* maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) { + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { + if (map_index >= max_num_samples) { + if (verbose && (!second)) { + cout << " reached " << max_num_samples << " samples after " << epoch + << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the beginning of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. 
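On `get_target_sample_len` above: `short_seq_prob` is first converted to an integer ratio, and roughly one sample in every `short_seq_ratio` then gets a target length drawn from [2, max_length] instead of the full `max_length`. A quick Python sketch of the same rule; it uses Python's `random` rather than `std::mt19937`, so the draws are not bit-identical to the C++.

```
import random

def target_sample_len(short_seq_ratio, max_length, rng):
    # Roughly 1 in short_seq_ratio samples gets a shorter target length in [2, max_length].
    r = rng.getrandbits(32)
    if r % short_seq_ratio == 0:
        return 2 + r % (max_length - 1)
    return max_length

rng = random.Random(1234)
short_seq_prob = 0.1
short_seq_ratio = round(1.0 / short_seq_prob)  # probability -> integer ratio, as in the C++
lengths = [target_sample_len(short_seq_ratio, 512, rng) for _ in range(100_000)]
print("fraction shortened:", sum(l < 512 for l in lengths) / len(lengths))  # ~0.1
```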
+ auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) { + if (num_remain_sent == 0) { ++empty_docs; } + if (num_remain_sent == 1) { ++one_sent_docs; } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) { + for (auto sent_index = sent_index_first; sent_index < sent_index_last; + ++sent_index) { + if (sizes[sent_index] > LONG_SENTENCE_LEN) { + if ((epoch == 0) && (!second)) { ++long_sent_docs; } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent > 1) && (!contains_long_sentence)) { + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = + get_target_sample_len(short_seq_ratio, max_seq_length, rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; sent_index < sent_index_last; + ++sent_index) { + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && (num_remain_sent > 1) && + (num_sent > 1)) || + (num_remain_sent == 0)) { + // Check for overflow. + if ((3 * map_index + 2) > std::numeric_limits::max()) { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = + get_target_sample_len(short_seq_ratio, max_seq_length, rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) { + if (verbose) { + cout << " number of empty documents: " << empty_docs << endl << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void* mem_) { + DocIdx* mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. 
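`build_mapping_impl` above follows a count-then-fill pattern: the first iteration only counts samples (with the RNG seeded identically), memory is allocated exactly once, and the second iteration writes the (start sentence, end sentence, target sequence length) rows. A stripped-down Python sketch of that pattern; the per-document packing logic here is a stand-in, not the real sentence packing.

```
import numpy as np

def two_pass_build(docs_sentence_counts, max_seq_length=512):
    """Count-then-fill sketch: pass 1 counts, allocate once, pass 2 fills the rows."""
    maps, num_samples = None, 0
    for second in (False, True):
        map_index = 0
        for n_sent in docs_sentence_counts:
            for start in range(n_sent):          # stand-in for the real packing logic
                if second:
                    maps[map_index] = (start, start + 1, max_seq_length)
                map_index += 1
        if not second:
            num_samples = map_index
            maps = np.zeros((num_samples, 3), dtype=np.uint32)
    return maps

print(two_pass_build([2, 1, 3]))
```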
+ const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t& docs_, + const py::array_t& sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose) +{ + if (sizes_.size() > std::numeric_limits::max()) { + if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } + return build_mapping_impl(docs_, + sizes_, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + verbose); + } else { + if (verbose) { cout << " using uint32 for data mapping..." << endl << std::flush; } + return build_mapping_impl(docs_, + sizes_, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + verbose); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t& docs_, + const py::array_t& sizes_, + const py::array_t& titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; + cout << " sentences range: [" << sent_start_index << ", " + << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl << std::flush; + cout << " number of epochs: " << num_epochs << endl << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl << std::flush; + cout << " seed: " << seed << endl << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx* maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) { min_num_sent = 1; } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) { + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) { + if (verbose && (!second)) { + cout << " reached " << max_num_samples << " samples after " << epoch + << " epochs ..." 
<< endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the beginning of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) { + if (num_remain_sent == 0) { ++empty_docs; } + if (num_remain_sent == 1) { ++one_sent_docs; } + } + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) { + for (auto sent_index = sent_index_first; sent_index < sent_index_last; + ++sent_index) { + if (sizes[sent_index] > LONG_SENTENCE_LEN) { + if ((epoch == 0) && (!second)) { ++long_sent_docs; } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; sent_index < sent_index_last; + ++sent_index) { + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) { + // Populate the map. + if (second) { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending + // sentence index, the index of the document from which the block + // comes (used for fetching titles) and the unique id of the block + // (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) { + if (verbose) { + cout << " number of empty documents: " << empty_docs << endl << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. 
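The comment above introduces the final step, implemented by the code just below: a Fisher-Yates pass over rows of the flat map, driven by a 64-bit generator because there can be more than 2^32 samples. The same row shuffle in NumPy terms, as a sketch only; NumPy's generator is not `std::mt19937_64`, so the permutation will not match bit-for-bit.

```
import numpy as np

def shuffle_rows_inplace(maps, seed):
    """Fisher-Yates over the rows of a [num_samples, k] map array."""
    rng = np.random.RandomState(seed)
    for i in range(maps.shape[0] - 1, 0, -1):
        j = rng.randint(0, i + 1)        # uniform in [0, i]
        maps[[i, j]] = maps[[j, i]]      # swap whole rows
    return maps

maps = np.arange(16, dtype=np.uint32).reshape(4, 4)  # (start, end, doc, block_id) rows
print(shuffle_rows_inplace(maps, seed=1235))
```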
+ std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void* mem_) { + DocIdx* mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t& docs_, + const py::array_t& sizes_, + const py::array_t& titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + if (sizes_.size() > std::numeric_limits::max()) { + if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } + return build_blocks_mapping_impl(docs_, + sizes_, + titles_sizes_, + num_epochs, + max_num_samples, + max_seq_length, + seed, + verbose, + use_one_sent_blocks); + } else { + if (verbose) { cout << " using uint32 for data mapping..." << endl << std::flush; } + return build_blocks_mapping_impl(docs_, + sizes_, + titles_sizes_, + num_epochs, + max_num_samples, + max_seq_length, + seed, + verbose, + use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx_int32", &build_sample_idx_int32); + m.def("build_sample_idx_int64", &build_sample_idx_int64); + m.def("build_blending_indices", &build_blending_indices); +} diff --git a/benchmarks/sizing/megatron/data/indexed_dataset.py b/benchmarks/sizing/megatron/data/indexed_dataset.py new file mode 100644 index 0000000..21ac734 --- /dev/null +++ b/benchmarks/sizing/megatron/data/indexed_dataset.py @@ -0,0 +1,597 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
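One note on the `PYBIND11_MODULE(helpers, m)` registration above before the body of `indexed_dataset.py`: these bindings are what the Python data pipeline calls. Assuming the extension has been compiled and is importable as `megatron.data.helpers`, and that the array dtypes match the templated signatures (in the upstream Megatron source `dataset_index` is uint8 and `dataset_sample_index` is int64), a call looks roughly like this. It is a usage sketch, not a test of this exact build.

```
import numpy as np
from megatron.data import helpers  # the compiled pybind11 module defined above

# Three documents with 5, 3 and 7 tokens, visited for one epoch.
sizes = np.array([5, 3, 7], dtype=np.int32)
doc_idx = np.array([0, 1, 2], dtype=np.int32)
sample_idx = helpers.build_sample_idx_int32(sizes, doc_idx, 4, 1, 15)
print(sample_idx.shape)  # (num_samples + 1, 2)

# Blend two datasets 70/30 over 10 samples.
n = 10
dataset_index = np.zeros(n, dtype=np.uint8)
dataset_sample_index = np.zeros(n, dtype=np.int64)
helpers.build_blending_indices(dataset_index, dataset_sample_index,
                               np.array([0.7, 0.3]), 2, n, False)
print(dataset_index)         # expected [0 1 0 0 1 0 0 1 0 0]: 7 draws from dataset 0, 3 from dataset 1
print(dataset_sample_index)  # running per-dataset sample counters
```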
+ +import os +import shutil +import struct +from functools import lru_cache +from itertools import accumulate + +import numpy as np +import torch + +from megatron import print_rank_0 + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), "rb") as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return "cached" + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return "mmap" + else: + return None + else: + print(f"Dataset does not exist: {path}") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." + ) + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == "mmap": + return MMapIndexedDatasetBuilder( + out_file, dtype=__best_fitting_dtype(vocab_size) + ) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." + ) + return None + if impl == "infer": + impl = infer_dataset_impl(path) + if impl == "lazy" and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == "cached" and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == "mmap" and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == "mmap": + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float32, + 7: np.float64, + 8: np.uint16, +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + ".idx" + + +def data_file_path(prefix_path): + return prefix_path + ".bin" + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + + _HDR_MAGIC = b"TNTIDX\x00\x00" + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), "rb") as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + "Index file doesn't match expected format. " + "Make sure that --dataset-impl is configured properly." 
+ ) + version = f.read(8) + assert struct.unpack("= self._len: + raise IndexError("index out of range") + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start] : self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return os.path.exists(index_file_path(path)) and os.path.exists( + data_file_path(path) + ) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx : ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx : ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float32: 4, + np.float64: 8, + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, "wb") + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, np_array): + assert isinstance(np_array, np.ndarray) and np_array.dtype == self.dtype + bytes = self.out_file.write(np_array) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for 
s in np_array.shape: + self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(np_array.shape)) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + begin = self.data_offsets[-1] + for offset in index.data_offsets[1:]: + self.data_offsets.append(begin + offset) + self.sizes.extend(index.sizes) + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + with open(data_file_path(another_file), "rb") as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, "wb") + index.write(b"TNTIDX\x00\x00") + index.write(struct.pack("= 0: + g.manual_seed(self.epoch) + if self.replacement: + return iter( + torch.randint( + high=n, size=(self.num_samples,), dtype=torch.int64, generator=g + ).tolist() + ) + return iter(torch.randperm(n, generator=g).tolist()) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedBatchSampler(data.sampler.BatchSampler): + """Similar to normal implementation of distributed sampler, except + implementation is at the batch sampler level, instead of just the + sampler level. This allows wrapping of arbitrary data samplers + (sequential, random, WeightedRandomSampler, etc.) with this batch + sampler. + + The `interleave` argument specifies how to distribute a batch. A value + of True combined with the above random sampler is equivalent to pytorch's + torch.utils.data.distributed.DistributedSampler. + + For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2 + specifying True will result in the following samples for each gpu: + GPU0: [0,2,4,6] GPU1: [1,3,5,7] + specifying False will result in the following samples: + GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" + + def __init__( + self, + sampler, + batch_size, + drop_last, + rank=-1, + world_size=2, + wrap_last=False, + interleave=False, + ): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) + if rank == -1: + assert False, "should not be here" + rank = torch.distributed.get_rank() + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + self.interleave = interleave + + def __iter__(self): + batch = [] + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter: + yield tbatch + self.start_iter = 0 + i += 1 + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= self.batch_size + self.wrap_around += len(batch) + self.wrap_around %= self.batch_size + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around % self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + if self.interleave: + return batch[self.rank : self.batch_size : self.world_size] 
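`_batch` above is the piece that decides which slice of a full batch each data-parallel rank sees, in either interleaved or contiguous mode. A small stand-alone sketch of both modes, using `len(batch)` in place of `self.batch_size` and hypothetical names, matching the docstring's example of a batch of 8 split across 2 ranks:

```
def split_batch(batch, rank, world_size, interleave):
    """Sketch of DistributedBatchSampler._batch: stride (interleave) or contiguous slice."""
    if interleave:
        return batch[rank:len(batch):world_size]
    start = rank * len(batch) // world_size
    end = (rank + 1) * len(batch) // world_size
    return batch[start:end]

batch = list(range(8))
print(split_batch(batch, rank=0, world_size=2, interleave=True))   # [0, 2, 4, 6]
print(split_batch(batch, rank=0, world_size=2, interleave=False))  # [0, 1, 2, 3]
```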
+ start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] diff --git a/benchmarks/sizing/megatron/fused_kernels.egg-info/PKG-INFO b/benchmarks/sizing/megatron/fused_kernels.egg-info/PKG-INFO new file mode 100644 index 0000000..ddcb9f3 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels.egg-info/PKG-INFO @@ -0,0 +1,5 @@ +Metadata-Version: 2.1 +Name: fused-kernels +Version: 0.0.1 +Author: Sid Black & Alejandro Molina et al. +Author-email: alejandro.molina@aleph-alpha.de diff --git a/benchmarks/sizing/megatron/fused_kernels.egg-info/SOURCES.txt b/benchmarks/sizing/megatron/fused_kernels.egg-info/SOURCES.txt new file mode 100644 index 0000000..1d87938 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels.egg-info/SOURCES.txt @@ -0,0 +1,12 @@ +/fsx/home-jacob/TransformerSizing/megatron/fused_kernels/scaled_masked_softmax.cpp +/fsx/home-jacob/TransformerSizing/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +/fsx/home-jacob/TransformerSizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp +/fsx/home-jacob/TransformerSizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +/fsx/home-quentin/jacob/TransformerSizing/megatron/fused_kernels/scaled_masked_softmax.cpp +/fsx/home-quentin/jacob/TransformerSizing/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +/fsx/home-quentin/jacob/TransformerSizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp +/fsx/home-quentin/jacob/TransformerSizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +fused_kernels.egg-info/PKG-INFO +fused_kernels.egg-info/SOURCES.txt +fused_kernels.egg-info/dependency_links.txt +fused_kernels.egg-info/top_level.txt \ No newline at end of file diff --git a/benchmarks/sizing/megatron/fused_kernels.egg-info/dependency_links.txt b/benchmarks/sizing/megatron/fused_kernels.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/benchmarks/sizing/megatron/fused_kernels.egg-info/top_level.txt b/benchmarks/sizing/megatron/fused_kernels.egg-info/top_level.txt new file mode 100644 index 0000000..eb14337 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels.egg-info/top_level.txt @@ -0,0 +1,2 @@ +scaled_masked_softmax_cuda +scaled_upper_triang_masked_softmax_cuda diff --git a/benchmarks/sizing/megatron/fused_kernels/__init__.py b/benchmarks/sizing/megatron/fused_kernels/__init__.py new file mode 100644 index 0000000..80caa2d --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/__init__.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
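That closes out the data-side files (`indexed_dataset.py` and `samplers.py`) before the fused-kernel sources begin. For orientation, here is a minimal round trip through the builder/reader API sketched from the code above, assuming it behaves as in upstream Megatron/fairseq; the output path is hypothetical, and the non-mmap (`lazy`) branch is used because the mmap classes are not shown in full here.

```
import numpy as np
from megatron.data import indexed_dataset

prefix = "/tmp/toy_dataset"  # hypothetical location for the .bin/.idx pair

# Write two "documents" of token ids.
builder = indexed_dataset.make_builder(indexed_dataset.data_file_path(prefix), impl="lazy")
for doc in ([1, 2, 3, 4], [5, 6]):
    builder.add_item(np.array(doc, dtype=np.int32))  # dtype must match the builder's dtype
    builder.end_document()
builder.finalize(indexed_dataset.index_file_path(prefix))

# Read them back.
ds = indexed_dataset.make_dataset(prefix, impl="lazy")
print(len(ds), ds.sizes[:2], ds[0])  # 2 items, sizes [4 2], first item [1 2 3 4]
```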
+ +import os +import pathlib +import subprocess + +from torch.utils import cpp_extension +from pathlib import Path + +srcpath = Path(__file__).parent.absolute() + +# Setting this param to a list has a problem of generating different +# compilation commands (with different order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicitly in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + +def load_fused_kernels(): + try: + import scaled_upper_triang_masked_softmax_cuda + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError) as e: + print("\n") + print(e) + print("=" * 100) + print( + f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them" + ) + print("=" * 100) + exit() + return diff --git a/benchmarks/sizing/megatron/fused_kernels/compat.h b/benchmarks/sizing/megatron/fused_kernels/compat.h new file mode 100644 index 0000000..251337d --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/compat.h @@ -0,0 +1,29 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*This code is copied from NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. */ + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.cpp b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.cpp new file mode 100644 index 0000000..b7c162c --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.cpp @@ -0,0 +1,83 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor); + +torch::Tensor bwd_cuda(torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads); + +torch::Tensor fwd(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) +{ + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); + + return fwd_cuda(input, mask, scale_factor); +} + +torch::Tensor bwd(torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) +{ + AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads) +{ + return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); +} + +} // end namespace scaled_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("forward", + &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + + m.def("backward", + &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); + + m.def("get_batch_per_block", + &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, + "Return Batch per block size."); +} diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.h b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.h new file mode 100644 index 0000000..977e594 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax.h @@ -0,0 +1,550 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype* dst, const Datatype* src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16* dst, + const c10::BFloat16* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16* dst, + const c10::BFloat16* src) +{ + *((float2*)dst) = *((float2*)src); +} + +template <> +__device__ __inline__ void copy_vector(c10::Half* dst, const c10::Half* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(c10::Half* dst, const c10::Half* src) +{ + *((float2*)dst) = *((float2*)src); +} + +template <> +__device__ __inline__ void copy_vector(uint8_t* dst, const uint8_t* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(uint8_t* dst, const uint8_t* src) +{ + *((half2*)dst) = *((half2*)src); +} + +int log2_ceil(int value) +{ + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } +}; + +template +__device__ __forceinline__ T +WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) +{ + ReduceOp r; +#pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Explicit masking + */ +template +__global__ void scaled_masked_softmax_warp_forward(output_t* dst, + const input_t* src, + const uint8_t* mask, + const acc_t scale, + int micro_batch_size, + int element_count, + int pad_batches) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two + : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = + (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + + threadIdx.y) * + WARP_BATCH; + int pad_first_batch = 0; + if (pad_batches != 1) { // bert style + pad_first_batch = + (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; + } else { // gpt2 style + pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + } + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. 
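Stepping back from the kernel body for a moment: in plain PyTorch, the forward pass above computes a scaled, additively masked softmax over the key dimension, writing -10000 into positions where `mask == 1` before normalizing. The snippet below is only an eager-mode reference for those semantics, with illustrative shapes; it is not a numerical-parity test of the fused kernel.

```
import torch

def scaled_masked_softmax_ref(x, mask, scale):
    """Reference semantics: scale scores, mask with -10000, softmax over keys.
    x: [batches, heads, q_len, k_len]; mask: broadcastable uint8/bool, 1 = masked."""
    scores = x.float() * scale
    scores = scores.masked_fill(mask.bool(), -10000.0)
    return torch.softmax(scores, dim=-1)

b, h, sq, sk = 2, 4, 16, 16
x = torch.randn(b, h, sq, sk)
mask = (torch.rand(b, 1, sq, sk) > 0.9).to(torch.uint8)
probs = scaled_masked_softmax_ref(x, mask, scale=0.125)
print(probs.shape, torch.allclose(probs.sum(-1), torch.ones(b, h, sq)))
```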
+ int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + int itr_idx = i * element_count + it * WARP_SIZE; + copy_vector(temp_data, src + itr_idx); + copy_vector(temp_mask, mask + itr_idx); + +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -10000.0; + } + } + } else { +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; +#pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH]{0.0f}; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = elements[i][it + element] / sum[i]; + } + copy_vector( + dst + i * element_count + it * WARP_SIZE, out); + } else { + break; + } + } + } +} + +template +__global__ void scaled_masked_softmax_warp_backward(output_t* gradInput, + input_t* grad, + const input_t* output, + acc_t scale, + int micro_batch_size, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two + : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 
1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f}; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f}; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector( + temp_grad, grad + i * element_count + it * WARP_SIZE); + copy_vector( + temp_output, output + i * element_count + it * WARP_SIZE); + +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + grad_reg[i][it + element] = + (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + + acc_t sum[WARP_BATCH]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; +#pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { sum[i] += grad_reg[i][it]; } + } + warp_reduce(sum); + +// store result +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - + output_reg[i][it + element] * sum[i])); + } + copy_vector( + gradInput + i * element_count + it * WARP_SIZE, out); + } + } + } +} +} // end of anonymous namespace + +int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads) +{ + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + + int batch_count = batches * attn_heads * query_seq_len; + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + + return batches_per_block; +} + +template +void dispatch_scaled_masked_softmax_forward(output_t* dst, + const input_t* src, + const uint8_t* mask, + const input_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads, + int pad_batches) +{ + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside + // softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + dim3 blocks(query_seq_len / batches_per_block, attn_heads, batches); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 1: // 2 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 2: // 4 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 3: // 8 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 4: // 16 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 5: // 32 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 6: // 64 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 7: // 128 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 8: // 256 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 9: // 512 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 10: // 1024 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 11: // 2048 + scaled_masked_softmax_warp_forward + <<>>( + dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + default: break; + } + } +} + +template +void dispatch_scaled_masked_softmax_backward(output_t* grad_input, + input_t* grad, + const input_t* output, + const acc_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) +{ + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE 
constexpr value computed inside + // softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside + // softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = batch_count / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 1: // 2 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 2: // 4 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 3: // 8 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 4: // 16 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 5: // 32 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 6: // 64 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 7: // 128 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 8: // 256 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 9: // 512 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 10: // 1024 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 11: // 2048 + scaled_masked_softmax_warp_backward + <<>>( + grad_input, grad, output, scale, batch_count, key_seq_len); + break; + default: break; + } + } +} diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax_cuda.cu new file mode 100644 index 0000000..757850d --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -0,0 +1,109 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads) +{ + return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); +} + +torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) +{ + // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = input.size(0); + const int pad_batches = mask.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); + TORCH_INTERNAL_ASSERT(query_seq_len > 1); + TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); + TORCH_INTERNAL_ASSERT(mask.size(1) == 1); + TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); + TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* mask_ptr = static_cast(mask.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT(input.scalar_type(), + "dispatch_scaled_masked_softmax_forward", + dispatch_scaled_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + reinterpret_cast(mask_ptr), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads, + pad_batches);); + return softmax_results; +} + +torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) +{ + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + // output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = output_grads.size(0); + const int attn_heads = output_grads.size(1); + const int query_seq_len = output_grads.size(2); + const int key_seq_len = output_grads.size(3); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + // Softmax Grad + DISPATCH_HALF_AND_BFLOAT(output_grads_.scalar_type(), + "dispatch_scaled_masked_softmax_backward", + dispatch_scaled_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads);); + + // backward pass is completely in-place + return output_grads; +} +} // namespace scaled_masked_softmax +} // namespace fused_softmax +} // namespace multihead_attn diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp new file mode 100644 index 0000000..945c48c --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp @@ -0,0 +1,70 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor); + +torch::Tensor bwd_cuda(torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +torch::Tensor fwd(torch::Tensor const& input, float scale_factor) +{ + AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return fwd_cuda(input, scale_factor); +} + +torch::Tensor bwd(torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) +{ + AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +} // end namespace scaled_upper_triang_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("forward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + m.def("backward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); +} diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h new file mode 100644 index 0000000..b075719 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -0,0 +1,633 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype* dst, const Datatype* src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16* dst, + const c10::BFloat16* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16* dst, + const c10::BFloat16* src) +{ + *((float2*)dst) = *((float2*)src); +} + +template <> +__device__ __inline__ void copy_vector(c10::Half* dst, const c10::Half* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(c10::Half* dst, const c10::Half* src) +{ + *((float2*)dst) = *((float2*)src); +} + +template <> +__device__ __inline__ void copy_vector(uint8_t* dst, const uint8_t* src) +{ + *dst = *src; +} + +template <> +__device__ __inline__ void copy_vector(uint8_t* dst, const uint8_t* src) +{ + *((half2*)dst) = *((half2*)src); +} + +template +__device__ __inline__ void copy_zero_vector(Datatype* dst); + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16* dst) +{ + *dst = 0.0; +} + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16* dst) +{ + *((float2*)dst) = make_float2(0.0f, 0.0f); +} + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half* dst) +{ + *dst = 0.0; +} + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half* dst) +{ + *((float2*)dst) = make_float2(0.0f, 0.0f); +} + +int log2_ceil(int value) +{ + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } +}; + +template +__device__ __forceinline__ T +WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) +{ + ReduceOp r; +#pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Implicit time (diagonal masking) + */ +template +__global__ void scaled_upper_triang_masked_softmax_warp_forward(output_t* dst, + const input_t* src, + const acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two + : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 
1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1) / WARP_SIZE; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + copy_vector( + temp_data, src + i * element_count * stride + it * WARP_SIZE); + +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if ((element_index + element) < batch_element_count) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } else { +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; +#pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH]{0.0f}; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (it < warp_iteration_limit) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < local_seq) { +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < local_seq) { + out[element] = elements[i][it + element] / sum[i]; + } else { + out[element] = 0; + } + } + copy_vector( + dst + i * element_count * stride + it * WARP_SIZE, out); + } else if (element_index < element_count) { + copy_zero_vector(dst + i * element_count * stride + + it * WARP_SIZE); + } else { + break; + } + } + } +} + +template +__global__ void scaled_upper_triang_masked_softmax_warp_backward(output_t* gradInput, + input_t* grad, + const input_t* output, + acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. 
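+    // For y = softmax(scale * x), each row's gradient is
+    //   dL/dx_i = scale * y_i * (dL/dy_i - sum_j y_j * dL/dy_j),
+    // so the kernel first forms y_j * dL/dy_j per element, reduces the per-row
+    // dot product across the warp, and then writes the scaled difference back out.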
+ constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two + : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f}; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f}; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector( + temp_grad, grad + i * element_count * stride + it * WARP_SIZE); + copy_vector( + temp_output, output + i * element_count * stride + it * WARP_SIZE); + +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + } +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + grad_reg[i][it + element] = + (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + } + + acc_t sum[WARP_BATCH]; +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; +#pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { sum[i] += grad_reg[i][it]; } + } + warp_reduce(sum); + +// store result +#pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; +#pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - + output_reg[i][it + element] * sum[i])); + } + copy_vector( + gradInput + i * element_count * stride + it * WARP_SIZE, out); + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_scaled_upper_triang_masked_softmax_forward(output_t* dst, + const input_t* src, + const input_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int 
seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside + // softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_forward + <<>>( + dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: break; + } + } +} + +template +void dispatch_scaled_upper_triang_masked_softmax_backward(output_t* grad_input, + input_t* grad, + const input_t* output, + const acc_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside + // softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside + // softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_backward + <<>>( + grad_input, + grad, + output, + scale, + batch_count, + softmax_elements_stride, + softmax_elements); + break; + default: break; + } + } +} diff --git a/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu new file mode 100644 index 0000000..7ced78a --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -0,0 +1,91 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_upper_triang_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor) +{ + // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = input.size(0); + const int seq_len = input.size(1); + TORCH_INTERNAL_ASSERT(seq_len <= 2048); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = torch::empty({attn_batches, seq_len, seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_forward", + dispatch_scaled_upper_triang_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + scale_factor, + seq_len, + seq_len, + attn_batches);); + return softmax_results; +} + +torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) +{ + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = output_grads.size(0); + const int seq_len = output_grads.size(1); + TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + // Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_backward", + dispatch_scaled_upper_triang_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + seq_len, + seq_len, + attn_batches);); + + // backward pass is completely in-place + return output_grads; +} +} // namespace scaled_upper_triang_masked_softmax +} // namespace fused_softmax +} // namespace multihead_attn diff --git a/benchmarks/sizing/megatron/fused_kernels/setup.py b/benchmarks/sizing/megatron/fused_kernels/setup.py new file mode 100644 index 0000000..0eb04c7 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/setup.py @@ -0,0 +1,68 @@ +from setuptools import setup, find_packages +from torch.utils import cpp_extension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from pathlib import Path +import subprocess + + +def _get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) + output = raw_output.split() + release_idx = output.index("release") + 1 + release = output[release_idx].split(".") + bare_metal_major = release[0] + bare_metal_minor = release[1][0] + + return raw_output, bare_metal_major, bare_metal_minor + + 
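+# Example: CUDA 11.8's `nvcc -V` prints a line like
+# "Cuda compilation tools, release 11.8, V11.8.89", so the helper above returns
+# bare_metal_major == "11" and bare_metal_minor == "8"; a major version of 11 or
+# newer enables the compute_80/sm_80 gencode flags appended below.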
+srcpath = Path(__file__).parent.absolute() +cc_flag = [] +_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) +if int(bare_metal_major) >= 11: + cc_flag.append("-gencode") + cc_flag.append("arch=compute_80,code=sm_80") + +nvcc_flags = [ + "-O3", + "-gencode", + "arch=compute_70,code=sm_70", + "--use_fast_math", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", +] +cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag} +layernorm_cuda_args = { + "cxx": ["-O3"], + "nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"], +} +setup( + name="fused_kernels", + version="0.0.1", + author="Sid Black & Alejandro Molina et al.", + author_email="alejandro.molina@aleph-alpha.de", + include_package_data=False, + ext_modules=[ + CUDAExtension( + "scaled_upper_triang_masked_softmax_cuda", + [ + str(srcpath / "scaled_upper_triang_masked_softmax.cpp"), + str(srcpath / "scaled_upper_triang_masked_softmax_cuda.cu"), + ], + extra_compile_args=cuda_ext_args, + ), + CUDAExtension( + "scaled_masked_softmax_cuda", + [ + str(srcpath / "scaled_masked_softmax.cpp"), + str(srcpath / "scaled_masked_softmax_cuda.cu"), + ], + extra_compile_args=cuda_ext_args, + ), + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/benchmarks/sizing/megatron/fused_kernels/tests/test_fused_kernels.py b/benchmarks/sizing/megatron/fused_kernels/tests/test_fused_kernels.py new file mode 100644 index 0000000..b85618d --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/tests/test_fused_kernels.py @@ -0,0 +1,296 @@ +import math + +import torch +from torch.nn import LayerNorm + +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.model.gpt2_model import gpt2_attention_mask_func + + +def test_load_fused_kernels(): + try: + import scaled_masked_softmax_cuda + import scaled_upper_triang_masked_softmax_cuda + import torch + + print("[Success] load_fused_kernels") + except ImportError as e: + print("[Fail] load_fused_kernels") + raise e + + +def test_fused_softmax(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + embedding_output = bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + + # (bsz, 1, 1, seq_len) + mask = bert.get_extended_attention_mask( + attention_mask=tokens["attention_mask"].cuda(), + input_shape=tokens["input_ids"].shape, + device=bert.device, + ) + # (bsz, 1, seq_len, seq_len) + mask = mask.repeat(1, 1, mask.size()[-1], 1) + + attention = bert.encoder.layer[0].attention.self + key_layer = attention.transpose_for_scores(attention.key(embedding_output)) + query_layer = attention.transpose_for_scores(attention.query(embedding_output)) + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores /= math.sqrt(key_layer.size()[-1]) + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attention_scores, + (mask != 0), + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attention_scores, + (mask != 0), + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_fused_upper_triangle_mask_softmax(): + gpt = GPT2Model.from_pretrained("gpt2").cuda().half() + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi" # 24 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + attention_mask = tokens["attention_mask"].cuda() + attention_mask = attention_mask.view(attention_mask.size(0), -1) + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - attention_mask) * -10000.0 + attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1) + attn = gpt.h[0] + + hidden_states = gpt.wte(tokens["input_ids"].cuda()) + q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1) + q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim) + k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim) + attn_weights = torch.matmul(q, k.transpose(-1, -2)) + + sq, sk = q.size(-2), k.size(-2) + causal_mask = attn.attn.bias[:, :, sk - sq : sk, :sk].bool() + total_mask = ~(causal_mask & (attention_mask == 0)) + """ + tensor([[[[False, True, True, ..., True, True, True], + [False, False, True, ..., True, True, True], + [False, False, False, ..., True, True, True], + ..., + [False, False, False, ..., False, True, True], + [False, False, False, ..., False, False, True], + [False, False, False, ..., False, False, False]]] + """ + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attn_weights, + total_mask, + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attn_weights, + total_mask, + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_layer_norm(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + # [bsz, seq_len, d_model] + embedding_output = ( + bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + .cuda() + .half() + ) + + fused_layernorm_layer = ( + MixedFusedLayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + torch_layernorm_layer = ( + LayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + fused_output = fused_layernorm_layer(embedding_output) + torch_output = torch_layernorm_layer(embedding_output) + test_result = (fused_output - torch_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_layer_norm" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}" + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_layer_norm" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + + +if __name__ == "__main__": + try: + from transformers import BertTokenizer, GPT2Tokenizer + from transformers.models.bert.modeling_bert import BertModel + from transformers.models.gpt2.modeling_gpt2 import GPT2Model + import transformers + + transformers.logging.set_verbosity( + transformers.logging.FATAL, + ) + + except: + print("\n[Fail] Please install `transformers` package to test fused kernels\n") + exit(-1) + + test_load_fused_kernels() + test_fused_softmax() + test_fused_upper_triangle_mask_softmax() diff --git a/benchmarks/sizing/megatron/fused_kernels/type_shim.h b/benchmarks/sizing/megatron/fused_kernels/type_shim.h new file mode 100644 index 0000000..1a3a309 --- /dev/null +++ b/benchmarks/sizing/megatron/fused_kernels/type_shim.h @@ -0,0 +1,72 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "compat.h" + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ + switch (TYPE) { \ + case at::ScalarType::Half: { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) 
\ + switch (TYPEIN) { \ + case at::ScalarType::Float: { \ + using scalar_t_in = float; \ + switch (TYPEOUT) { \ + case at::ScalarType::Float: { \ + using scalar_t_out = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: { \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ + } \ + break; \ + } \ + case at::ScalarType::Half: { \ + using scalar_t_in = at::Half; \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t_in = at::BFloat16; \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ + } diff --git a/benchmarks/sizing/megatron/gradient_noise_scale/__init__.py b/benchmarks/sizing/megatron/gradient_noise_scale/__init__.py new file mode 100644 index 0000000..6b0d07e --- /dev/null +++ b/benchmarks/sizing/megatron/gradient_noise_scale/__init__.py @@ -0,0 +1 @@ +from .gradient_noise_scale import GradientNoiseScale diff --git a/benchmarks/sizing/megatron/gradient_noise_scale/gradient_noise_scale.py b/benchmarks/sizing/megatron/gradient_noise_scale/gradient_noise_scale.py new file mode 100644 index 0000000..9c518a9 --- /dev/null +++ b/benchmarks/sizing/megatron/gradient_noise_scale/gradient_noise_scale.py @@ -0,0 +1,210 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +def ema(avg, beta, yi, i): + """Exponential moving average""" + if avg is None: + avg = 0 + avg = beta * avg + (1 - beta) * yi + return avg, avg / (1 - beta ** (i + 1)) + + +class GradientNoiseScale: + """ + A class to measure the gradient noise scale of a model while training (cf. https://arxiv.org/abs/1812.06162). + + The core thesis of the paper is that, if our batch size is small, there will be a lot of noise present in the gradients, and we might update our weights only on noise. + After several updates the optimizer may still push us in the right direction, but we would be better off having used a larger batch size, which is more computationally + efficient and directly averages out the noise in the gradients. + + But there's a limit to the gains large batch sizes can give you - if, after a certain batch size, your gradient is already accurate, there's no point in increasing the + batch size further, as we'll just be wasting compute for little to no gain in accuracy. + + This means there is some theoretically optimal batch size for a given model, which measuring the gradient noise scale can help us to estimate. + + To estimate the 'simple' noise scale (Bsimple), we need to have a measure of the gradients using a large batch size (Bbig) and a small + batch size (Bsmall). 
+ + when we have those: + Bsimple ≈ (tr(Σ) / |G|^2) + + tr(Σ) can be approximated by: + tr(Σ) ≈ (1 / ((1/Bsmall) - (1/Bbig))) * (|Gsmall|^2 - |Gbig|^2) + + and |G|^2 by: + |G|^2 ≈ (1 / (Bbig - Bsmall)) * (Bbig*|Gbig|^2 - Bsmall*|Gsmall|^2) + + - With multi-gpu training, we can do this by taking the gradients of the microbatch_size_per_gpu for Bsmall, + and the gradients of the entire batch for Bbig. + - Alternatively, we can just take Bsmall as a single batch, and Bbig as several sequential batches in a row. + This is the option we've opted for in this implementation because a) it's easier to implement and b) also works in + single-gpu environments. Unfortunately it does come with some memory overhead. + """ + + def __init__( + self, + model, + batch_size_small, + n_batches=10, + beta=0.99, + cpu_offload=False, + neox_args=None, + mpu=None, + ): + self.batch_size_small = batch_size_small + self.batch_size_large = batch_size_small * n_batches + self.n_batches = n_batches + self.beta = beta + self.model = model + self.buffer = None + self.ema_scale = None + self.ema_noise = None + self.noise_scale = None + self.n_updates = 0 + self.cpu_offload = cpu_offload + self.model.store_gradients = True + self.model.store_gradients_cpu = cpu_offload + self.neox_args = neox_args + self.mpu = mpu + + def flatten_grads(self): + grads = [] + assert hasattr( + self.model, "stored_gradients" + ), "You might need to update DeeperSpeed" + if self.model.stored_gradients is not None: + for g in self.model.stored_gradients: + if g is not None and not g.isnan().any() and not g.isinf().any(): + g = g.flatten().view(-1, 1) + if self.cpu_offload: + g = g.cpu() + grads.append(g) + else: + return None + if not grads: + return None + return torch.cat(grads) + + def _sync_overflow(self, is_overflow): + if self.neox_args.is_pipe_parallel: + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the pipe parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([is_overflow]) + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.mpu.get_pipe_parallel_group(), + ) + overflow = overflow_gpu[0].item() + else: + overflow = is_overflow + return overflow + + def _update(self): + + grad = self.flatten_grads() + is_overflow = self._sync_overflow(grad is None) + if is_overflow: + return + if self.buffer is None: + self.buffer = grad + else: + self.buffer += grad + if self.n_updates % self.n_batches == self.n_batches - 1: + # average grads every n_batches iteration to get a simulation of Bbig + self.buffer /= self.n_batches + grads = self.buffer + self.buffer = None + + # calculate Gbig and Gsmall + # this needs to be done in fp32 or it overflows + if self.neox_args.is_pipe_parallel: + + g_big = torch.square(torch.norm(grads.to(torch.float))) + g_small = torch.square(torch.norm(grad.to(torch.float))) + + # we need to put the tensors back on gpu to do the allreduce + if self.cpu_offload: + g_big = g_big.to(self.model.device) + g_small = g_small.to(self.model.device) + + # avg g_big / g_small across pipe parallel groups + torch.distributed.all_reduce( + g_big, + op=torch.distributed.ReduceOp.SUM, + group=self.mpu.get_pipe_parallel_group(), + ) + torch.distributed.all_reduce( + g_small, + op=torch.distributed.ReduceOp.SUM, + group=self.mpu.get_pipe_parallel_group(), + ) + g_big /= self.mpu.get_pipe_parallel_world_size() + g_small /= self.mpu.get_pipe_parallel_world_size() + + else: + g_big = torch.square(torch.norm(grads.to(torch.float))) + 
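+                # `grads` is the gradient averaged over n_batches steps (the B_big
+                # measurement) and `grad` is the current single-step gradient (the
+                # B_small measurement); their squared norms (g_big above, g_small
+                # below) feed the |G|^2 and tr(Sigma) estimators from the class
+                # docstring, computed further down as `noise` and `scale`.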
g_small = torch.square(torch.norm(grad.to(torch.float))) + + # communicate any overflows + is_overflow = ( + g_small.isinf().any() + or g_small.isnan().any() + or g_big.isinf().any() + or g_big.isnan().any() + ) + is_overflow = self._sync_overflow(is_overflow) + if is_overflow: + return + + # calculate noise / scale + noise = ( + 1 + / (self.batch_size_large - self.batch_size_small) + * (self.batch_size_large * g_big - self.batch_size_small * g_small) + ) + scale = ( + 1 + / (1 / self.batch_size_small - 1 / self.batch_size_large) + * (g_small - g_big) + ) + + # calculate running average + self.ema_noise, noise = ema( + self.ema_noise, self.beta, noise, self.n_updates + ) + self.ema_scale, scale = ema( + self.ema_scale, self.beta, scale, self.n_updates + ) + + # calculate noise scale + scale = scale.item() + noise = noise.item() + self.noise_scale = scale / noise + + self.n_updates += 1 + + def update(self): + if self.neox_args.is_pipe_parallel: + # update on all ranks + self._update() + else: + # for mp / dp only, the grads will be the same across all ranks, so we can just do the process on a single rank + if torch.distributed.get_rank() == 0: + # only update on 0th rank + self._update() + torch.distributed.barrier() diff --git a/benchmarks/sizing/megatron/initialize.py b/benchmarks/sizing/megatron/initialize.py new file mode 100644 index 0000000..bc40326 --- /dev/null +++ b/benchmarks/sizing/megatron/initialize.py @@ -0,0 +1,233 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Megatron initialization.""" + +import random +import os + +import numpy as np +import torch + +from megatron import fused_kernels +from megatron import mpu +from megatron.mpu import set_model_parallel_rank, set_model_parallel_world_size + +import deepspeed +import inspect + + +def initialize_megatron(neox_args, allow_no_cuda=False): + """Set initialize distributed and set autoresume and random seeds. + `allow_no_cuda` should not be set unless using megatron for cpu only + data processing. In general this arg should not be set unless you know + what you are doing. + Returns a function to finalize distributed env initialization + (optionally, only when args.lazy_mpu_init == True) + """ + if not allow_no_cuda: + # Make sure cuda is available. + assert torch.cuda.is_available(), "Megatron requires CUDA." + + # torch.distributed initialization + def finish_mpu_init(): + # Pytorch distributed. + _initialize_distributed(neox_args=neox_args) + + # Random seeds for reproducibility. 
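+        # _set_random_seed (defined below) seeds Python's `random`, NumPy and torch,
+        # and, when CUDA devices are present, also calls
+        # mpu.model_parallel_cuda_manual_seed(seed) to initialize the model-parallel
+        # CUDA RNG tracker.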
+ if neox_args.rank == 0: + print("> setting random seeds to {} ...".format(neox_args.seed)) + _set_random_seed(neox_args.seed) + + # check fused kernels are installed: + if ( + neox_args.scaled_upper_triang_masked_softmax_fusion + or neox_args.scaled_masked_softmax_fusion + ): + fused_kernels.load_fused_kernels() + + if neox_args.lazy_mpu_init: + neox_args.use_cpu_initialization = True + # delayed initialization of DDP-related stuff + # We only set basic DDP globals + set_model_parallel_world_size(neox_args.model_parallel_size) + # and return function for external DDP manager to call when it has DDP initialized + set_model_parallel_rank(neox_args.rank) + return finish_mpu_init + else: + # Megatron's MPU is the master. Complete initialization right away. + finish_mpu_init() + + # Compile dataset C++ code. + if neox_args.local_rank == 0: + from megatron.data.data_utils import compile_helper + + compile_helper() + + # Write arguments to tensorboard. + _write_args_to_tensorboard(neox_args=neox_args) + # No continuation function + return None + + +def setup_deepspeed_random_and_activation_checkpointing(neox_args): + """Optional DeepSpeed Activation Checkpointing features. + Gives access to partition activations, contiguous memory optimizations + and cpu checkpointing. + + Activation checkpoint requires keep track of the random states + and setting the random seed for each MP process. Megatron uses + mpu.get_cuda_rng_tracker and mpu.model_parallel_cuda_manual_seed + for keeping track of the random states and setting the random seeds. + Since they are used in places outside of activation checkpointing, + we overwrite them to maintain consistency. + + This must be called before all the calls to mpu.model_parallel_cuda_manual_seed + """ + num_layers = neox_args.num_layers // neox_args.checkpoint_num_layers + num_layers = ( + num_layers + if neox_args.num_layers % neox_args.checkpoint_num_layers == 0 + else num_layers + 1 + ) + + deepspeed.checkpointing.configure( + mpu, + partition_activations=neox_args.partition_activations, + contiguous_checkpointing=neox_args.contiguous_checkpointing, + num_checkpoints=num_layers, + checkpoint_in_cpu=neox_args.checkpoint_in_cpu, + synchronize=neox_args.synchronize_each_layer, + profile=neox_args.profile_backward, + ) + + +def _initialize_distributed(neox_args): + """Initialize torch.distributed and mpu.""" + + device_count = torch.cuda.device_count() + if torch.distributed.is_initialized(): + + if neox_args.rank == 0: + print( + "torch distributed is already initialized, " + "skipping initialization ...", + flush=True, + ) + neox_args.rank = torch.distributed.get_rank() + neox_args.world_size = torch.distributed.get_world_size() + + else: + + if neox_args.rank == 0: + print("> initializing torch distributed ...", flush=True) + # Manually set the device ids. + if device_count > 0: + device = neox_args.rank % device_count + if neox_args.local_rank is not None: + assert ( + neox_args.local_rank == device + ), "expected local-rank to be the same as rank % device-count." + else: + neox_args.local_rank = device + torch.cuda.set_device(device) + + deepspeed.init_distributed( + dist_backend=neox_args.distributed_backend, + auto_mpi_discovery=True, + distributed_port=os.getenv("MASTER_PORT", "6000"), + verbose=True, + ) + + # Setup 3D topology. 
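+    # The world size must factor as pp * mp * dp. For example, with 16 ranks,
+    # pipe_parallel_size=2 and model_parallel_size=4, the data-parallel degree
+    # computed below is dp = 16 / (2 * 4) = 2.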
+ pp = neox_args.pipe_parallel_size if neox_args.pipe_parallel_size >= 1 else 1 + mp = neox_args.model_parallel_size if neox_args.model_parallel_size >= 1 else 1 + assert ( + neox_args.world_size % (pp * mp) == 0 + ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}" + dp = neox_args.world_size // (pp * mp) + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + + # this does pipe on the most outside, then data, then model. + # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order. + topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) + + # Offset base seeds for the interior pipeline stages. + # TODO: adjust last stage too once IO is improved. + stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe + if 0 < stage_id < topo.get_dim("pipe") - 1: + offset = neox_args.seed + 1138 + neox_args.seed = offset + (stage_id * mp) + + # Set the model-parallel / data-parallel communicators. + if device_count > 0: + if mpu.model_parallel_is_initialized(): + print( + "_initialize_distributed() model parallel is already initialized", + flush=True, + ) + else: + mpu.initialize_model_parallel( + neox_args.model_parallel_size, + topology=topo, + fp32_allreduce=neox_args.fp32_allreduce, + ) + + # Init DeepSpeed Activation Checkpointing Features + setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args) + + +def _init_autoresume(neox_args): + """Set autoresume start time.""" + + if neox_args.adlr_autoresume: + print_rank_0("> enabling autoresume ...") + sys.path.append(os.environ.get("SUBMIT_SCRIPTS", ".")) + try: + from userlib.auto_resume import AutoResume + except BaseException: + print("> ADLR autoresume is not available, exiting ...", flush=True) + sys.exit() + neox_args.adlr_autoresume_object = AutoResume + + if neox_args.adlr_autoresume_object: + torch.distributed.barrier() + neox_args.adlr_autoresume_object.init() + torch.distributed.barrier() + + +def _set_random_seed(seed): + """Set random seed for reproducibility.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + mpu.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError("Seed ({}) should be a positive integer.".format(seed)) + + +def _write_args_to_tensorboard(neox_args): + + """Write arguments to tensorboard.""" + if neox_args.tensorboard_writer: + for arg_name in vars(neox_args): + neox_args.tensorboard_writer.add_text( + arg_name, str(getattr(neox_args, arg_name)) + ) diff --git a/benchmarks/sizing/megatron/learning_rates.py b/benchmarks/sizing/megatron/learning_rates.py new file mode 100644 index 0000000..943efdf --- /dev/null +++ b/benchmarks/sizing/megatron/learning_rates.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Learning rate decay functions.""" + +import math + +from megatron import print_rank_0 + + +class AnnealingLR(object): + """Anneals the learning rate.""" + + def __init__( + self, + optimizer, + start_lr, + warmup_iter, + total_iters, + decay_style, + last_iter, + min_lr=0.0, + use_checkpoint_lr_scheduler=True, + override_lr_scheduler=False, + use_mup=False, + ): + + # Class values. + self.optimizer = optimizer + self.start_lr = start_lr + self.min_lr = min_lr + self.warmup_iter = warmup_iter + self.num_iters = last_iter + self.end_iter = total_iters + assert self.end_iter > 0 + self.decay_style = decay_style + self.override_lr_scheduler = override_lr_scheduler + self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler + self.use_mup = use_mup + if self.override_lr_scheduler: + assert not self.use_checkpoint_lr_scheduler, ( + "both override and " "use-checkpoint are set." + ) + # Set the learning rate + self.step(self.num_iters) + + print_rank_0("> learning rate decay style: {}".format(self.decay_style)) + + def get_lr(self): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) + # Warmup. + if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: + return float(self.start_lr) * num_iters_ / self.warmup_iter + + num_iters_ = num_iters_ - self.warmup_iter + if self.decay_style == "linear": + lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter + elif self.decay_style == "cosine": + lr = ( + self.start_lr + / 2.0 + * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) + ) + elif self.decay_style == "exponential": + # exp(-0.693) = 1/2 + lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) + else: + lr = self.start_lr + return max(lr, self.min_lr) + + def step(self, step_num=None): + """Set lr for all parameters groups.""" + if step_num is None: + step_num = self.num_iters + 1 + self.num_iters = step_num + new_lr = self.get_lr() + for group in self.optimizer.param_groups: + if self.use_mup and "width_mult" in group: + group["lr"] = new_lr / group["width_mult"] + else: + group["lr"] = new_lr + + def state_dict(self): + state_dict = { + "start_lr": self.start_lr, + "warmup_iter": self.warmup_iter, + "num_iters": self.num_iters, + "decay_style": self.decay_style, + "end_iter": self.end_iter, + "min_lr": self.min_lr, + } + return state_dict + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_lr_scheduler: + print_rank_0(" > overriding {} value to {}".format(name, cls_value)) + return cls_value + + if not self.use_checkpoint_lr_scheduler: + assert cls_value == sd_value, ( + "AnnealingLR: class input value" + "and checkpoint values for {} do not match".format(name) + ) + print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name)) + return sd_value + + def load_state_dict(self, sd): + + self.start_lr = self._check_and_set( + self.start_lr, sd["start_lr"], "learning rate" + ) + self.min_lr = self._check_and_set( + self.min_lr, sd["min_lr"], "minimum learning rate" + ) + self.warmup_iter = self._check_and_set( + self.warmup_iter, sd["warmup_iter"], "warmup iterations" + ) + self.end_iter = self._check_and_set( + self.end_iter, sd["end_iter"], "total number of iterations" + ) + self.decay_style = self._check_and_set( + 
self.decay_style, sd["decay_style"], "decay style" + ) + + self.num_iters = sd["num_iters"] + self.step(self.num_iters) diff --git a/benchmarks/sizing/megatron/logging.py b/benchmarks/sizing/megatron/logging.py new file mode 100644 index 0000000..174ece5 --- /dev/null +++ b/benchmarks/sizing/megatron/logging.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021, EleutherAI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import torch + +try: + import wandb +except ModuleNotFoundError: + pass + +from megatron import mpu, print_rank_0 +from megatron.utils import report_memory + + +class Tee: + """Duplicate output to both stdout/err and file""" + + def __init__(self, file, err: bool = False) -> None: + self.file = open(file, "w") + self.err = err + if not err: + self.std = sys.stdout + sys.stdout = self + else: + self.std = sys.stderr + sys.stderr = self + + def __del__(self) -> None: + if not self.err: + sys.stdout = self.std + else: + sys.stderr = self.std + self.file.close() + + def write(self, data) -> None: + try: + self.file.write(data) + except OSError: + pass + try: + self.std.write(data) + except OSError: + pass + + def flush(self) -> None: + try: + self.file.flush() + except OSError: + pass + + +def human_readable_flops(num) -> str: + for unit in [ + "", + "KFLOPS", + "MFLOPS", + "GFLOPS", + "TFLOPS", + "PFLOPS", + "EFLOPS", + "ZFLOPS", + ]: + if abs(num) < 1000.0: + return "%3.1f%s" % (num, unit) + num /= 1000.0 + return "%.1f%s" % (num, "Yi") + + +def get_flops(neox_args, model, iter_time_s) -> float: + world_size = torch.distributed.get_world_size() + ff = model.total_params * 6 + attn = neox_args.seq_length * neox_args.hidden_size * neox_args.num_layers * 60 + flops = ( + neox_args.train_batch_size + * neox_args.seq_length + * (ff + attn) + / (iter_time_s * world_size) + ) + return flops + + +def training_log( + neox_args, + timers, + loss_dict, + total_loss_dict, + learning_rate, + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + model, + optimizer, + noise_scale_logger, +): + """Log training information such as losses, timing, etc.""" + + # Update losses. + skipped_iters_key = "skipped iterations" + total_loss_dict[skipped_iters_key] = ( + total_loss_dict.get(skipped_iters_key, 0) + skipped_iter + ) + got_nan_key = "got nan" + + got_nan = False + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = total_loss_dict.get(key, 0.0) + loss_dict[key] + else: + value = loss_dict[key].float().sum().item() + is_nan = value == float("inf") or value == -float("inf") or value != value + got_nan = got_nan or is_nan + + total_loss_dict[got_nan_key] = total_loss_dict.get(got_nan_key, 0) + int(got_nan) + + # Logging. 
+ timers_to_log = [] + + def add_to_logging(name): + if name in timers.timers: + timers_to_log.append(name) + + if not neox_args.is_pipe_parallel: + add_to_logging("forward") + add_to_logging("backward") + add_to_logging("backward-backward") + add_to_logging("backward-allreduce") + add_to_logging("backward-master-grad") + add_to_logging("backward-clip-grad") + add_to_logging("optimizer") + add_to_logging("batch generator") + + # Log timer info to tensorboard and wandb + normalizer = iteration % neox_args.log_interval + if normalizer == 0: + normalizer = neox_args.log_interval + if torch.distributed.get_rank() == 0: + timers.write( + names=timers_to_log, iteration=iteration, normalizer=normalizer + ) + else: + # with pipeline parallel, the megatron timers are overridden by the deepspeed ones. + # Try to grab timer values from model engine. Only recently added to deeperspeed, so check that the engine + # has that attribute first + if hasattr(model, "timer_values") and model.timer_values is not None: + if ( + model.wall_clock_breakdown() + and model.global_steps % model.steps_per_print() == 0 + ): + timer_values = model.timer_values + # deepspeed already logs to tensorboard / prints values, so just log to wandb + if neox_args.use_wandb and torch.distributed.get_rank() == 0: + for key in timer_values: + tb_wandb_log( + f"timers/{key}", + timer_values[key], + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + # write losses, lr, etc. every step + tb_wandb_log( + "train/learning_rate", + learning_rate, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + for key in loss_dict: + tb_wandb_log( + f'train/{key.replace(" ", "_")}', + loss_dict[key], + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + if neox_args.fp16: + tb_wandb_log( + f"train/loss_scale", + loss_scale, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + # log gradient noise scale + if neox_args.log_gradient_noise_scale: + if noise_scale_logger.noise_scale is not None: + tb_wandb_log( + f"train/noise_scale", + noise_scale_logger.noise_scale, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + # (optional) Log optimizer states to wandb / tb every step + if neox_args.log_optimizer_states: + for k, v in optimizer.state_dict()["optimizer_state_dict"]["state"].items(): + for ki, vi in v.items(): # step, module + if ki != "step": + opt_state_norm = torch.norm(vi) if hasattr(vi, "dim") else vi + tb_wandb_log( + f"optimizer_state_norms/{k}_{ki}", + opt_state_norm, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + # (optional) Log grad/param norms to wandb / tb every step + if ( + neox_args.log_grad_pct_zeros + or neox_args.log_grad_norm + or neox_args.log_param_norm + ): + if neox_args.log_grad_pct_zeros or neox_args.log_grad_norm: + model.store_gradients = True # start storing gradients + + for i, (name, param) in enumerate(model.module.named_parameters()): + if neox_args.log_grad_pct_zeros: + if ( + hasattr(model, "stored_gradients") + and model.stored_gradients is not None + ): + grad = model.stored_gradients[i] + if grad is not None: + tb_wandb_log( + f"pct_grad_zeros/{name}", + (grad == 0).float().mean().item() * 100, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + 
all_ranks=True, + ) + if neox_args.log_grad_norm: + if ( + hasattr(model, "stored_gradients") + and model.stored_gradients is not None + ): + grad = model.stored_gradients[i] + if grad is not None: + tb_wandb_log( + f"gradient_norms/{name}", + torch.norm(grad), + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + all_ranks=True, + ) + if neox_args.log_param_norm: + tb_wandb_log( + f"parameter_norms/{name}", + torch.norm(param), + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + all_ranks=True, + ) + + if iteration % neox_args.log_interval == 0: + # log other stuff every neox_args.log_interval iters + elapsed_time = timers("interval time").elapsed() + iteration_time = elapsed_time / neox_args.log_interval + samples_per_sec = neox_args.train_batch_size / iteration_time + log_string = " samples/sec: {:.3f} |".format(samples_per_sec) + tb_wandb_log( + "runtime/samples_per_sec", + samples_per_sec, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + tb_wandb_log( + "runtime/iteration_time", + iteration_time, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + log_string += " iteration {:8d}/{:8d} |".format( + iteration, neox_args.train_iters + ) + log_string += " elapsed time per iteration (ms): {:.1f} |".format( + elapsed_time * 1000.0 / neox_args.log_interval + ) + log_string += " learning rate: {:.3E} |".format(learning_rate) + num_iterations = max( + 1, neox_args.log_interval - total_loss_dict[skipped_iters_key] + ) + + # log curriculum learning + if neox_args.curriculum_learning: + tb_wandb_log( + "curriculum_seqlen", + neox_args.curriculum_seqlen, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + # log tflop / gpu + flops_per_s_per_gpu = get_flops( + neox_args=neox_args, model=model, iter_time_s=iteration_time + ) + log_string += ( + f" approx flops per GPU: {human_readable_flops(flops_per_s_per_gpu)} |" + ) + tb_wandb_log( + "runtime/flops_per_sec_per_gpu", + flops_per_s_per_gpu, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + for key in total_loss_dict: + if key not in [skipped_iters_key, got_nan_key]: + v = ( + total_loss_dict[key].item() + if hasattr(total_loss_dict[key], "item") + else total_loss_dict[key] + ) + avg = v / float(num_iterations) + log_string += " {}: {:.6E} |".format(key, avg) + total_loss_dict[key] = 0.0 + if neox_args.precision == "fp16": + log_string += " loss scale: {:.1f} |".format(loss_scale) + log_string += " number of skipped iterations: {:3d} |".format( + total_loss_dict[skipped_iters_key] + ) + log_string += " number of nan iterations: {:3d} |".format( + total_loss_dict[got_nan_key] + ) + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[got_nan_key] = 0 + print_rank_0(log_string) + if report_memory_flag: + report_memory("after {} iterations".format(iteration)) + report_memory_flag = False + + timers.log(timers_to_log, normalizer=neox_args.log_interval) + + return report_memory_flag + + +def tb_wandb_log( + key: str, + value: float, + iteration_no: int, + use_wandb: bool, + tensorboard_writer=None, + all_ranks: bool = False, +): + # logs to both tb and wandb (if present) from the zeroth rank + do_log = torch.distributed.get_rank() == 0 or all_ranks + if do_log and value is not None: + if tensorboard_writer: + tensorboard_writer.add_scalar(key, value, 
iteration_no) + if use_wandb: + wandb.log({key: value}, step=iteration_no) diff --git a/benchmarks/sizing/megatron/model/__init__.py b/benchmarks/sizing/megatron/model/__init__.py new file mode 100755 index 0000000..25dfab6 --- /dev/null +++ b/benchmarks/sizing/megatron/model/__init__.py @@ -0,0 +1,22 @@ +# +# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .norms import LayerNorm + +from .gpt2_model import GPT2ModelPipe +from .utils import get_params_for_weight_decay_optimization +from .word_embeddings import SoftEmbedding diff --git a/benchmarks/sizing/megatron/model/activations.py b/benchmarks/sizing/megatron/model/activations.py new file mode 100644 index 0000000..5c4ba1d --- /dev/null +++ b/benchmarks/sizing/megatron/model/activations.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
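+#
+# This module selects the activation used by the transformer MLP blocks:
+# get_activation() maps neox_args.activation ("gelu", "geglu", "relu",
+# "softsign", "swish", "mish", "silu") to a callable, with an optional fused
+# bias+GeLU path (tanh approximation) and an ONNX-safe erf-based GeLU.
+#
+# Minimal usage sketch (assuming a neox_args namespace carrying the fields read
+# below, e.g. activation="gelu", onnx_safe=False, bias_gelu_fusion=False):
+#
+#   act_fn = get_activation(neox_args)
+#   y = act_fn(torch.randn(8, 4096))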
+ +import torch +import torch.nn.functional as F + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + +def get_activation(neox_args): + """retrieves the activation function specified in neox_args""" + if neox_args.activation == "geglu": + activation_func = GEGLU(neox_args=neox_args) + elif neox_args.activation == "gelu": + if neox_args.onnx_safe and neox_args.bias_gelu_fusion: + raise ValueError("onnx_safe + bias_gelu_fusion not compatible") + if neox_args.onnx_safe: + activation_func = erf_gelu + elif neox_args.bias_gelu_fusion: + activation_func = bias_gelu_impl + else: + activation_func = F.gelu + elif neox_args.activation == "relu": + activation_func = F.relu + elif neox_args.activation == "softsign": + activation_func = F.softsign + elif neox_args.activation == "swish": + activation_func = swish + elif neox_args.activation == "mish": + activation_func = mish + elif neox_args.activation == "silu": + activation_func = F.silu + else: + raise ValueError(f"Activation function {neox_args.activation} not recognized") + return activation_func + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ( + (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x) + ) + 0.5 * (1 + tanh_out) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + + +bias_gelu_impl = GeLUFunction.apply + + +# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@torch.jit.script +def erf_gelu(x): + return ( + x + * 0.5 + * ( + torch.erf(x / 1.41421).to(dtype=x.dtype) + + torch.ones_like(x).to(dtype=x.dtype) + ) + ) + + +@torch.jit.script +def swish(x, beta: float = 1.0): + return x * torch.sigmoid(beta * x) + + +@torch.jit.script +def mish(x): + return x * torch.tanh(F.softplus(x)) + + +class GEGLU(torch.nn.Module): + def __init__(self, neox_args): + super(GEGLU, self).__init__() + if neox_args.onnx_safe: + self.activation_func = erf_gelu + else: + self.activation_func = F.gelu + + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x diff --git a/benchmarks/sizing/megatron/model/flash_attention.py b/benchmarks/sizing/megatron/model/flash_attention.py new file mode 100644 index 0000000..7a6b7c6 --- /dev/null +++ 
b/benchmarks/sizing/megatron/model/flash_attention.py @@ -0,0 +1,788 @@ +# Based on: https://github.com/HazyResearch/flash-attention/blob/4a6eaa9f27df6fff7ffb2c24e894938a687dd870/flash_attn/flash_attn_interface.py + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from flash_attn import flash_attn_triton +import flash_attn_2_cuda as flash_attn_cuda # For flash_attn version 2.1.1 + + +def flash_attn_unpadded_unpacked_func_triton( + q, k, v, bias=None, causal=False, softmax_scale=None +): + return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) + + +def _flash_attn_forward_cuda( + q, + k, + v, + out, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + num_splits=0, + generator=None, +): + """ + num_splits: how much to parallelize over the seqlen_q dimension. num_splits=0 means + it will be set by an internal heuristic. We're exposing num_splits mostly for benchmarking. + Don't change it unless you know what you're doing. + """ + softmax_lse, *rest = flash_attn_cuda.fwd( + q, + k, + v, + out, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + return_softmax, + num_splits, + generator, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + S_dmask = rest[0] if return_softmax else None + return out, softmax_lse, S_dmask + + +def _flash_attn_backward_cuda( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + num_splits=0, + generator=None, +): + """ + num_splits: whether to parallelize over the seqlen_k dimension (num_splits > 1) or + not (num_splits = 1). num_splits=0 means it will be set by an internal heuristic. + Any value above 1 will call the same kernel (i.e. num_splits=2 would call the same kernel + as num_splits=3), so effectively the choices are 0, 1, and 2. + This hyperparameter can be tuned for performance, but default value (heuristic) should work fine. 
+ """ + _, _, _, softmax_d = flash_attn_cuda.bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + num_splits, + generator, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + torch.empty_like(qkv[:, 0]), + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward(qkv, out, softmax_lse, cu_seqlens, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen = max_seqlen + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + qkv, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dqkv = torch.empty_like(qkv) + _flash_attn_backward_cuda( + dout, + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + out, + softmax_lse, + dqkv[:, 0], + dqkv[:, 1], + dqkv[:, 2], + cu_seqlens, + cu_seqlens, + ctx.max_seqlen, + ctx.max_seqlen, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None + + +def flash_attn_unpadded_qkvpacked_func_cuda( + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + return FlashAttnQKVPackedFunc.apply( + qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs + ) + + +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + kv[:, 0], + kv[:, 1], + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, kv, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + kv, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + 
dq = torch.empty_like(q) + dkv = torch.empty_like(kv) + _flash_attn_backward_cuda( + dout, + q, + kv[:, 0], + kv[:, 1], + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dkv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_kvpacked_func_cuda( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
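+    Example (illustrative sketch only; assumes fp16 CUDA tensors in the packed
+    layout described above, with batch_size=2 and seqlen=128):
+        q  = torch.randn(256, 16, 64, dtype=torch.float16, device="cuda")
+        kv = torch.randn(256, 2, 16, 64, dtype=torch.float16, device="cuda")
+        cu = torch.arange(0, 384, 128, dtype=torch.int32, device="cuda")
+        out = flash_attn_unpadded_kvpacked_func_cuda(
+            q, kv, cu, cu, 128, 128, dropout_p=0.0, causal=True
+        )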
+ """ + return FlashAttnKVPackedFunc.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + k, + v, + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward_cuda( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dk, dv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_func_cuda( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). 
+ S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) + + +# For flash-attention 2 integration +def _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd( + q, + k, + v, + None, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + -1, + -1, + return_softmax, + None, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state + + +def _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + rng_state=None, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + dq, dk, dv, softmax_d, = flash_attn_cuda.varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + None, + rng_state, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_softmax): + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax and dropout_p > 0, + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen = max_seqlen + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors + qkv_shape = q.shape[:-2] + (3, *q.shape[-2:]) + dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dqkv[:, 0], + dqkv[:, 1], + dqkv[:, 2], + cu_seqlens, + cu_seqlens, + ctx.max_seqlen, + ctx.max_seqlen, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + rng_state=rng_state, + ) + dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension + return dqkv, None, None, None, None, None, None + + +def flash_attn_varlen_qkvpacked_func( + qkv, + cu_seqlens, + max_seqlen, + dropout_p=0.0, 
+ softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If Q, K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of Q, K, V. + For multi-query and grouped-query attention (MQA/GQA), please see + flash_attn_varlen_kvpacked_func and flash_attn_varlen_func. + + Arguments: + qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch. + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into qkv. + max_seqlen: int. Maximum sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnVarlenQKVPackedFunc.apply( + qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs + ) + + +class FlashAttnVarlenKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + q, + kv[:, 0], + kv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax and dropout_p > 0, + ) + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors + dq = torch.empty_like(q) + kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) + dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + rng_state=rng_state, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dkv = dkv[..., : dout.shape[-1]] + return dq, dkv, None, None, None, None, None, None, None, None + + +def flash_attn_varlen_kvpacked_func( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, 
+ max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of K, V. + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnVarlenKVPackedFunc.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) diff --git a/benchmarks/sizing/megatron/model/fused_bias_dropout.py b/benchmarks/sizing/megatron/model/fused_bias_dropout.py new file mode 100644 index 0000000..afe1f07 --- /dev/null +++ b/benchmarks/sizing/megatron/model/fused_bias_dropout.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021, EleutherAI contributors +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F +from typing import Optional +from torch import Tensor + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + +def bias_dropout_add( + x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool +) -> Tensor: + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + if residual is not None: + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train( + x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float +) -> Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference( + x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float +) -> Tensor: + return bias_dropout_add(x, bias, residual, prob, False) diff --git a/benchmarks/sizing/megatron/model/fused_softmax.py b/benchmarks/sizing/megatron/model/fused_softmax.py new file mode 100644 index 0000000..78f2992 --- /dev/null +++ b/benchmarks/sizing/megatron/model/fused_softmax.py @@ -0,0 +1,205 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import enum +from ..fused_kernels import load_fused_kernels + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. 
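+    The kernel expects a 3D input of shape (attn_batches, sq, sk) with sq == sk;
+    FusedScaleMaskSoftmax.forward_fused_softmax reshapes 4D attention scores
+    into this layout before calling it.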
+ """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class SoftmaxFusionTypes(enum.Enum): + upper_triang = 1 # causal mask + general = 2 # general mask + none = 3 # no fusion + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + fusion_type: type of fusion to perform, should be either upper_triang, general or none. None will perform a regular torch softmax. + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. 
+ + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + fusion_type, + mask_func, + softmax_in_fp32, + scale, + ): + super().__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + + assert fusion_type in [ + SoftmaxFusionTypes.upper_triang, + SoftmaxFusionTypes.general, + SoftmaxFusionTypes.none, + ], f"Invalid fusion type {fusion_type}" + + if fusion_type != SoftmaxFusionTypes.none: + load_fused_kernels() # check fused kernels are installed + + self.upper_triang_mask_fusion = fusion_type == SoftmaxFusionTypes.upper_triang + self.general_mask_fusion = fusion_type == SoftmaxFusionTypes.general + self.fusion = fusion_type != SoftmaxFusionTypes.none + + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.fusion # user wants to fuse + and self.input_in_float16 # input must be fp16 + and mask is not None # mask tensor must not be None + and 16 < sk <= 2048 # sk must be 16 ~ 2048 + and sq % 4 == 0 # sq must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 2048: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.upper_triang_mask_fusion: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + if self.upper_triang_mask_fusion: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + return ScaledMaskedSoftmax.apply(input, mask, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/benchmarks/sizing/megatron/model/gmlp.py b/benchmarks/sizing/megatron/model/gmlp.py new file mode 100644 index 0000000..e4e29da --- /dev/null +++ b/benchmarks/sizing/megatron/model/gmlp.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.model.activations import get_activation +from megatron.model.norms import get_norm +from megatron.model.utils import get_fusion_type + +from megatron import mpu + + +class TinyAttention(nn.Module): + def __init__(self, neox_args, d_attn, d_ff, mask_fn): + super().__init__() + self.proj_qkv = nn.Linear(d_ff * 2, 3 * d_attn) + self.scale = d_attn**-0.5 + self.proj_ffn = nn.Linear(d_attn, d_ff) + self.softmax = FusedScaleMaskSoftmax( + input_in_fp16=neox_args.precision == "fp16", + input_in_bf16=neox_args.precision == "bfloat16", + fusion_type=get_fusion_type(neox_args), + mask_func=mask_fn, + softmax_in_fp32=neox_args.attention_softmax_in_fp32, + scale=None, + ) + + def forward(self, x, attention_mask): + q, k, v = torch.chunk(self.proj_qkv(x), 3, dim=-1) + w = torch.einsum("bnd,bmd->bnm", q, k).unsqueeze(1) * self.scale + a = self.softmax( + w, mask=attention_mask[..., : w.size(-2), : w.size(-1)] + ).squeeze(1) + x = torch.einsum("bnm,bmd->bnd", a, v) + return self.proj_ffn(x) + + +class SpatialGatingUnit(nn.Module): + def __init__(self, neox_args, d_ff, d_attn=None, causal=True, mask_fn=None): + super().__init__() + self.causal = causal + self.use_attn = d_attn is not None + + norm, eps = get_norm(neox_args) + self.norm = norm(d_ff, eps=eps) + self.proj = nn.Linear(neox_args.seq_length, neox_args.seq_length) + if self.use_attn: + assert mask_fn is not None + self.attn = TinyAttention( + neox_args=neox_args, d_attn=d_attn, d_ff=d_ff, mask_fn=mask_fn + ) + nn.init.zeros_(self.proj.weight) + nn.init.constant_(self.proj.bias, 1.0) + + def forward(self, x, attention_mask): + device, n = x.device, x.shape[1] + x = x.transpose(0, 1) # [s, b, d] -> [b, s, d] + + res, gate = x.chunk(2, dim=-1) # split along dim + gate = self.norm(gate) + + weight, bias = self.proj.weight, self.proj.bias + if self.causal: + weight, bias = weight[:n, :n], bias[:n] + mask = torch.ones(weight.shape[:2], device=device).triu_(1).bool() + weight = weight.masked_fill(mask, 0.0) + + gate = F.linear(gate.transpose(2, 1), weight, self.proj.bias).transpose(2, 1) + + if self.use_attn: + gate = gate + self.attn(x, attention_mask) + + return (gate * res).transpose(0, 1) # [b, s, d] -> [s, b, d] + + +class GMLPBlock(nn.Module): + def __init__( + self, + neox_args, + init_method, + output_layer_init_method, + layer_number, + ff_mult=4, + mask_fn=None, + ): + super().__init__() + self.layer_number = layer_number + + ff_dim = neox_args.hidden_size * ff_mult + norm, eps = get_norm(neox_args) + self.norm = norm(neox_args.hidden_size, eps=eps) + self.input_linear = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=ff_dim * 2, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + ) + self.activation_func = get_activation(neox_args) + ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) + if neox_args.attention_config[layer_number] == "amlp": + d_attn = neox_args.gmlp_attn_dim + else: + 
d_attn = None + self.sgu = SpatialGatingUnit( + neox_args, ff_dim_parallel, d_attn, causal=True, mask_fn=mask_fn + ) + self.output_linear = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=ff_dim, + output_size=neox_args.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + ) + + def forward(self, args): + assert len(args) == 2, "GMLPBlock expects 2 arguments" + x, attention_mask = args + x = self.norm(x) + x, _ = self.input_linear(x) + x = self.activation_func(x) + x = self.sgu(x, attention_mask) + x, _ = self.output_linear(x) + return x, attention_mask diff --git a/benchmarks/sizing/megatron/model/gpt2_model.py b/benchmarks/sizing/megatron/model/gpt2_model.py new file mode 100644 index 0000000..1283314 --- /dev/null +++ b/benchmarks/sizing/megatron/model/gpt2_model.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021 EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT-2 model.""" + +import math +import torch +import torch.nn as nn +from collections import defaultdict + +from functools import partial +from megatron.model.utils import Lambda, SequentialWrapper, recursive_setattr +from megatron.model.norms import get_norm +from megatron.model.init_functions import get_init_methods + +from megatron import mpu +from megatron.mpu import ParallelRelativePositionBias +from megatron.model.transformer import ( + ParallelTransformerLayerPipe, + NormPipe, + ParallelLinearPipe, + parallel_lm_logits, + ParallelLinear, +) +from megatron.model.gmlp import GMLPBlock +from megatron.model.word_embeddings import EmbeddingPipe, SoftEmbedding + +# Pipeline parallelism +from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec +from typing import Union, List + + +def gpt2_attention_mask_func(attention_scores, ltor_mask): + attention_scores.masked_fill_(ltor_mask, -10000.0) + return attention_scores + + +def cross_entropy(output, labels, _fp16=False): + """From pretrain_gpt2:forward_step()""" + """ + if self.fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = mpu.vocab_parallel_cross_entropy(output, labels) + else: + loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) + return loss + """ + labels, loss_mask = labels[0], labels[1] + if _fp16: + assert output.dtype == torch.half and loss_mask.dtype == torch.half + losses = mpu.vocab_parallel_cross_entropy(output.contiguous(), labels) + else: + losses = mpu.vocab_parallel_cross_entropy(output.float().contiguous(), labels) + loss_mask = loss_mask.view(-1) + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + return loss + + +def _pre_transformer_block(args): + # data format change for hidden_states to avoid explicit tranposes : [b s h] --> [s b h] + assert len(args) == 2, "Incorrect number of arguments to _pre_transformer_block" + fn = lambda _args: (_args[0].transpose(0, 
1).contiguous(), *_args[1:]) + return fn(args) + + +def _post_transformer_block(args): + # from (hidden_states, attention_mask) + # to (hidden_states.T) + assert len(args) == 2, "Incorrect number of arguments to _post_transformer_block" + fn = lambda _args: (_args[0].transpose(0, 1).contiguous()) + return fn(args) + + +class GPT2ModelPipe(PipelineModule, torch.nn.Module): + """GPT2Model adapted for pipeline parallelism. + + The largest change is flattening the GPTModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + + :param neox_args: NeoX arguments object (configuration) + :param num_tokentypes: number of token types (TODO: deprecated, remove) + :param parallel_output: if true, don't gather the output logits, and calculate loss in parallel. Set to true by default in training for efficiency, but set to false for inference. + :param topology: deepspeed topology object specifying pipe / model parallelism topology. + :param use_cache: if true, cache key/value pairs for each layer in inference. + """ + + def __init__( + self, + neox_args, + num_tokentypes=0, + parallel_output=True, + topology=None, + use_cache=False, + ): + self.neox_args = neox_args + + self.use_cache = use_cache + self.parallel_output = parallel_output + self.hidden_size = self.neox_args.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method, self.output_layer_init_method = get_init_methods( + self.neox_args + ) + self.__topology__ = topology + + self.specs = [] + self.init_specs() # initializes the layer specs (basically a fancy nn.Sequential) + + super().__init__( + layers=self.specs, + loss_fn=partial(cross_entropy, _fp16=self.neox_args.fp16_lm_cross_entropy), + topology=topology, + activation_checkpoint_interval=self.neox_args.checkpoint_num_layers + if self.neox_args.checkpoint_activations + else 0, + partition_method=neox_args.pipe_partition_method, + checkpointable_layers=["GMLPBlock", "ParallelTransformerLayerPipe"], + ) + + def insert_layers( + self, layers: Union[nn.Module, nn.ModuleList, nn.Sequential, List], idx + ): + """ + inserts the layers in `layers` into the pipe model at `idx`. 
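+        Accepts a single nn.Module, an nn.ModuleList / nn.Sequential, or a plain
+        list of callables; anything else raises a ValueError (see the checks
+        below).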
+ """ + if isinstance(layers, nn.Module): + self.specs.insert(idx, layers) + elif any( + [isinstance(layers, nn.ModuleList), isinstance(layers, nn.Sequential)] + ): + self.specs[idx:idx] = layers + elif isinstance(layers, list): + assert all( + [hasattr(l, "__call__") for l in layers] + ), "all items in `layers` must be Callables" + self.specs[idx:idx] = layers + else: + raise ValueError( + f"layer passed into {self.__class__.__name__}.insert_layer() should be either an nn.Module, an nn.ModuleList, an nn.Sequential object, or a list of callables not a {type(layers)}" + ) + + # re-initialize parent class + super().__init__( + layers=self.specs, + loss_fn=self.loss_fn, + topology=self.__topology__, + activation_checkpoint_interval=self.activation_checkpoint_interval, + partition_method=self.neox_args.pipe_partition_method, + checkpointable_layers=["GMLPBlock", "ParallelTransformerLayerPipe"], + ) + + def init_specs(self): + + weight_tying = not self.neox_args.no_weight_tying + self.specs = [] + + # Embedding layer + # input will be (input_ids, position_ids, attention_mask) + + if weight_tying: + self.specs.append( + TiedLayerSpec( + "embed", + EmbeddingPipe, + self.neox_args, + self.hidden_size, + self.neox_args.padded_vocab_size, + self.neox_args.max_position_embeddings, + self.neox_args.hidden_dropout, + self.init_method, + self.num_tokentypes, + tied_weight_attr="word_embeddings_weight", + ) + ) + else: + self.specs.append( + LayerSpec( + EmbeddingPipe, + self.neox_args, + self.hidden_size, + self.neox_args.padded_vocab_size, + self.neox_args.max_position_embeddings, + self.neox_args.hidden_dropout, + self.init_method, + self.num_tokentypes, + ) + ) + + # NB: the attention mask always needs to be the *last* item in the args when being passed from + # one stage to the next, because deepspeed is hacks on top of hacks. 
+ # + # outputs are now (hidden_states, attention_mask) + + self.specs.append(_pre_transformer_block) + + # T5 RPE positional embedding + if self.neox_args.pos_emb == "rpe": + hidden_size_per_attention_head = mpu.divide( + self.neox_args.hidden_size, self.neox_args.num_attention_heads + ) + rpe_scale = math.sqrt(hidden_size_per_attention_head) + rpe_emb = ParallelRelativePositionBias( + neox_args=self.neox_args, + scale=rpe_scale, + causal=True, + num_buckets=self.neox_args.rpe_num_buckets, + max_distance=self.neox_args.rpe_max_distance, + heads=self.neox_args.num_attention_heads, + ) + + # Transformer layers + for i in range(self.neox_args.num_layers): + layer_type = self.neox_args.attention_config[i] + if layer_type in ["gmlp", "amlp"]: + self.specs.append( + LayerSpec( + GMLPBlock, + init_method=self.init_method, + layer_number=i, + output_layer_init_method=self.output_layer_init_method, + neox_args=self.neox_args, + mask_fn=gpt2_attention_mask_func, + ) + ) + else: + self.specs.append( + LayerSpec( + ParallelTransformerLayerPipe, + neox_args=self.neox_args, + attention_mask_func=gpt2_attention_mask_func, + init_method=self.init_method, + output_layer_init_method=self.output_layer_init_method, + layer_number=i, + rpe=rpe_emb if self.neox_args.pos_emb == "rpe" else None, + rotary=self.neox_args.pos_emb == "rotary", + use_cache=self.use_cache, + ) + ) + + # used to drop attention mask + reshape hidden states + self.specs.append(_post_transformer_block) + + # NormPipe is a (deprecated) helper class that used to be used to pass presents along the pipeline - since presents are now cached to the `TransformerLayer` class this is no longer needed + norm, eps = get_norm(self.neox_args) + self.specs.append( + LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps) + ) + + # outputs are now a single tensor: hidden_states + + def _logits_helper(embedding, lm_output): + """Just a wrapper to massage inputs/outputs from pipeline.""" + if self.neox_args.use_mup: + # Since we're using pipeline parallelism, we can't directly use MuReadout. Instead, use this workaround that does the same thing as MuReadout. + # https://github.com/microsoft/mup/issues/6#issuecomment-1082156274 + lm_output = ( + lm_output + / self.tied_modules.embed.word_embeddings.weight.infshape.width_mult() + ) + + logits = parallel_lm_logits( + lm_output, embedding.word_embeddings_weight, self.parallel_output + ) + return logits + + if weight_tying: + self.specs.append( + TiedLayerSpec( + "embed", + EmbeddingPipe, + self.neox_args, + self.hidden_size, + self.neox_args.padded_vocab_size, + self.neox_args.max_position_embeddings, + self.neox_args.hidden_dropout, + self.init_method, + self.num_tokentypes, + forward_fn=_logits_helper, + tied_weight_attr="word_embeddings_weight", + ) + ) + else: + self.specs.append( + LayerSpec( + ParallelLinearPipe, + neox_args=self.neox_args, + init_method=self.init_method, + parallel_output=self.parallel_output, + is_last_layer=True, + ) + ) + + def _set_parallel_output(self, value): + # sets the parallel output value of the final layer to value + final_layer = list(self.forward_funcs)[-1] + if isinstance(final_layer, (ParallelLinearPipe, ParallelLinear)): + final_layer.final_linear.set_parallel_output(value) + + def inference_mode(self, use_cache=True): + """ + Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false, + so logits are gathered across model parallel ranks. 
+ + :param cache: (bool) True if you want to use caching during inference, False otherwise + """ + # first set caching to true if specified + recursive_setattr(self.forward_funcs, "use_cache", use_cache, assert_type=bool) + # then set parallel output of the final layer to false so we don't have to gather the output manually + self._set_parallel_output(False) + recursive_setattr(self.forward_funcs, "training", False) + + def train_mode(self): + """ + Sets up the model for training by turning off k/v caching and setting `parallel output` of the final layer to True, + so logits are not gathered across model parallel ranks, and loss is computed in parallel (more efficient). + """ + # set caching to false + recursive_setattr(self.forward_funcs, "use_cache", False) + # then set parallel output to true (more efficient training) + self._set_parallel_output(True) + recursive_setattr(self.forward_funcs, "training", True) + + def clear_cache(self): + """ + Recursively clears the kv cache on all layers + """ + recursive_setattr(self.forward_funcs, "layer_past", None) + + def to_sequential(self): + """ + Transforms the PipelineModule to a plain nn.Sequential module + :return: + """ + layers = [] + tied_layers = defaultdict(list) + for n, spec in enumerate(self.specs): + if isinstance(spec, TiedLayerSpec): + if spec.key in tied_layers: + # receiver + layers.append( + Lambda(lambda x: spec.forward_fn(tied_layers[spec.key][0], x)) + ) + else: + # owner + module = spec.build(log=False) + layers.append(module) + tied_layers[spec.key].append(module) + elif isinstance(spec, LayerSpec): + layers.append(spec.build(log=False)) + elif hasattr(spec, "__call__"): + # check that it's a callable function + layers.append(Lambda(spec)) + else: + raise ValueError(f"Layer number {n} ({spec}) Not recognized") + model = SequentialWrapper( + layers, + self.activation_checkpoint_interval, + self.activation_checkpoint_func, + parent_class_name=self.__class__.__name__, + ) + return model diff --git a/benchmarks/sizing/megatron/model/init_functions.py b/benchmarks/sizing/megatron/model/init_functions.py new file mode 100644 index 0000000..11bcdc3 --- /dev/null +++ b/benchmarks/sizing/megatron/model/init_functions.py @@ -0,0 +1,209 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
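+
+# Overview of the initializers defined in this file (std values taken from the functions below):
+#   init_method_normal:        N(0, sigma)
+#   scaled_init_method_normal: N(0, sigma / sqrt(2 * num_layers))
+#   small_init_init_method:    N(0, sqrt(2 / (5 * dim)))
+#   wang_init_method:          N(0, 2 / (num_layers * sqrt(dim)))
+#   orthogonal_init_method:    (semi) orthogonal matrix, scaled by sqrt(2 / n_layers)
+#   xavier_uniform / xavier_normal: standard Glorot initializations
+# Each factory returns an `init_` closure that fills a tensor in place, e.g. (illustrative values):
+#   init_fn = init_method_normal(sigma=0.02)
+#   init_fn(torch.empty(4096, 4096))  # filled in place with draws from N(0, 0.02^2)
+# When use_mup is enabled, the mup.init variants are used instead and the result is
+# additionally multiplied by mup_init_scale.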
+ +import math + +import torch + +try: + import mup +except ImportError: + pass + + +def init_method_normal(sigma, use_mup_outer=False, mup_init_scale=1.0): + """Init method based on N(0, sigma).""" + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.normal_(tensor, mean=0.0, std=sigma) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal( + sigma, num_layers, use_mup_outer=False, mup_init_scale=1.0 +): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.normal_(tensor, mean=0.0, std=std) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +# orthogonal init does not support fp16, so have to patch it +def _orthogonal(tensor, gain=1): + + if tensor.ndimension() < 2: + raise ValueError("Only tensors with 2 or more dimensions are supported") + + rows = tensor.size(0) + cols = tensor.numel() // rows + flattened = tensor.new(rows, cols).normal_(0, 1) + + if rows < cols: + flattened.t_() + + # Compute the qr factorization + dt = flattened.dtype + flattened = flattened.to(torch.float32) # orthogonal init does not support fp16 + q, r = torch.qr(flattened) + q, r = q.to(dtype=dt), r.to(dtype=dt) + # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf + d = torch.diag(r, 0) + ph = d.sign() + q *= ph + + if rows < cols: + q.t_() + + with torch.no_grad(): + tensor.view_as(q).copy_(q) + tensor.mul_(gain) + return tensor + + +def orthogonal_init_method(n_layers=1, use_mup=False, mup_init_scale=1.0): + """Fills the input Tensor with a (semi) orthogonal matrix, as described in + Exact solutions to the nonlinear dynamics of learning in deep linear neural networks - Saxe, A. et al. (2013) + Optionally scaling by number of layers possible, as introduced in OBST - Nestler et. al. (2021, to be released)""" + + if use_mup: + raise ValueError( + "Orthogonal init needs to be patched to support mup. Disable mup or use a different init method to avoid this error" + ) + + def init_(tensor): + return _orthogonal(tensor, math.sqrt(2 / n_layers)) + + return init_ + + +def xavier_uniform_init_method(use_mup_outer=False, mup_init_scale=1.0): + """Fills the input Tensor with values according to the method described in Understanding the difficulty of + training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a uniform distribution.""" + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.xavier_uniform_(tensor) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.xavier_uniform_(tensor) + + return init_ + + +def xavier_normal_init_method(use_mup_outer=False, mup_init_scale=1.0): + """Fills the input Tensor with values according to the method described in Understanding the difficulty of + training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010), using a normal distribution.""" + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.xavier_normal_(tensor) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.xavier_normal_(tensor) + + return init_ + + +def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): + """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving + the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" + std = math.sqrt(2 / (5 * dim)) + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.normal_(tensor, mean=0.0, std=std) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def wang_init_method(n_layers, dim, use_mup_outer=False, mup_init_scale=1.0): + std = 2 / n_layers / math.sqrt(dim) + + def init_(tensor, use_mup=use_mup_outer): + if use_mup: + mup.init.normal_(tensor, mean=0.0, std=std) + with torch.no_grad(): + tensor.mul_(mup_init_scale) + return tensor + else: + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def get_init_methods(args): + + if args.use_mup: + try: + import mup + except ModuleNotFoundError: + print("Please install mup https://github.com/microsoft/mup") + raise Exception + + def _get(name): + if name == "normal": + return init_method_normal( + args.init_method_std, args.use_mup, args.mup_init_scale + ) + elif name == "scaled_normal": + return scaled_init_method_normal( + args.init_method_std, args.num_layers, args.use_mup, args.mup_init_scale + ) + elif name == "orthogonal": + return orthogonal_init_method(args.use_mup, args.mup_init_scale) + elif name == "scaled_orthogonal": + return orthogonal_init_method( + args.num_layers, args.use_mup, args.mup_init_scale + ) + elif name == "xavier_uniform": + return xavier_uniform_init_method(args.use_mup, args.mup_init_scale) + elif name == "xavier_normal": + return xavier_normal_init_method(args.use_mup, args.mup_init_scale) + elif name == "wang_init": + return wang_init_method( + args.num_layers, args.hidden_size, args.use_mup, args.mup_init_scale + ) + elif name == "small_init": + return small_init_init_method( + args.hidden_size, args.use_mup, args.mup_init_scale + ) + else: + raise NotImplementedError(f"Unknown init method {name}") + + return _get(args.init_method), _get(args.output_layer_init_method) diff --git a/benchmarks/sizing/megatron/model/norms.py b/benchmarks/sizing/megatron/model/norms.py new file mode 100644 index 0000000..ddb45c3 --- /dev/null +++ b/benchmarks/sizing/megatron/model/norms.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
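+
+# Norms selected by get_norm(neox_args), as implemented below (each with its matching
+# *_epsilon value from neox_args):
+#   layernorm -> torch.nn.LayerNorm
+#   rmsnorm   -> RMSNorm:   y = scale * x / (rms(x) + eps), with rms(x) = ||x||_2 / sqrt(d);
+#                for partial RMSNorm (0 <= p <= 1) only the first int(d * p) features
+#                are used to compute rms(x)
+#   scalenorm -> ScaleNorm: y = g * x / max(||x||_2, eps)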
+ +import torch +from torch.nn import LayerNorm as LayerNorm + + +def get_norm(neox_args): + if neox_args.norm == "rmsnorm": + norm = RMSNorm + eps = neox_args.rms_norm_epsilon + elif neox_args.norm == "layernorm": + eps = neox_args.layernorm_epsilon + norm = LayerNorm + elif neox_args.norm == "scalenorm": + eps = neox_args.scalenorm_epsilon + norm = ScaleNorm + else: + raise ValueError(f"norm {neox_args.norm} not recognized") + return norm, eps + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): + """ + Root Mean Square Layer Normalization + :param dim: model size + :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) + :param eps: epsilon value, default 1e-8 + :param bias: whether use bias term for RMSNorm, disabled by + default because RMSNorm doesn't enforce re-centering invariance. + """ + super(RMSNorm, self).__init__() + + self.eps = eps + self.d = dim + self.p = p + self.bias = bias + + self.scale = torch.nn.Parameter(torch.ones(dim)) + self.register_parameter("scale", self.scale) + + if self.bias: + self.offset = torch.nn.Parameter(torch.zeros(dim)) + self.register_parameter("offset", self.offset) + + def forward(self, x): + if self.p < 0.0 or self.p > 1.0: + norm_x = x.norm(2, dim=-1, keepdim=True) + d_x = self.d + else: + partial_size = int(self.d * self.p) + partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) + + norm_x = partial_x.norm(2, dim=-1, keepdim=True) + d_x = partial_size + + rms_x = norm_x * d_x ** (-1.0 / 2) + x_normed = x / (rms_x + self.eps) + + if self.bias: + return self.scale * x_normed + self.offset + + return self.scale * x_normed + + +class ScaleNorm(torch.nn.Module): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.g = torch.nn.Parameter(torch.ones(1)) + self.eps = eps + + def forward(self, x): + n = torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps) + return x / n * self.g diff --git a/benchmarks/sizing/megatron/model/positional_embeddings.py b/benchmarks/sizing/megatron/model/positional_embeddings.py new file mode 100644 index 0000000..6881507 --- /dev/null +++ b/benchmarks/sizing/megatron/model/positional_embeddings.py @@ -0,0 +1,223 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
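+
+# Positional embedding variants implemented in this file:
+#   SinusoidalPositionalEmbedding: fixed sin/cos features with inv_freq = 1 / base^(2i/dim)
+#   RotaryEmbedding + apply_rotary_pos_emb: RoPE, applied as
+#       q' = q * cos + rotate_half(q) * sin   (and likewise for k),
+#       with the cos/sin caches rebuilt whenever the sequence length changes
+#   AliBi: adds a per-head linear bias to the attention scores; for a power-of-two head
+#       count n the slopes form the geometric sequence 2^(-8/n), 2^(-16/n), ..., and each
+#       model-parallel rank keeps only the slice of slopes for its own heads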
+ +import torch +import math + + +class SinusoidalPositionalEmbedding(torch.nn.Module): + def __init__(self, dim, base=10000, precision=torch.half): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.precision = precision + + def forward(self, x, seq_dim=1): + t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + sinusoid_inp = torch.einsum("i,j->ij", t, self.inv_freq) + if self.precision == torch.bfloat16: + sinusoid_inp = sinusoid_inp.float() + sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos() + if self.precision == torch.bfloat16: + sin, cos = sin.bfloat16(), cos.bfloat16() + emb = torch.cat((sin, cos), dim=-1) + return emb[None, :, :] + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, base=10000, precision=torch.half): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.seq_len_cached = None + self.cos_cached = None + self.sin_cached = None + self.precision = precision + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + if seq_len != self.seq_len_cached: + self.seq_len_cached = seq_len + t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + if self.precision == torch.bfloat16: + emb = emb.float() + self.cos_cached = emb.cos()[:, None, None, :] + self.sin_cached = emb.sin()[:, None, None, :] + if self.precision == torch.bfloat16: + self.cos_cached = self.cos_cached.bfloat16() + self.sin_cached = self.sin_cached.bfloat16() + return self.cos_cached, self.sin_cached + + +# rotary pos emb helpers: + + +def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return torch.cat( + (-x2, x1), dim=x1.ndim - 1 + ) # dim=-1 triggers a bug in earlier torch versions + + +@torch.jit.script +def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): + cos, sin = ( + cos[offset : q.shape[0] + offset, ...], + sin[offset : q.shape[0] + offset, ...], + ) + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + + +def apply_rotary_pos_emb_torch( + q, k, cos, sin, offset: int = 0 +): # jitting fails with bf16 + cos, sin = ( + cos[offset : q.shape[0] + offset, ...], + sin[offset : q.shape[0] + offset, ...], + ) + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + + +class AliBi(torch.nn.Module): + def __init__(self, num_heads, mp_size=1, mp_rank=1): + super().__init__() + # megatron splits across heads, so we need to make sure each + # head receives the correct matrix + assert mp_size <= num_heads and mp_rank <= mp_size + self.mp_size = mp_size + self.mp_rank = mp_rank + self.num_heads = num_heads + self.slice_size = num_heads // mp_size + self.cached_matrix = None + self.cached_seq_len = None + slopes = torch.Tensor(self._get_slopes(num_heads))[ + mp_rank * self.slice_size : (mp_rank + 1) * self.slice_size + ] + self.register_buffer("slopes", slopes) + + def _get_slopes(self, n): + """ + Get slopes for Alibi positional embedding + n : int = number of heads. + For best performance, restrict n to a power of 2. 
+ """ + + def get_slopes_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + self._get_slopes(2 * closest_power_of_2)[0::2][ + : n - closest_power_of_2 + ] + ) + + def bias(self, seq_len_q, seq_len_k, device, dtype): + # [b, np, sq, sk] + # seq_len_q = x.shape[-2] + # seq_len_k = x.shape[-1] + + # Initialize the AliBi matrix to match the first provided key length; grow it exponentially + # afterwards if longer inputs are provided. This is important for inference, where we will + # encounter progressively longer samples; it should have no effect at training time. + if self.cached_seq_len is not None and self.cached_seq_len >= seq_len_k: + a = self.cached_matrix + else: + target_seq_len = ( + seq_len_k if self.cached_seq_len is None else self.cached_seq_len * 4 + ) + a = -torch.tril( + torch.arange(target_seq_len) + .view(target_seq_len, 1) + .repeat(1, target_seq_len) + + torch.arange(0, -target_seq_len, -1) + ) + a = a.to(device).to(dtype) + slopes = self.slopes.to(a.device).to(a.dtype) + a = a * slopes.view(self.slopes.shape[0], 1, 1) + self.cached_seq_len = target_seq_len + self.cached_matrix = a + + # If the AliBi matrix is larger than the key length, clip it. + if self.cached_seq_len > seq_len_k: + a = self.cached_matrix[:, :seq_len_k, :seq_len_k] + + if seq_len_q != seq_len_k: + # In the train case x has dimensionality [b, np, sq, sk] with sq == sk + # The number of query tokens is equal to the number of key tokens + # At inference time with cache in layer_past sq is not equal to sk. sq only contains one token (the last one in the full sequence) + # In this case we use the appropriate token index of the cache matrix. + # As the cache matrix could already be bigger from a past inference, not the last token index in the sq sequence is used + assert ( + seq_len_q == 1 + ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1" + a = a[:, seq_len_k - 1, :].view( + a.shape[0], 1, a.shape[2] + ) # seq_len_k - 1 points to the last token index in the current inference batch. + + return a + + def forward(self, x): + # [b, np, sq, sk] + seq_len_q = x.shape[-2] + seq_len_k = x.shape[-1] + + # Initialize the AliBi matrix to match the first provided key length; grow it exponentially + # afterwards if longer inputs are provided. This is important for inference, where we will + # encounter progressively longer samples; it should have no effect at training time. + if self.cached_seq_len is not None and self.cached_seq_len >= seq_len_k: + a = self.cached_matrix + else: + target_seq_len = ( + seq_len_k if self.cached_seq_len is None else self.cached_seq_len * 4 + ) + a = -torch.tril( + torch.arange(target_seq_len) + .view(target_seq_len, 1) + .repeat(1, target_seq_len) + + torch.arange(0, -target_seq_len, -1) + ) + a = a.to(x.device).to(x.dtype) + slopes = self.slopes.to(a.device).to(a.dtype) + a = a * slopes.view(self.slopes.shape[0], 1, 1) + self.cached_seq_len = target_seq_len + self.cached_matrix = a + + # If the AliBi matrix is larger than the key length, clip it. 
+ if self.cached_seq_len > seq_len_k: + a = self.cached_matrix[:, :seq_len_k, :seq_len_k] + + if seq_len_q != seq_len_k: + # In the train case x has dimensionality [b, np, sq, sk] with sq == sk + # The number of query tokens is equal to the number of key tokens + # At inference time with cache in layer_past sq is not equal to sk. sq only contains one token (the last one in the full sequence) + # In this case we use the appropriate token index of the cache matrix. + # As the cache matrix could already be bigger from a past inference, not the last token index in the sq sequence is used + assert ( + seq_len_q == 1 + ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1" + a = a[:, seq_len_k - 1, :].view( + a.shape[0], 1, a.shape[2] + ) # seq_len_k - 1 points to the last token index in the current inference batch. + + return x + a diff --git a/benchmarks/sizing/megatron/model/transformer.py b/benchmarks/sizing/megatron/model/transformer.py new file mode 100644 index 0000000..f6e72e9 --- /dev/null +++ b/benchmarks/sizing/megatron/model/transformer.py @@ -0,0 +1,1031 @@ +# Copyright (c) 2021 EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformer.""" + +import math +import torch +import torch.nn.functional as F +import torch.nn as nn + +from .norms import get_norm +from megatron import mpu +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.model.activations import get_activation +from megatron.model.utils import exists, get_fusion_type +from megatron.model.positional_embeddings import ( + RotaryEmbedding, + apply_rotary_pos_emb_torch, + apply_rotary_pos_emb, + AliBi, +) +from megatron.model.fused_bias_dropout import ( + get_bias_dropout_add, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, +) +from megatron.model.utils import configure_sparse_attention + +import time + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters + attention_mask_func: a function that takes `unmasked-attention-scores` + with size [b, np, s, s] and an `attention-mask` and will apply + the masking. The function should return a masked score of the + same size [b, np, s, s]. 
+ masked-attention-scores = attention_mask_func( + unmasked-attention-scores, attention-mask) +""" + + +class ParallelMLP(nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + """ + + def __init__( + self, neox_args, init_method, output_layer_init_method, parallel_output=False + ): + super().__init__() + + self.activation_func = get_activation(neox_args) + self.activation_type = neox_args.activation + self.bias_gelu_fusion = neox_args.bias_gelu_fusion + + # auto scale so geglu has equal parameters + ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 + ff_dim = ( + int(ff_mult * neox_args.hidden_size) * 2 + if self.activation_type == "geglu" + else ff_mult * neox_args.hidden_size + ) + self.dense_h_to_4h = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=ff_dim, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + ) + + + ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim + # Project back to h. + self.dense_4h_to_h = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=ff_dim_in, + output_size=neox_args.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + parallel_output=parallel_output, + ) + + def forward(self, hidden_states): + #torch.cuda.synchronize() + #st = time.time() + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + + if ( + self.activation_type == "gelu" and self.bias_gelu_fusion + ) or self.activation_type == "geglu": + intermediate_parallel = self.activation_func( + intermediate_parallel, bias_parallel + ) + else: + intermediate_parallel = self.activation_func( + intermediate_parallel + bias_parallel + ) + #torch.cuda.synchronize() + #print(f"MLP_h_4h: {time.time()-st}") + + #torch.cuda.synchronize() + #st=time.time() + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + #torch.cuda.synchronize() + #print(f"MLP_4h_h: {time.time()-st}") + return output, output_bias + + +class LLaMAParallelMLP(nn.Module): + """LLaMA's MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. 
+ + Note: multiple_of is used to compute the hidden dimension of the MLP + """ + + def __init__( + self, + neox_args, + init_method, + output_layer_init_method, + parallel_output=False, + multiple_of=256, + ): + super().__init__() + + self.activation_func = get_activation(neox_args) + self.activation_type = neox_args.activation + + self.multiple_of = multiple_of + + ff_dim = int(2 * neox_args.hidden_size * 4 / 3) + ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + self.w1 = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=ff_dim, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + bias=False, + ) + self.w3 = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=ff_dim, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + bias=False, + ) + self.w2 = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=ff_dim, + output_size=neox_args.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + parallel_output=parallel_output, + bias=False, + ) + + def forward(self, hidden_states): + w1_out, _ = self.w1(hidden_states) + w3_out, _ = self.w3(hidden_states) + return self.w2(self.activation_func(w1_out) * w3_out) + + +class ParallelLinear(nn.Module): + """ + A Parallel Linear Layer transforming the transformer outputs from hidden_size -> vocab_size + """ + + def __init__( + self, + neox_args, + parallel_output=True, + init_method=nn.init.xavier_normal_, + is_last_layer=False, + ): + super().__init__() + parallelism = neox_args.output_layer_parallelism + if parallelism == "column": + self.final_linear = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=neox_args.padded_vocab_size, + bias=False, + init_method=init_method, + gather_output=not parallel_output, + skip_bias_add=False, + mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here + ) +# else: +# print( +# 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' +# ) +# exit() +# self.final_linear = mpu.RowParallelLinear( +# neox_args=neox_args, +# input_size=neox_args.hidden_size, +# output_size=neox_args.padded_vocab_size, +# bias=False, +# input_is_parallel=False, +# init_method=init_method, +# parallel_output=parallel_output, +# skip_bias_add=False, +# mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here +# ) + + def forward(self, hidden_states): + return self.final_linear(hidden_states) + + +class ParallelSelfAttention(nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [b, s, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + neox_args, + attention_mask_func, + init_method, + output_layer_init_method, + layer_number, + rpe=None, + rotary=False, + use_cache=False, + parallel_output=False, + ): + super().__init__() + + self.fp16 = neox_args.precision == "fp16" + self.bf16 = neox_args.precision == "bfloat16" + self.attention_mask_func = attention_mask_func + self.apply_query_key_layer_scaling = neox_args.apply_query_key_layer_scaling + self.use_cache = use_cache + self.attention_softmax_in_fp32 = neox_args.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = layer_number + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(neox_args.hidden_size, world_size) + self.hidden_size_per_attention_head = mpu.divide( + neox_args.hidden_size, neox_args.num_attention_heads + ) + self.num_attention_heads_per_partition = mpu.divide( + neox_args.num_attention_heads, world_size + ) + self.pos_emb = neox_args.pos_emb + + # Strided linear layer. + self.query_key_value = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=3 * neox_args.hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = max(1, self.layer_number) + self.norm_factor *= coeff + + if neox_args.use_mup: + self.norm_factor = self.hidden_size_per_attention_head + + self.rpe = rpe + + if self.pos_emb == "alibi": + self.alibi_embed = AliBi( + neox_args.num_attention_heads, + neox_args.model_parallel_size, + mpu.get_model_parallel_rank(), + ) + + # TODO: this arg shouldn't need to be passed in - get from neox_args + if rotary: + if neox_args.rotary_pct == 1: + self.rotary_ndims = None + else: + assert neox_args.rotary_pct < 1 + self.rotary_ndims = int( + self.hidden_size_per_attention_head * neox_args.rotary_pct + ) + dim = ( + self.rotary_ndims + if self.rotary_ndims is not None + else self.hidden_size_per_attention_head + ) + self.rotary_emb = RotaryEmbedding( + dim, base=neox_args.rotary_emb_base, precision=neox_args.params_dtype + ) + else: + self.rotary_emb = None + + self.attention_type = neox_args.attention_config[layer_number] + self.use_flash_attention = self.attention_type == "flash" + self.sparse = self.attention_type not in ("global", "flash") + if self.sparse: + self.sparse_attn = configure_sparse_attention( + neox_args, + self.attention_type, + self.num_attention_heads_per_partition, + mpu=mpu, + ) + else: + if self.use_flash_attention: + from megatron.model.flash_attention import ( + # flash_attn_unpadded_qkvpacked_func_cuda, + # flash_attn_unpadded_kvpacked_func_cuda, + # Change of function names going from flash attention 1 -> flash attention 2 + flash_attn_varlen_qkvpacked_func, + flash_attn_varlen_kvpacked_func, + flash_attn_unpadded_unpacked_func_triton + ) + + self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton + self.flash_qkv_fn = flash_attn_varlen_qkvpacked_func + self.flash_kv_fn = flash_attn_varlen_kvpacked_func + else: + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.fp16, + input_in_bf16=self.bf16, + fusion_type=get_fusion_type(neox_args), + mask_func=self.attention_mask_func, + softmax_in_fp32=self.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. 
Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout_p = neox_args.attention_dropout + self.attention_dropout = nn.Dropout(self.dropout_p) + + # Output. + self.dense = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=neox_args.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + parallel_output=parallel_output, + bias=neox_args.use_bias_in_attn_linear, + ) + + def attention( + self, query_layer, key_layer, value_layer, layer_past, attention_mask + ): + # =================================== + # Raw attention scores. [b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view( + output_size[2], output_size[0] * output_size[1], -1 + ) + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocating result tensor: [b * np, sq, sk] + matmul_result = torch.empty( + output_size[0] * output_size[1], + output_size[2], + output_size[3], + dtype=query_layer.dtype, + device=torch.cuda.current_device(), + ) + #torch.cuda.synchronize() + #st = time.time() + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_result, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + #torch.cuda.synchronize() + #print(f"Attention Score: {time.time()-st}") + + + # ================================================== + # Update attention mask for inference. [b, np, sq, sk] + # ================================================== + + if self.use_cache: + with torch.no_grad(): + attention_mask = attention_mask[ + ..., : attention_scores.size(3), : attention_scores.size(3) + ] + + + # =========================== + # Attention probs and dropout + # =========================== + + if exists(self.rpe): + rpe = self.rpe(query_layer.size(0), key_layer.size(0)) + attention_scores += rpe # [1, np, sq, sk] + + if self.pos_emb == "alibi": + attention_scores = self.alibi_embed(attention_scores) + + # attention scores and attention mask [b, np, sq, sk] + #torch.cuda.synchronize() + #st=time.time() + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) + #torch.cuda.synchronize() + #print(f"Attention Softmax: {time.time()-st}") + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + #torch.cuda.synchronize() + #st=time.time() + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + #torch.cuda.synchronize() + #print(f"Attention Dropout: {time.time()-st:.20f}") + + + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3), + ) + + # change view [sk, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1 + ) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view( + output_size[0] * output_size[1], output_size[2], -1 + ) + #torch.cuda.synchronize() + #st=time.time() + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + #torch.cuda.synchronize() + #print(f"Attention Over Value: {time.time()-st}") + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + return context_layer + + def flash_attention(self, query_layer, key_layer, value_layer): + # [b, np, sq, sk] + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) + + if self.pos_emb != "alibi": + + # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] + key_layer = key_layer.transpose(0, 1).reshape( + output_size[0] * output_size[3], 1, output_size[1], -1 + ) + value_layer = value_layer.transpose(0, 1).reshape( + output_size[0] * output_size[3], 1, output_size[1], -1 + ) + + batch_size = output_size[0] + max_seqlen_q = output_size[2] + max_seqlen_k = output_size[3] + + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, + dtype=torch.int32, + device=query_layer.device, + ) + + cu_seqlens_k = torch.arange( + 0, + (batch_size + 1) * max_seqlen_k, + step=max_seqlen_k, + dtype=torch.int32, + device=key_layer.device, + ) + + if not self.training: + + # [sq, b, np, hn] -> [b * sq, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], output_size[1], -1 + ) + + # Combined k/v into [b * sk, 2, np, hn]. + kv = torch.concat([key_layer, value_layer], dim=1) + output = self.flash_kv_fn( + query_layer, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + + else: + + # [sq, b, np, hn] -> [b * sq, 1, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], 1, output_size[1], -1 + ) + + # Combined q/k/v into [b * s, 3, np, hn]. + qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) + output = self.flash_qkv_fn( + qkv, + cu_seqlens_q, + max_seqlen_q, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + + # [b * sq, np, hn] -> [b, sq, np, hn] + matmul_result = output.view( + output_size[0], output_size[2], output.shape[1], output.shape[2] + ) + # [b, sq, np, hn] -> [b, np, sq, hn] + matmul_result = matmul_result.transpose(1, 2) + + else: + # [sq, b, np, hn] -> [b, sq, np, hn] + sq = query_layer.size(0) + b = query_layer.size(1) + sk = key_layer.size(0) + + query_layer = query_layer.transpose(0, 1) + key_layer = key_layer.transpose(0, 1) + value_layer = value_layer.transpose(0, 1) + + bias = self.alibi_embed.bias(sq, sk, query_layer.device, query_layer.dtype) + bias = bias.unsqueeze(0).tile((b, 1, 1, 1)) + + matmul_result = self.flash_triton_fn( + query_layer, key_layer, value_layer, bias=bias, causal=True + ) + matmul_result = matmul_result.transpose(1, 2) + + return matmul_result + + def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): + # TODO: sparse attn dropout? 
+ # TODO: pad to block size + # shape of q/k/v is [sq, b, np, hn] and needs to be transposed to [b, np, sq, hn] + query_layer, key_layer, value_layer = map( + lambda t: t.permute(1, 2, 0, 3).contiguous(), + (query_layer, key_layer, value_layer), + ) + # output shape [b, np(heads), sq, hn] + attn_mask = attention_mask.to(query_layer.dtype) * -10000 + if exists(self.rpe): + rpe = self.rpe(query_layer.size(0), key_layer.size(0)) + else: + rpe = None + return self.sparse_attn( + query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe + ) + + def forward(self, hidden_states, attention_mask, layer_past=None): + + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + #torch.cuda.synchronize() + #st = time.time() + mixed_x_layer, _ = self.query_key_value(hidden_states) + + #torch.cuda.synchronize() + #print(f"QKV Transform: {time.time()-st}") + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( + mixed_x_layer, 3 + ) + + if exists(self.rotary_emb): + if exists(self.rotary_ndims): + # partial rotary + query_rot, query_pass = ( + query_layer[..., : self.rotary_ndims], + query_layer[..., self.rotary_ndims :], + ) + key_rot, key_pass = ( + key_layer[..., : self.rotary_ndims], + key_layer[..., self.rotary_ndims :], + ) + else: + # full rotary + query_rot, key_rot = query_layer, key_layer + + apply_rotary_fn = ( + apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb + ) + + seq_len = key_layer.shape[0] + offset = 0 + if exists(layer_past) and layer_past.numel() > 0: + offset = layer_past[0].shape[0] + seq_len += offset + cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) + query_layer, key_layer = apply_rotary_fn( + query_rot, key_rot, cos, sin, offset=offset + ) + + if exists(self.rotary_ndims): + query_layer = torch.cat((query_layer, query_pass), dim=-1) + key_layer = torch.cat((key_layer, key_pass), dim=-1) + + + + # ================================== + # Cache key and value for inference + # ================================== + + if exists(layer_past) and layer_past.numel() > 0: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=0) + value_layer = torch.cat( + (past_value.type_as(value_layer), value_layer), dim=0 + ) + + if self.use_cache: + present = torch.stack((key_layer, value_layer)) + + if self.use_flash_attention: + context_layer = self.flash_attention(query_layer, key_layer, value_layer) + #print("using flash") + elif not self.sparse: + context_layer = self.attention( + query_layer, key_layer, value_layer, layer_past, attention_mask + ) + else: + context_layer = self.sparse_attention( + query_layer, key_layer, value_layer, attention_mask + ) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, + ) + context_layer = context_layer.view(*new_context_layer_shape) + + # ================= + # Output. 
[sq, b, h] + # ================= + + #torch.cuda.synchronize() + #st=time.time() + output, bias = self.dense(context_layer) + + if self.use_cache: + output = [output, present] + #torch.cuda.synchronize() + #print(f"Attention linproj: {time.time()-st}") + + + + return output, bias + + +class ParallelTransformerLayer(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [b, s, h] and returns an + output of the same size. + """ + + def __init__( + self, + neox_args, + attention_mask_func, + init_method, + output_layer_init_method, + layer_number, + rpe=None, + rotary=False, + use_cache=False, + ): + + super().__init__() + self.layer_number = layer_number + + norm, eps = get_norm(neox_args) + + # Layernorm on the input data. + self.input_layernorm = norm(neox_args.hidden_size, eps=eps) + self.use_cache = use_cache + + self.hidden_dropout = neox_args.hidden_dropout + self.bias_dropout_fusion = neox_args.bias_dropout_fusion + self.gpt_j_residual = neox_args.gpt_j_residual + self.gpt_j_tied = neox_args.gpt_j_tied + self.mlp_type = neox_args.mlp_type + + if self.gpt_j_residual: + self.reduce = mpu.mappings.reduce_from_model_parallel_region + + # Self attention. + self.attention = ParallelSelfAttention( + neox_args=neox_args, + attention_mask_func=attention_mask_func, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + layer_number=layer_number, + rpe=rpe, + use_cache=self.use_cache, + rotary=rotary, + parallel_output=self.gpt_j_residual, + ) + + # Layernorm on the output of the attention layer. + # If GPT-J residuals are used, this is surpurfulous but leaving it in + # leads to cleaner code + self.post_attention_layernorm = norm(neox_args.hidden_size, eps=eps) + + # MLP + if neox_args.mlp_type == "regular": + self.mlp = ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + ) + elif neox_args.mlp_type == "llama": + self.mlp = LLaMAParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + ) + else: + raise KeyError(neox_args.mlp_type) + + self.layer_past = None # used to cache k/v pairs in inference + + def _get_bias_dropout(self): + if self.bias_dropout_fusion: + fn = ( + bias_dropout_add_fused_train + if self.training + else bias_dropout_add_fused_inference + ) + else: + fn = get_bias_dropout_add(self.training) + return fn + + def forward(self, x, attention_mask, layer_past=None): + #torch.cuda.synchronize() + #total_st = time.time() + layer_past = layer_past if layer_past is not None else self.layer_past + bias_dropout_fn = self._get_bias_dropout() + # x: [b, s, h] + if self.gpt_j_residual: + # pseudocode: + # x = x + attn(ln(x)) + mlp(ln(x)) + # this means we can avoid doing the allreduce in the attn / mlp outputs + # to save communication time (we can do a single allreduce after we add mlp / attn outputs). + # due to a bug, the two layernorms are not tied in GPT-NeoX-20B. 
This is non-desirable, but + # we preserve the functionality for backwards compatibility + + residual = x + # applies the correct normalization depending on if the norms are tied + if self.gpt_j_tied: + x = self.input_layernorm(x) + x1, x2 = x, x + else: + x1, x2 = self.input_layernorm(x), self.post_attention_layernorm(x) + + # attention operator + attention_output, attention_bias = self.attention( + x1, attention_mask, layer_past=layer_past + ) + if self.use_cache: + attention_output, presents = attention_output + self.layer_past = presents + + with torch.enable_grad(): + attention_output = bias_dropout_fn( + attention_output, + bias=attention_bias.expand_as(attention_output), + residual=None, + prob=self.hidden_dropout, + ) + + # mlp operator + mlp_output, mlp_bias = self.mlp(x2) + with torch.enable_grad(): + output = bias_dropout_fn( + mlp_output, + bias=mlp_bias.expand_as(mlp_output), + residual=attention_output, + prob=self.hidden_dropout, + ) + + # output = (x + attn(ln(x)) + mlp(ln(x)) + output = residual + self.reduce(output) + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + + residual = x + + # x = x + attn(ln1(x)) + #torch.cuda.synchronize() + #st = time.time() + ln_input = self.input_layernorm(x) + #torch.cuda.synchronize() + #print(f"LN1: {time.time()-st}") + + attention_output, attention_bias = self.attention( + ln_input, attention_mask, layer_past=layer_past + ) + if self.use_cache: + attention_output, presents = attention_output + self.layer_past = presents + #torch.cuda.synchronize() + #st=time.time() + with torch.enable_grad(): + if attention_bias is not None: + # Use special bias_dropout_fn if we have a bias term from the above attention layer + attention_output = bias_dropout_fn( + attention_output, + bias=attention_bias.expand_as(residual), + residual=residual, + prob=self.hidden_dropout, + ) + else: + # Otherwise just apply dropout + residual + print("simple dropout") + attention_output = ( + torch.nn.functional.dropout( + attention_output, + p=self.hidden_dropout, + training=self.training, + ) + + ) + #torch.cuda.synchronize() + #print(f"Post-attention Dropout: {time.time()-st}") + #torch.cuda.synchronize() + #st=time.time() + attention_output += residual + #torch.cuda.synchronize() + #print(f"Post-attention residual: {time.time()-st}") + # output = x + mlp(ln2(x)) + #torch.cuda.synchronize() + #st=time.time() + ln2_output = self.post_attention_layernorm(attention_output) + #torch.cuda.synchronize() + #print(f"LN2: {time.time()-st}") + mlp_output, mlp_bias = self.mlp( + ln2_output + ) + #torch.cuda.synchronize() + #st=time.time() + with torch.enable_grad(): + if self.mlp_type == "llama": + # No dropout either + assert mlp_bias is None + output = mlp_output + attention_output + else: + output = bias_dropout_fn( + mlp_output, + bias=mlp_bias.expand_as(attention_output), + residual=attention_output, + prob=self.hidden_dropout, + ) + #torch.cuda.synchronize() + #print(f"Post-MLP residual: {time.time()-st}") + #torch.cuda.synchronize() + #print(f"Attention layer time: {time.time()-total_st}") + return output + + +class ParallelTransformerLayerPipe(ParallelTransformerLayer): + """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.""" + + def forward(self, args): + assert ( + len(args) == 2 + ), "ParallelTransformerLayerPipe expects 2 arguments - hidden_states and attention_mask" + hidden_states, attention_mask = args + # we are returning just [hidden_states, mask] + return super().forward(hidden_states, attention_mask), 
attention_mask + + +class ParallelLinearPipe(ParallelLinear): + """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" + + def forward(self, args): + assert isinstance( + args, torch.Tensor + ), "ParallelLinearPipe expects a single argument - hidden_states" + hidden_state = args + logits, bias = super().forward(hidden_state) + return logits + + +class NormPipe(nn.Module): + """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" + + def __init__(self, norm_class, hidden_size, eps): + super().__init__() + self.norm = norm_class(hidden_size, eps=eps) + + def forward(self, args): + assert not isinstance( + args, tuple + ), "NormPipe should only receive a single tensor as input" + return self.norm(args) + + +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): + """LM logits using word embedding weights.""" + # Parallel logits. + input_parallel = mpu.copy_to_model_parallel_region(input_) + + # Matrix multiply. + if bias is None: + logits_parallel = F.linear(input_parallel, word_embeddings_weight) + else: + logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) + + # Gather if needed. + if parallel_output: + return logits_parallel + + return mpu.gather_from_model_parallel_region(logits_parallel) diff --git a/benchmarks/sizing/megatron/model/utils.py b/benchmarks/sizing/megatron/model/utils.py new file mode 100644 index 0000000..141a6e4 --- /dev/null +++ b/benchmarks/sizing/megatron/model/utils.py @@ -0,0 +1,332 @@ +# Copyright (c) 2021 EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for models.""" + +import torch +from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm +from megatron.model.fused_softmax import SoftmaxFusionTypes +from types import GeneratorType +import torch.distributed as dist + + +def get_params_for_weight_decay_optimization(module, neox_args): + """Divide params into with-weight-decay and without-weight-decay groups. + Layernorms and biases will have no weight decay but the rest will. 
+ """ + weight_decay_params = {"params": []} + no_weight_decay_params = {"params": [], "weight_decay": 0.0} + for module_ in module.modules(): + if any( + [ + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, ScaleNorm), + ] + ) or ( + neox_args.weight_decay == 0.0 + ): # also include all parameters here if no weight decay is being done + no_weight_decay_params["params"].extend( + [p for p in list(module_._parameters.values()) if p is not None] + ) + else: + weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n != "bias" + ] + ) + no_weight_decay_params["params"].extend( + [ + p + for n, p in list(module_._parameters.items()) + if p is not None and n == "bias" + ] + ) + if neox_args.weight_decay == 0.0: + # only return a single param group + # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. + # to avoid this, only use a single param group when weight decay is off. + return [no_weight_decay_params] + return weight_decay_params, no_weight_decay_params + + +def exists(x): + return x is not None + + +class Lambda(torch.nn.Module): + def __init__(self, func): + super().__init__() + self.func = func + + def forward(self, x): + return self.func(x) + + +class SequentialWrapper(torch.nn.Module): + """ + Used to convert a deepspeed PipelineModule to an nn.Sequential like model whilst retaining + activation checkpointing. + """ + + def __init__( + self, + layers, + activation_checkpoint_interval, + activation_checkpoint_func, + parent_class_name=None, + ): + super().__init__() + self.sequential = torch.nn.Sequential(*layers) + self.activation_checkpoint_interval = activation_checkpoint_interval + self.parent_class_name = parent_class_name + self.activation_checkpoint_func = activation_checkpoint_func + + def _is_checkpointable(self, funcs): + if self.parent_class_name == "GPT2ModelPipe": + return all( + "ParallelTransformerLayerPipe" in f.__class__.__name__ for f in funcs + ) + params = [f.parameters() for f in funcs if isinstance(f, torch.nn.Module)] + return any(len(list(p)) > 0 for p in params) + + def inference_mode(self, use_cache=True): + """ + Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false, + so logits are gathered across model parallel ranks. + + :param cache: (bool) True if you want to use caching during inference, False otherwise + """ + _set_use_cache(self.sequential, use_cache) + recursive_setattr(self.sequential, "training", False) + + def train_mode(self): + """ + Sets up the model for training by turning off k/v caching. 
+ """ + _set_use_cache(self.sequential, False) + recursive_setattr(self.sequential, "training", True) + + def forward( + self, forward_input, curriculum_seqlen=None, labels=None, neox_args=None + ): + + if ( + curriculum_seqlen is not None + and isinstance(forward_input, tuple) + and len(forward_input) == 3 + ): + neox_args.update_value("curriculum_seqlen", curriculum_seqlen) + tokens = forward_input[0] + input_ids = forward_input[1] + attention_mask = forward_input[2] + if curriculum_seqlen < input_ids.size()[1]: + # seqlen-based curriculum learning + # input_ids, position_ids, labels have size [batch size, seqlen] + input_ids = input_ids[:, :curriculum_seqlen].contiguous() + tokens = tokens[:, :curriculum_seqlen].contiguous() + # position_ids = position_ids[:, :curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, :curriculum_seqlen].contiguous() + # attention_mask has size [1, 1, seqlen, seqlen] + attention_mask = attention_mask[ + :, :, :curriculum_seqlen, :curriculum_seqlen + ].contiguous() + forward_input = (tokens, input_ids, attention_mask) + + def exec_range_func(start, end): + """Helper function to be used with checkpoint() + Adapted from torch.utils.checkpoint:checkpoint_sequential() + """ + + def exec_func(*inputs): + # Single tensor inputs need to be unwrapped + if len(inputs) == 1: + inputs = inputs[0] + for idx, layer in enumerate(self.sequential[start:end]): + inputs = layer(inputs) + return inputs + + return exec_func + + if self.activation_checkpoint_interval == 0: + func = exec_range_func(0, len(self.sequential)) + x = func(forward_input) + else: + num_layers = len(self.sequential) + x = forward_input + for start_idx in range(0, num_layers, self.activation_checkpoint_interval): + end_idx = min( + start_idx + self.activation_checkpoint_interval, num_layers + ) + + funcs = self.sequential[start_idx:end_idx] + # Since we either pass tensors or tuples of tensors without unpacking, we + # need to be careful not to double-wrap tensors with tuple. + if not isinstance(x, tuple): + x = (x,) + + if self._is_checkpointable(funcs): + x = self.activation_checkpoint_func( + exec_range_func(start_idx, end_idx), *x + ) + else: + x = exec_range_func(start_idx, end_idx)(*x) + return x + + +def recursive_setattr(m, attr, value, assert_type=None, type_filter=None): + """ + Recursively set attributes on a pytorch module or an iterable of modules. + If an assert_type is provided, it will assert that the type of the value is the same as the assert_type. + If a type_filter is provided, it will only set attributes on modules that match that type. + """ + if assert_type is not None: + assert isinstance(value, assert_type), "Value is not the correct type." + + # if m is a list or a generator, iterate over the elements + if isinstance(m, (list, GeneratorType)): + for i in m: + recursive_setattr(i, attr, value, assert_type, type_filter) + elif isinstance(m, torch.nn.Module): + if hasattr(m, attr): + if type_filter is None or isinstance(m, type_filter): + setattr(m, attr, value) + if hasattr(m, "children"): + recursive_setattr(m.children(), attr, value, assert_type, type_filter) + + +def _set_use_cache(modules, value: bool): + """ + Recursively sets an use_cache to `value` on a list of pytorch modules, if they have a use_cache attribute. + use_cache is used to decide whether we cache past key value activations or not in inference. 
+ """ + recursive_setattr(modules, "use_cache", value, assert_type=bool) + + +def configure_sparse_attention(neox_args, attention_type, num_attention_heads, mpu): + from deepspeed.ops.sparse_attention import ( + SparseSelfAttention, + VariableSparsityConfig, + FixedSparsityConfig, + BigBirdSparsityConfig, + BSLongformerSparsityConfig, + ) + from deepspeed.ops.sparse_attention.sparsity_config import ( + LocalSlidingWindowSparsityConfig, + ) + + if attention_type == "sparse_fixed": + # you can think of local window size as `block_size` * `num_local_blocks`. + # so if you wanted to set a local window size of 256, set block size to 16 and `num_local_blocks` to 16 + sparsity_config = FixedSparsityConfig( + num_heads=num_attention_heads, + block=neox_args.sparsity_config.get("block", 16), + different_layout_per_head=neox_args.sparsity_config.get( + "different_layout_per_head", False + ), + num_local_blocks=neox_args.sparsity_config.get("num_local_blocks", 4), + num_global_blocks=neox_args.sparsity_config.get("num_global_blocks", 1), + num_different_global_patterns=neox_args.sparsity_config.get( + "num_different_global_patterns", 1 + ), + attention="unidirectional", + horizontal_global_attention=False, + ) + elif attention_type == "sparse_variable": + sparsity_config = VariableSparsityConfig( + num_heads=num_attention_heads, + block=neox_args.sparsity_config.get("block", 16), + different_layout_per_head=neox_args.sparsity_config.get( + "different_layout_per_head", False + ), + num_random_blocks=neox_args.sparsity_config.get("num_random_blocks", 0), + local_window_blocks=neox_args.sparsity_config.get( + "local_window_blocks", [4] + ), + global_block_indices=neox_args.sparsity_config.get( + "global_block_indices", [0] + ), + global_block_end_indices=neox_args.sparsity_config.get( + "global_block_end_indices", None + ), + attention="unidirectional", + horizontal_global_attention=False, + ) + elif attention_type == "local": + # can configure with `num_local_blocks` or `num_sliding_window_blocks` + num_local_blocks = neox_args.sparsity_config.get( + "num_local_blocks", + neox_args.sparsity_config.get("num_sliding_window_blocks", 4), + ) + sparsity_config = LocalSlidingWindowSparsityConfig( + num_heads=num_attention_heads, + block=neox_args.sparsity_config.get("block", 16), + num_sliding_window_blocks=num_local_blocks, + attention="unidirectional", + ) + elif attention_type == "bigbird": + sparsity_config = BigBirdSparsityConfig( + num_heads=num_attention_heads, + block=neox_args.sparsity_config.get("block", 16), + different_layout_per_head=neox_args.sparsity_config.get( + "different_layout_per_head", False + ), + num_random_blocks=neox_args.sparsity_config.get("num_random_blocks", 1), + num_sliding_window_blocks=neox_args.sparsity_config.get( + "num_sliding_window_blocks", 3 + ), + num_global_blocks=neox_args.sparsity_config.get("num_global_blocks", 1), + attention="unidirectional", + ) + elif attention_type == "bslongformer": + sparsity_config = BSLongformerSparsityConfig( + num_heads=num_attention_heads, + block=neox_args.sparsity_config.get("block", 16), + different_layout_per_head=neox_args.sparsity_config.get( + "different_layout_per_head", False + ), + num_sliding_window_blocks=neox_args.sparsity_config.get( + "num_sliding_window_blocks", 3 + ), + global_block_indices=neox_args.sparsity_config.get( + "global_block_indices", [0] + ), + global_block_end_indices=neox_args.sparsity_config.get( + "global_block_end_indices", None + ), + attention="unidirectional", + ) + else: + raise 
ValueError(f"Attention type {attention_type} not recognized") + return SparseSelfAttention( + sparsity_config=sparsity_config, + max_seq_length=neox_args.seq_length, + attn_mask_mode="add", + mpu=mpu, + ) + + +def get_fusion_type(neox_args): + fusion_type = SoftmaxFusionTypes.none + if neox_args.scaled_upper_triang_masked_softmax_fusion: + fusion_type = SoftmaxFusionTypes.upper_triang + elif neox_args.scaled_masked_softmax_fusion: + fusion_type = SoftmaxFusionTypes.general + return fusion_type diff --git a/benchmarks/sizing/megatron/model/word_embeddings.py b/benchmarks/sizing/megatron/model/word_embeddings.py new file mode 100644 index 0000000..16b660d --- /dev/null +++ b/benchmarks/sizing/megatron/model/word_embeddings.py @@ -0,0 +1,256 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import math +from torch.nn.parameter import Parameter + +from megatron import mpu +from megatron.model.positional_embeddings import SinusoidalPositionalEmbedding +from megatron.model.init_functions import get_init_methods + +import time + +class Embedding(torch.nn.Module): + """Language model embeddings. + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__( + self, + neox_args, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + init_method, + num_tokentypes=0, + use_pos_emb=True, + ): + super(Embedding, self).__init__() + + self.hidden_size = hidden_size + self.init_method = init_method + self.num_tokentypes = num_tokentypes + self.use_mup = neox_args.use_mup + self.mup_embedding_mult = neox_args.mup_embedding_mult + self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + neox_args=neox_args, + num_embeddings=vocab_size, + embedding_dim=self.hidden_size, + init_method=self.init_method, + ) + self._word_embeddings_key = "word_embeddings" + + if neox_args.use_bnb_optimizer: + try: + import bitsandbytes as bnb + + self.embedding_module = bnb.nn.StableEmbedding + except ModuleNotFoundError: + print( + "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." + ) + raise Exception + else: + self.embedding_module = torch.nn.Embedding + + # Position embedding (serial). + self.use_pos_emb = use_pos_emb + if self.use_pos_emb: + self.embedding_type = neox_args.pos_emb + if self.embedding_type == "learned": + self.position_embeddings = self.embedding_module( + max_sequence_length, self.hidden_size + ) + self._position_embeddings_key = "position_embeddings" + # Initialize the position embeddings. 
+ self.init_method(self.position_embeddings.weight) + elif self.embedding_type == "sinusoidal": + self.position_embeddings = SinusoidalPositionalEmbedding( + self.hidden_size + ) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = "tokentype_embeddings" + if self.num_tokentypes > 0: + self.tokentype_embeddings = self.embedding_module( + self.num_tokentypes, self.hidden_size + ) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.opt_pos_emb_offset = neox_args.opt_pos_emb_offset + + # For ticking position ids forward + self.layer_past = None + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception("tokentype embeddings is already initialized") + if torch.distributed.get_rank() == 0: + print( + "adding embedding for {} tokentypes".format(num_tokentypes), flush=True + ) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = self.embedding_module( + num_tokentypes, self.hidden_size + ) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + if self.use_pos_emb and self.embedding_type in ["learned", "sinusoidal"]: + if self.opt_pos_emb_offset: + if self.layer_past is not None: + position_ids = position_ids + self.layer_past + 1 + self.layer_past = position_ids[:, -1] + # OPT always adds 2 for some reason, according to the HF implementation + position_ids = position_ids + self.opt_pos_emb_offset + position_embeddings = self.position_embeddings(position_ids) + position_embeddings.mul_(self.mup_rp_embedding_mult) + embeddings = words_embeddings + position_embeddings + else: + embeddings = words_embeddings + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Dropout. + embeddings = self.embedding_dropout(embeddings) + + if self.use_mup: + with torch.no_grad(): + embeddings.mul_(self.mup_embedding_mult) + + + return embeddings + + +class EmbeddingPipe(Embedding): + """Extends Embedding to forward attention_mask through the pipeline.""" + + @property + def word_embeddings_weight(self): + """Easy accessory for the pipeline engine to tie embeddings across stages.""" + return self.word_embeddings.weight + + def forward(self, args): + assert ( + len(args) == 3 + ), f"Expected 3 arguments (input_ids, position_ids, attention_mask), but got {len(args)}." 
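+        # CUDA kernel launches are asynchronous, so the embedding lookup is
+        # bracketed with torch.cuda.synchronize() below to make the wall-clock
+        # latency that gets printed meaningful.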
+        torch.cuda.synchronize()
+        start_time = time.time()
+        input_ids = args[0]
+        position_ids = args[1]
+        attention_mask = args[2]
+        embeddings = super().forward(input_ids, position_ids)
+        torch.cuda.synchronize()
+        latency = time.time() - start_time
+        print(f"Embedding layer latency: {latency}")
+        return embeddings, attention_mask
+
+
+class SoftEmbedding(torch.nn.Module):
+    def __init__(
+        self,
+        neox_args,
+        wte,
+        n_tokens: int = 10,
+        init_range: float = 0.5,
+        init_string: str = "",
+    ):
+        super(SoftEmbedding, self).__init__()
+        self.n_tokens = n_tokens
+        self.neox_args = neox_args
+        self.init_range = init_range
+        self.init_string = init_string
+        self.soft_embedding_weight = torch.nn.parameter.Parameter(
+            self.initialize_embedding(wte)
+        )
+
+    def initialize_embedding(self, wte):
+        # `wte` is the word embedding module whose weights seed the soft prompt.
+        if self.init_string:
+            embeds = torch.LongTensor(
+                self.neox_args.tokenizer.tokenize(self.init_string)
+            ).to(wte.weight.device)
+            embeds = wte(embeds)
+            if embeds.shape[0] >= self.n_tokens:
+                embeds = embeds[: self.n_tokens, :]  # slice
+            else:
+                embeds = embeds.repeat(math.ceil(self.n_tokens / embeds.shape[0]), 1)[
+                    : self.n_tokens, :
+                ]  # pad up to n_tokens
+            return embeds
+        return torch.Tensor(self.n_tokens, self.neox_args.hidden_size).uniform_(
+            -self.init_range, self.init_range
+        )
+
+    def forward(self, args: tuple):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        in_inference = len(args) == 3  # embeddings, layer_past, attention_mask
+        in_train = len(args) == 2  # embeddings, attention_mask
+        if in_train:
+            embedding, attention_mask = args
+        else:
+            embedding, layer_past, attention_mask = args
+        soft_embedding = self.soft_embedding_weight.repeat(
+            embedding.shape[0], 1, 1
+        )  # repeat batch_size times
+        if in_train:
+            # append soft embedding at the beginning in training
+            embedding = torch.cat((soft_embedding, embedding), dim=1)
+            embedding = embedding[:, : self.neox_args.seq_length, ...]
+            torch.cuda.synchronize()
+            latency = time.time() - start_time
+            print(f"Embedding layer latency: {latency}")
+            return embedding, attention_mask
+        else:
+            if not (layer_past is not None and layer_past.numel() > 0):
+                # if in inference, on the first forward pass, we want to do the same as in training (append soft embedding)
+                embedding = torch.cat((soft_embedding, embedding), dim=1)
+                embedding = embedding[:, : self.neox_args.seq_length, ...]
+            # otherwise, we're in incremental mode, and just want to forward the single embedding (since the soft prompt has already been cached)
+            torch.cuda.synchronize()
+            latency = time.time() - start_time
+            print(f"Embedding layer latency: {latency}")
+            return embedding, layer_past, attention_mask
diff --git a/benchmarks/sizing/megatron/mpu/__init__.py b/benchmarks/sizing/megatron/mpu/__init__.py
new file mode 100644
index 0000000..611d2ad
--- /dev/null
+++ b/benchmarks/sizing/megatron/mpu/__init__.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
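+
+# Illustrative usage sketch (an assumption about typical call order, not code
+# taken from the benchmark scripts themselves): once torch.distributed has been
+# initialized, the interface below is usually driven as
+#
+#     from megatron import mpu
+#     mpu.initialize_model_parallel(model_parallel_size=1)
+#     rank = mpu.get_model_parallel_rank()
+#     world_size = mpu.get_model_parallel_world_size()
+#
+# before any of the parallel layers exported here are constructed.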
+ +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy + +from .data import broadcast_data + +from .initialize import is_unitialized +from .initialize import destroy_model_parallel +from .initialize import get_data_parallel_group +from .initialize import get_data_parallel_rank +from .initialize import get_data_parallel_world_size +from .initialize import get_model_parallel_group +from .initialize import get_model_parallel_rank, set_model_parallel_rank +from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank +from .initialize import get_model_parallel_world_size, set_model_parallel_world_size +from .initialize import get_topology +from .initialize import get_pipe_parallel_group +from .initialize import get_pipe_parallel_rank +from .initialize import get_pipe_parallel_world_size +from .initialize import get_io_parallel_group +from .initialize import initialize_model_parallel +from .initialize import model_parallel_is_initialized + +from .layers import ColumnParallelLinear +from .layers import RowParallelLinear +from .layers import VocabParallelEmbedding +from .layers import ParallelRelativePositionBias + +from .mappings import copy_to_model_parallel_region +from .mappings import gather_from_model_parallel_region +from .mappings import reduce_from_model_parallel_region +from .mappings import scatter_to_model_parallel_region + +from .random import checkpoint +from .random import get_cuda_rng_tracker +from .random import model_parallel_cuda_manual_seed + +from .utils import divide +from .utils import split_tensor_along_last_dim diff --git a/benchmarks/sizing/megatron/mpu/cross_entropy.py b/benchmarks/sizing/megatron/mpu/cross_entropy.py new file mode 100644 index 0000000..d28e0cc --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/cross_entropy.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch + +from .initialize import get_model_parallel_group +from .initialize import get_model_parallel_rank +from .initialize import get_model_parallel_world_size +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] + torch.distributed.all_reduce( + logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group(), + ) + # Subtract the maximum value. 
+ vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) + + # Get the partition's vocab indices + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_model_parallel_rank() + world_size = get_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range( + partition_vocab_size, rank, world_size + ) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | (target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange( + start=0, end=logits_2d.size()[0], device=logits_2d.device + ) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits_1d = predicted_logits_1d.clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group(), + ) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = vocab_parallel_logits + torch.exp(vocab_parallel_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group(), + ) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Store softmax, target-mask and masked-target for backward pass. + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retrieve tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """Helper function for the cross entropy.""" + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/benchmarks/sizing/megatron/mpu/data.py b/benchmarks/sizing/megatron/mpu/data.py new file mode 100644 index 0000000..7b93705 --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/data.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .initialize import get_model_parallel_rank +from .initialize import get_model_parallel_src_rank + + +_MAX_DATA_DIM = 4 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert ( + data[key].dtype == target_dtype + ), "{} has data type {} which " "is different than {}".format( + key, data[key].dtype, target_dtype + ) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM" + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, get_model_parallel_src_rank(), group=get_model_parallel_group() + ) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data dictionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) + + # Pack on rank zero. + if get_model_parallel_rank() == 0: + # Check that all keys have the same data type. 
+ _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat( + [data[key].contiguous().view(-1) for key in keys], dim=0 + ).cuda() + else: + flatten_data = torch.empty( + total_numel, device=torch.cuda.current_device(), dtype=datatype + ) + + # Broadcast + torch.distributed.broadcast( + flatten_data, get_model_parallel_src_rank(), group=get_model_parallel_group() + ) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/benchmarks/sizing/megatron/mpu/initialize.py b/benchmarks/sizing/megatron/mpu/initialize.py new file mode 100644 index 0000000..325e46b --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/initialize.py @@ -0,0 +1,292 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Model and data parallel groups.""" + +import torch + +from .utils import ensure_divisibility + +# Model parallel group that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. +_DATA_PARALLEL_GROUP = None +# Pipeline parallel group that the current rank belongs to. +_PIPE_PARALLEL_GROUP = None + +# A group used to sync during the IO process. Usually this is data_parallel_group(), +# but with pipeline parallelism it must also involve the last stage (which is not in the +# DP group of rank 0) +_IO_PARALLEL_GROUP = None + +# These values enable us to change the mpu sizes on the fly. +_MPU_WORLD_SIZE = None +_MPU_RANK = None + +# Used to query 3D topology +_MPU_TOPOLOGY = None + +# Get fp32_allreduce flag +_FP32_ALLREDUCE = None + + +def is_unitialized(): + """Useful for code segments that may be accessed with or without mpu initialization""" + return _DATA_PARALLEL_GROUP is None + + +def initialize_model_parallel(model_parallel_size, topology=None, fp32_allreduce=False): + """ + Initialize model data parallel groups. + + Arguments: + model_parallel_size: number of GPUs used to parallelize model. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model. The present function will + create 4 model parallel groups and 2 data parallel groups as: + 4 model parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 data parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + if torch.distributed.get_rank() == 0: + print("> initializing model parallel with size {}".format(model_parallel_size)) + # Get world size and rank. 
Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + if world_size < model_parallel_size: + raise ValueError("world size cannot be smaller than model parallel size") + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + + global _MPU_TOPOLOGY + if topology: + _MPU_TOPOLOGY = topology + + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, "data parallel group is already initialized" + if topology: + for dp_group in topology.get_axis_comm_lists("data"): + group = torch.distributed.new_group(ranks=dp_group) + if rank == 0: + print(f"MPU DP:", dp_group) + if rank in dp_group: + _DATA_PARALLEL_GROUP = group + else: + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build pipeline parallel group + if topology is not None: + global _PIPE_PARALLEL_GROUP + for pp_group in topology.get_axis_comm_lists("pipe"): + group = torch.distributed.new_group(ranks=pp_group) + if rank == 0: + print(f"MPU PP:", pp_group) + if rank in pp_group: + _PIPE_PARALLEL_GROUP = group + + # Build IO group + global _IO_PARALLEL_GROUP + if topology and topology.get_dim("pipe") > 1: + io_stages = [0, topology.get_dim("pipe") - 1] + io_group = [] + for stage in io_stages: + io_group.extend(topology.filter_match(pipe=stage, model=0)) + if rank == 0: + print(f"MPU IO:", io_group) + group = torch.distributed.new_group(ranks=io_group) + if rank in io_group: + _IO_PARALLEL_GROUP = group + else: + _IO_PARALLEL_GROUP = get_data_parallel_group() + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, "model parallel group is already initialized" + if topology: + # Short circuit case without model parallelism. + # TODO: it would be nice to avoid this branching case? 
+ if model_parallel_size == 1: + for group_rank in range(world_size): + group = torch.distributed.new_group(ranks=[group_rank]) + if rank == 0: + print(f"MPU MP:", [group_rank]) + if rank == group_rank: + _MODEL_PARALLEL_GROUP = group + return + + for mp_group in topology.get_axis_comm_lists("model"): + group = torch.distributed.new_group(ranks=mp_group) + if rank == 0: + print(f"MPU MP:", mp_group) + if rank in mp_group: + _MODEL_PARALLEL_GROUP = group + + else: + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + global _FP32_ALLREDUCE + assert _FP32_ALLREDUCE is None, "fp32_allreduce is already initialized" + _FP32_ALLREDUCE = fp32_allreduce + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, "model parallel group is not initialized" + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized" + return _DATA_PARALLEL_GROUP + + +def get_io_parallel_group(): + """Get the IO parallel group the caller rank belongs to.""" + assert _IO_PARALLEL_GROUP is not None, "IO parallel group is not initialized" + return _IO_PARALLEL_GROUP + + +def set_model_parallel_world_size(world_size): + """Set the model parallel size""" + global _MPU_WORLD_SIZE + _MPU_WORLD_SIZE = world_size + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + global _MPU_WORLD_SIZE + if _MPU_WORLD_SIZE is not None: + return _MPU_WORLD_SIZE + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def set_model_parallel_rank(rank): + """Set model parallel rank.""" + global _MPU_RANK + _MPU_RANK = rank + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + global _MPU_RANK + if _MPU_RANK is not None: + return _MPU_RANK + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zero + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zero + in the data parallel group.""" + global_rank = torch.distributed.get_rank() + topo = get_topology() + if topo is None: + # we are just using model parallel + return global_rank % get_model_parallel_world_size() + else: + # We are using pipeline parallel + d = topo.get_axis_comm_lists("data") + for l in d: + if global_rank in l: + return l[0] + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return torch.distributed.get_rank(group=get_data_parallel_group()) + + +def get_topology(): + return _MPU_TOPOLOGY + 
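+
+# The sketch below is illustrative only: `_example_group_layout` is a
+# hypothetical helper (not referenced anywhere else) that mirrors the
+# non-topology branches of `initialize_model_parallel` above, so the
+# rank-to-group mapping can be inspected without launching torch.distributed.
+# For world_size=8 and model_parallel_size=2 it returns model-parallel groups
+# [0, 1], [2, 3], [4, 5], [6, 7] and data-parallel groups [0, 2, 4, 6],
+# [1, 3, 5, 7], matching the example in the docstring above.
+def _example_group_layout(world_size, model_parallel_size):
+    """Return (model_parallel_groups, data_parallel_groups) as lists of rank lists."""
+    model_parallel_groups = [
+        list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
+        for i in range(world_size // model_parallel_size)
+    ]
+    data_parallel_groups = [
+        list(range(i, world_size, model_parallel_size))
+        for i in range(model_parallel_size)
+    ]
+    return model_parallel_groups, data_parallel_groups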
+ +def get_pipe_parallel_group(): + """Get the pipe parallel group the caller rank belongs to.""" + assert _PIPE_PARALLEL_GROUP is not None, "data parallel group is not initialized" + return _PIPE_PARALLEL_GROUP + + +def get_pipe_parallel_rank(): + """Return my rank for the pipe parallel group.""" + return torch.distributed.get_rank(group=get_pipe_parallel_group()) + + +def get_pipe_parallel_world_size(): + """Return world size for the pipe parallel group.""" + return torch.distributed.get_world_size(group=get_pipe_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None + global _PIPE_PARALLEL_GROUP + _PIPE_PARALLEL_GROUP = None + global _IO_PARALLEL_GROUP + _IO_PARALLEL_GROUP = None + global _MPU_WORLD_SIZE + global _MPU_RANK + _MPU_WORLD_SIZE = None + _MPU_RANK = None + global _MPU_TOPOLOGY + _MPU_TOPOLOGY = None + global _FP32_ALLREDUCE + _FP32_ALLREDUCE = None + + +def get_fp32_allreduce(): + """Get the fp32 allreduce flag""" + assert _FP32_ALLREDUCE is not None, "fp32_allreduce is not Initialized" + return _FP32_ALLREDUCE diff --git a/benchmarks/sizing/megatron/mpu/layers.py b/benchmarks/sizing/megatron/mpu/layers.py new file mode 100644 index 0000000..92edbd6 --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/layers.py @@ -0,0 +1,757 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + + +import math + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.parameter import Parameter + +from .initialize import get_model_parallel_rank +from .initialize import get_model_parallel_world_size +from .mappings import copy_to_model_parallel_region +from .mappings import gather_from_model_parallel_region +from .mappings import reduce_from_model_parallel_region +from .mappings import scatter_to_model_parallel_region +from .random import get_cuda_rng_tracker +from .utils import divide +from .utils import VocabUtility +from functools import partial + + +def _initialize_affine_weight_gpu(weight, init_method, partition_dim, stride=1): + """Initialize affine weight for model parallel on GPU.""" + + weight.model_parallel = True + weight.partition_dim = partition_dim + weight.partition_stride = stride + + with get_cuda_rng_tracker().fork(): + init_method(weight) + + +def _initialize_affine_weight_cpu( + neox_args, + weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False, +): + """Initialize affine weight for model parallel. 
+ + Build the master weight on all processes and scatter + the relevant chunk.""" + + weight.model_parallel = True + weight.partition_dim = partition_dim + weight.partition_stride = stride + + # Initialize master weight + master_weight = torch.empty( + output_size, input_size, dtype=torch.float, requires_grad=False + ) + init_method(master_weight) + master_weight = master_weight.to(dtype=neox_args.params_dtype) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split( + master_weight, per_partition_per_stride_size, dim=partition_dim + ) + rank = get_model_parallel_rank() + world_size = get_model_parallel_world_size() + my_weight_list = weight_list[rank::world_size] + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__( + self, neox_args, num_embeddings, embedding_dim, init_method=init.xavier_normal_ + ): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2.0 + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + self.model_parallel_size = get_model_parallel_world_size() + # Divide the weight matrix along the vocabulary dimension. + ( + self.vocab_start_index, + self.vocab_end_index, + ) = VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_model_parallel_rank(), self.model_parallel_size + ) + self.num_embeddings_per_partition = ( + self.vocab_end_index - self.vocab_start_index + ) + self.init_method = init_method + + # Allocate weights and initialize. + if neox_args.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, + self.embedding_dim, + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, + 0, + init_method, + ) + else: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, + self.embedding_dim, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=0, stride=1 + ) + + def mup_reinitialize_weights(self, neox_args): + if neox_args.use_cpu_initialization: + _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, + 0, + partial(self.init_method, use_mup=True), + ) + else: + _initialize_affine_weight_gpu( + self.weight, + partial(self.init_method, use_mup=True), + partition_dim=0, + stride=1, + ) + + def forward(self, input_): + if self.model_parallel_size > 1: + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | ( + input_ >= self.vocab_end_index + ) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + else: + masked_input = input_ + # Get the embeddings. 
+ output_parallel = F.embedding( + masked_input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + # Mask the output embedding. + if self.model_parallel_size > 1: + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = reduce_from_model_parallel_region(output_parallel) + return output + + +class ParallelRelativePositionBias(torch.nn.Module): + """T5 Relative Position Bias parallelized in the heads dimension + + Based on https://github.com/lucidrains/x-transformers/blob/6b93c21be0d0a679da6f7b9621d9bb638ab18428/x_transformers/x_transformers.py#L106 (14.12.2021) + and adapted for megatron's model parallelism + + Arguments: + scale: scaling factor for the bias + causal: flag for causal/non-causal language modelling. + num_buckets: number of rp buckets. + max_distance: max distance in sequence dim for each bucket. + heads: number of attention heads (total) + """ + + def __init__( + self, + neox_args, + scale, + causal=True, + num_buckets=32, + max_distance=128, + heads=8, + init_method=init.xavier_normal_, + ): + super().__init__() + self.scale = scale + self.causal = causal + self.num_buckets = num_buckets + self.max_distance = max_distance + self.heads = heads + + # Set the defaults for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2.0 + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + self.model_parallel_size = get_model_parallel_world_size() + self.model_parallel_rank = get_model_parallel_rank() + + # Divide the weight matrix along the heads dimension. + self.head_start_index, self.head_end_index = self.get_heads_range( + self.heads, self.model_parallel_rank, self.model_parallel_size + ) + self.num_heads_per_partition = self.head_end_index - self.head_start_index + self.init_method = init_method + + # Allocate weights and initialize. 
+ if neox_args.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.num_buckets, + self.num_heads_per_partition, + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.num_buckets, + self.heads, + self.num_heads_per_partition, + partition_dim=1, + init_method=init_method, + ) + else: + self.weight = Parameter( + torch.empty( + self.num_buckets, + self.num_heads_per_partition, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=1, stride=1 + ) + self._q_len_cached = None + self._k_len_cached = None + self._rel_pos_bucket_cached = None + + def mup_reinitialize_weights(self, neox_args): + if self.use_cpu_initialization: + _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.num_buckets, + self.heads, + self.num_heads_per_partition, + partition_dim=1, + init_method=partial(self.init_method, use_mup=True), + ) + else: + _initialize_affine_weight_gpu( + self.weight, + partial(self.init_method, use_mup=True), + partition_dim=1, + stride=1, + ) + + @staticmethod + def get_heads_range(global_n_heads, rank, world_size): + per_partition_n_heads = divide(global_n_heads, world_size) + index_f = rank * per_partition_n_heads + index_l = index_f + per_partition_n_heads + return index_f, index_l + + def _relative_position_bucket( + self, relative_position, num_buckets=32, max_distance=128 + ): + ret = 0 + n = -relative_position + if not self.causal: + num_buckets //= 2 + ret += (n < 0).long() * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + + max_exact = num_buckets // 2 + is_small = n < max_exact + + val_if_large = ( + max_exact + + ( + torch.log(n.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).long() + ) + val_if_large = torch.min( + val_if_large, torch.full_like(val_if_large, num_buckets - 1) + ) + + ret += torch.where(is_small, n, val_if_large) + self._rel_pos_bucket_cached = ret + return self._rel_pos_bucket_cached + + def forward(self, q_len, k_len): + if self._q_len_cached != q_len or self._k_len_cached != k_len: + # cache bucket if first step seq len stays constant + self._q_len_cached, self._k_len_cached = q_len, k_len + q_pos = torch.arange( + q_len, dtype=torch.long, device=torch.cuda.current_device() + ) + k_pos = torch.arange( + k_len, dtype=torch.long, device=torch.cuda.current_device() + ) + rel_pos = k_pos[None, :] - q_pos[:, None] + rp_bucket = self._relative_position_bucket( + rel_pos, num_buckets=self.num_buckets, max_distance=self.max_distance + ) + else: + rp_bucket = self._rel_pos_bucket_cached + values = F.embedding( + rp_bucket, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + bias = values.movedim(2, 0).unsqueeze(0) + return bias * self.scale + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. 
+ keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimations where bias + can be fused with other elementwise operations. we skip + adding bias but instead return it. + """ + + def __init__( + self, + neox_args, + input_size, + output_size, + bias=True, + gather_output=True, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + mup_rescale_parameters=False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.output_size_per_partition = divide(output_size, world_size) + self.skip_bias_add = skip_bias_add + self.init_method = init_method + self.stride = stride + self.mup_rescale_parameters = mup_rescale_parameters + self.use_mup = neox_args.use_mup + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if neox_args.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + dtype=neox_args.params_dtype, + ) + ) + self.master_weight = _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=0, stride=stride + ) + + if bias: + if neox_args.use_cpu_initialization: + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, dtype=neox_args.params_dtype + ) + ) + else: + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + self.bias.model_parallel = True + self.bias.partition_dim = 0 + self.bias.stride = stride + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter("bias", None) + + # Copied from Mup + def width_mult(self): + assert hasattr(self.weight, "infshape"), ( + "Please call set_base_shapes(...). If using torch.nn.DataParallel, " + "switch to distributed training with " + "torch.nn.parallel.DistributedDataParallel instead" + ) + return self.weight.infshape.width_mult() + + # Copied from Mup + def _rescale_parameters(self): + """Rescale parameters to convert SP initialization to μP initialization. + Warning: This method is NOT idempotent and should be called only once + unless you know what you are doing. + """ + if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: + raise RuntimeError( + "`_rescale_parameters` has been called once before already. 
" + "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" + "If you called `set_base_shapes` on a model loaded from a checkpoint, " + "or just want to re-set the base shapes of an existing model, " + "make sure to set the flag `rescale_params=False`.\n" + "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." + ) + if self.bias is not None: + self.bias.data *= self.width_mult() ** 0.5 + self.weight.data *= self.width_mult() ** 0.5 + self._has_rescaled_params = True + + def mup_reinitialize_weights(self, neox_args): + if neox_args.use_cpu_initialization: + self.master_weight = _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + partial(self.init_method, use_mup=True), + stride=self.stride, + return_master_weight=keep_master_weight_for_test, + ) + else: + _initialize_affine_weight_gpu( + self.weight, + partial(self.init_method, use_mup=True), + partition_dim=0, + stride=self.stride, + ) + + def set_parallel_output(self, value: bool): + assert isinstance(value, bool) + self.gather_output = ( + not value + ) # if gather_output is True, parallel output is False, so we set the opposite + + def forward(self, input_): + if self.use_mup and self.mup_rescale_parameters: + input_ /= self.width_mult() + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + + bias = self.bias if not self.skip_bias_add else None + output_parallel = F.linear(input_parallel, self.weight, bias) + if self.gather_output: + # All-gather across the partitions. + output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimations where bias + can be fused with other elementwise operations. we skip + adding bias but instead return it. + """ + + def __init__( + self, + neox_args, + input_size, + output_size, + bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + parallel_output=False, + mup_rescale_parameters=False, + ): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. 
+ world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + self.skip_bias_add = skip_bias_add + self.parallel_output = parallel_output + self.init_method = init_method + self.stride = stride + self.keep_master_weight_for_test = keep_master_weight_for_test + self.mup_rescale_parameters = mup_rescale_parameters + self.use_mup = neox_args.use_mup + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if neox_args.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + dtype=neox_args.params_dtype, + ) + ) + self.master_weight = _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=1, stride=stride + ) + if bias: + if neox_args.use_cpu_initialization: + self.bias = Parameter( + torch.empty(self.output_size, dtype=neox_args.params_dtype) + ) + else: + self.bias = Parameter( + torch.empty( + self.output_size, + device=torch.cuda.current_device(), + dtype=neox_args.params_dtype, + ) + ) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter("bias", None) + + # Copied from Mup + def width_mult(self): + assert hasattr(self.weight, "infshape"), ( + "Please call set_base_shapes(...). If using torch.nn.DataParallel, " + "switch to distributed training with " + "torch.nn.parallel.DistributedDataParallel instead" + ) + return self.weight.infshape.width_mult() + + # Copied from Mup + def _rescale_parameters(self): + """Rescale parameters to convert SP initialization to μP initialization. + Warning: This method is NOT idempotent and should be called only once + unless you know what you are doing. + """ + if hasattr(self, "_has_rescaled_params") and self._has_rescaled_params: + raise RuntimeError( + "`_rescale_parameters` has been called once before already. " + "Unless you know what you are doing, usually you should not be calling `_rescale_parameters` more than once.\n" + "If you called `set_base_shapes` on a model loaded from a checkpoint, " + "or just want to re-set the base shapes of an existing model, " + "make sure to set the flag `rescale_params=False`.\n" + "To bypass this error and *still rescale parameters*, set `self._has_rescaled_params=False` before this call." 
+ ) + if self.bias is not None: + self.bias.data *= self.width_mult() ** 0.5 + self.weight.data *= self.width_mult() ** 0.5 + self._has_rescaled_params = True + + def mup_reinitialize_weights(self, neox_args): + if neox_args.use_cpu_initialization: + self.master_weight = _initialize_affine_weight_cpu( + neox_args, + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + partial(self.init_method, use_mup=True), + stride=self.stride, + return_master_weight=self.keep_master_weight_for_test, + ) + else: + _initialize_affine_weight_gpu( + self.weight, + partial(self.init_method, use_mup=True), + partition_dim=1, + stride=self.stride, + ) + + def set_parallel_output(self, parallel_output: bool): + assert isinstance(parallel_output, bool) + self.parallel_output = parallel_output + + def forward(self, input_): + if self.use_mup and self.mup_rescale_parameters: + input_ /= self.width_mult() + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight) + # All-reduce across all the partitions. + if not self.parallel_output: + output_ = reduce_from_model_parallel_region(output_parallel) + else: + output_ = output_parallel + if not self.skip_bias_add: + output = output_ + self.bias if self.bias is not None else output_ + output_bias = None + else: + output = output_ + output_bias = self.bias + return output, output_bias diff --git a/benchmarks/sizing/megatron/mpu/mappings.py b/benchmarks/sizing/megatron/mpu/mappings.py new file mode 100644 index 0000000..c14693a --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/mappings.py @@ -0,0 +1,192 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import ( + get_model_parallel_group, + get_model_parallel_world_size, + get_model_parallel_rank, + get_fp32_allreduce, +) +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + + # Bypass the function if we are using only 1 GPU. + if get_model_parallel_world_size() == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and get_fp32_allreduce(): + input_ = input_.float() + + # All-reduce. + torch.distributed.all_reduce(input_, group=get_model_parallel_group()) + + # Bf16 convert + if dt == torch.bfloat16 and get_fp32_allreduce(): + input_ = input_.bfloat16() + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and get_fp32_allreduce(): + input_ = input_.float() + + # Split along last dimension. + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_model_parallel_rank() + output = input_list[rank].contiguous() + + # Bf16 convert + if dt == torch.bfloat16 and get_fp32_allreduce(): + output = output.bfloat16() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and get_fp32_allreduce(): + input_ = input_.float() + + # Size and dimension. + last_dim = input_.dim() - 1 + rank = get_model_parallel_rank() + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=get_model_parallel_group()) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + # Bf16 convert + if dt == torch.bfloat16 and get_fp32_allreduce(): + output = output.bfloat16() + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return input_ + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-reduce the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def symbolic(graph, input_): + return _split(input_) + + @staticmethod + def forward(ctx, input_): + return _split(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def symbolic(graph, input_): + return _gather(input_) + + @staticmethod + def forward(ctx, input_): + return _gather(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output) + + +# ----------------- +# Helper functions. 
+# ----------------- + + +def copy_to_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) diff --git a/benchmarks/sizing/megatron/mpu/random.py b/benchmarks/sizing/megatron/mpu/random.py new file mode 100644 index 0000000..f93a912 --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/random.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports +# TODO: should be able to get rid of this file entirely + +import deepspeed +import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = ( + deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME +) + +# Whether apply model parallelsim to checkpointed hidden states. +_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER + +# Deepspeed checkpointing functions +# TODO: replace calls to these in our codebase with calls to the deepspeed ones +_set_cuda_rng_state = checkpointing._set_cuda_rng_state +checkpoint = checkpointing.checkpoint +model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed +get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker diff --git a/benchmarks/sizing/megatron/mpu/utils.py b/benchmarks/sizing/megatron/mpu/utils.py new file mode 100644 index 0000000..cb12f25 --- /dev/null +++ b/benchmarks/sizing/megatron/mpu/utils.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
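+
+# Illustrative usage of the helpers defined below (a sketch only, not executed
+# anywhere in this module; the tensor shapes and the world size of 4 are assumptions):
+#
+#   hidden = torch.randn(8, 1024, 4096)
+#   chunks = split_tensor_along_last_dim(hidden, 4)   # 4 views of shape (8, 1024, 1024)
+#   per_rank = divide(4096, 4)                        # 1024; asserts divisibility first
+#   lo, hi = VocabUtility.vocab_range_from_global_vocab_size(50304, rank=1, world_size=4)
+#   # -> (12576, 25152), the vocabulary slice owned by model-parallel rank 1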
+
+
+import torch
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(
+        numerator, denominator
+    )
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
+    """Split a tensor along its last dimension.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor into.
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks and return the
+    first and last index of the vocabulary belonging to the `rank`
+    partition. Note that indices are in the half-open range [first, last)."""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size, rank, world_size
+    ):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size
+        )
diff --git a/benchmarks/sizing/megatron/mup_substitute.py b/benchmarks/sizing/megatron/mup_substitute.py
new file mode 100644
index 0000000..e16a215
--- /dev/null
+++ b/benchmarks/sizing/megatron/mup_substitute.py
@@ -0,0 +1,212 @@
+"""
+Helper functions for performing coord check.
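+
+Illustrative call pattern (a sketch only; `build_model` and the NeoX objects
+passed in are placeholders supplied by the caller):
+
+    models = {
+        256: lambda: build_model(hidden_size=256),
+        512: lambda: build_model(hidden_size=512),
+    }
+    df = get_coord_data(
+        neox_args, timers, lr_scheduler, models, train_data_iterator,
+        optimizer="adam", mup=True, nsteps=3, nseeds=2,
+    )
+    # df is a pandas DataFrame with columns 'width', 'module', 't' plus the
+    # recorded statistics (e.g. 'l1'), ready for mup's coord-check plotting.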
+""" +import os +from copy import copy +from itertools import product + +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F + +from mup import coord_check as mup_coord_check +from megatron.training import train_step + + +def _get_coord_data( + neox_args, + timers, + lr_scheduler, + models, + dataloader, + optcls, + nsteps=3, + dict_in_out=False, + flatten_input=False, + flatten_output=False, + output_name="loss", + lossfn="xent", + filter_module_by_name=None, + fix_data=True, + cuda=True, + nseeds=1, + output_fdict=None, + input_fdict=None, + param_fdict=None, + show_progress=True, + one_hot_target=False, +): + df = [] + + for i in range(nseeds): + torch.manual_seed(i) + for width, model in models.items(): + model = model() + model.train() + optimizer = optcls(model) + for step in range(nsteps + 1): + remove_hooks = [] + # add hooks + for name, module in model.named_modules(): + if filter_module_by_name and not filter_module_by_name(name): + continue + remove_hooks.append( + module.register_forward_hook( + mup_coord_check._record_coords( + df, + width, + name, + step + 1, + output_fdict=output_fdict, + input_fdict=input_fdict, + param_fdict=param_fdict, + ) + ) + ) + + # train for a step + loss_dict, skipped_iter = train_step( + neox_args=neox_args, + timers=timers, + data_iterator=dataloader, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + # remove hooks + for handle in remove_hooks: + handle.remove() + + import gc + + del model + gc.collect() + + return pd.DataFrame(df) + + +def get_coord_data( + neox_args, + timers, + lr_scheduler, + models, + dataloader, + optimizer="sgd", + lr=None, + mup=True, + filter_trainable_by_name=None, + **kwargs +): + """Get coord data for coord check. + Train the models in `models` with data from `dataloader` and optimizer + specified by `optimizer` and `lr` for `nsteps` steps, and record coordinate + statistics specified by `output_fdict`, `input_fdict`, `param_fdict`. By + default, only `l1` is computed for output activations of each module. + This function wraps around `_get_coord_data`, with the main difference being + user can specify common optimizers via a more convenient interface. + Inputs: + models: + a dict of lazy models, where the keys are numbers indicating width. + Each entry of `models` is a function that instantiates a model given + nothing. + dataloader: + an iterator whose elements are either Huggingface style dicts, if + `dict_in_out` is True, or (input, label). If `fix_data` is True + (which is the default), then only the first element of `dataloader` + is used in a loop and the rest of `dataloder` is ignored. + optimizer: + a string in `['sgd', 'adam', 'adamw']`, with default being `'sgd'`. + lr: + learning rate. By default is 0.1 for `'sgd'` and 1e-3 for others. + mup: + If True, then use the optimizer from `mup.optim`; otherwise, use the + one from `torch.optim`. + filter_trainable_by_name: + a function that returns a bool given module names (from + `model.named_modules()`), or None. If not None, then only modules + whose name yields True will be trained. + nsteps: + number of steps to train the model + dict_in_out: + whether the data loader contains Huggingface-style dict input and + output. Default: False + flatten_input: + if not `dict_in_out`, reshape the input to be + `input.view(input.shape[0], -1)`. Typically used for testing MLPs. + flatten_output: + if not `dict_in_out`, reshape the label to be `label.view(-1, + input.shape[-1])`. 
+ output_name: + if `dict_in_out`, this is the key for the loss value if the output + is a dict. If the output is not a dict, then we assume the first + element of the output is the loss. + lossfn: + loss function to use if not `dict_in_out`. Can be either a string from + [`xent`, 'mse', 'nll', 'l1'] or a python `callable` such that + `lossfn(output, target)` returns the loss value. Examples of valid + `callable`s are `F.cross_entropy`, `F.mse_loss`, etc, where `F` is + `torch.nn.functional`. Default: 'xent' + filter_module_by_name: + a function that returns a bool given module names (from + `model.named_modules()`), or None. If not None, then only modules + whose name yields True will be recorded. + cuda: + whether to use cuda or not. Default: True + nseeds: + number of times to repeat the training, each with different seeds. + output_fdict, input_fdict, param_fdict: + function dicts to be used in `_record_coords`. By default, only `l1` + is computed for output activations of each module. + show_progress: + show progress using tqdm. Default: True + one_hot_target: + convert target label into a one-hot vector. This typically is only + used for `'mse'` or `'l1'` losses in classification tasks. + Default: False + Output: + a pandas DataFrame containing recorded results. The column names are + `'width', 'module', 't'` as well as names of statistics recorded, such + as `'l1'` (see `FDICT` for other premade statistics that can be + collected). + + Breaking Changes: + In v1.0.0, when `lossfn=='mse'`, the target is automatically converted + to a one hot vector before loss computation. Starting in v1.1.0, this + behavior is turned off, and the user needs to explicitly turn on this + behavior by setting `one_hot_target=True`. + """ + if lr is None: + lr = 0.1 if optimizer == "sgd" else 1e-3 + if mup: + from mup.optim import MuAdam as Adam + from mup.optim import MuAdamW as AdamW + from mup.optim import MuSGD as SGD + else: + from torch.optim import SGD, Adam, AdamW + + def get_trainable(model): + params = model.parameters() + if filter_trainable_by_name is not None: + params = [] + for name, p in model.named_parameters(): + if filter_trainable_by_name(name): + params.append(p) + return params + + if optimizer == "sgd": + optcls = lambda model: SGD(get_trainable(model), lr=lr) + elif optimizer == "adam": + optcls = lambda model: Adam(get_trainable(model), lr=lr) + elif optimizer == "adamw": + optcls = lambda model: AdamW(get_trainable(model), lr=lr) + elif optimizer is None: + raise ValueError("optimizer should be sgd|adam|adamw or a custom function") + + data = _get_coord_data( + neox_args, timers, lr_scheduler, models, dataloader, optcls, **kwargs + ) + data["optimizer"] = optimizer + data["lr"] = lr + return data diff --git a/benchmarks/sizing/megatron/neox_arguments/__init__.py b/benchmarks/sizing/megatron/neox_arguments/__init__.py new file mode 100644 index 0000000..025464c --- /dev/null +++ b/benchmarks/sizing/megatron/neox_arguments/__init__.py @@ -0,0 +1,36 @@ +""" +NeoX Arguments manages all configuration arguments. + +**general** + +* The implementation makes use of the python dataclass. +* The main class 'NeoXArgs' (in ./arguments) exposes all configuration attributes that are relevant to GPT NeoX +* No attributes are nested (apart from attributes with type dict) +* Output functions (enable_logging, save_yml, print) are implemented +* Instantiation always runs NeoXArgs.__post_init__(), which calculates derived values and performs a validation (values, types, keys). 
+* It is possible to set undefined attributes (e.g. the line of code 'NeoXArgs().my_undefined_config = 42' works fine); attributes set this way are not validated
+* It is possible to update attributes (e.g. the line of code 'NeoXArgs().do_train = True' works fine); validation can be performed by calling the validation functions on the class instance
+* In order to avoid setting undefined attributes you can use the function NeoXArgs().update_value(); this function raises an error if the attribute to be set is not defined
+
+**instantiation**
+NeoXArgs can be instantiated with the following options:
+
+* NeoXArgs.from_ymls(["path_to_yaml1", "path_to_yaml2", ...]): load yaml configuration files and instantiate with the values provided; checks for duplicate and unknown arguments are performed
+* NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from a dict; checks for unknown arguments are performed
+
+* NeoXArgs.consume_deepy_args(): entry point for deepy.py configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main
+* NeoXArgs.consume_neox_args(): in the call stack deepy.py -> deepspeed -> pretrain_gpt2.py, arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(); the arguments produced this way can be read with consume_neox_args() to instantiate a NeoXArgs instance
+
+
+**code structure**
+
+* NeoXArgs (in ./arguments) inherits from the following base classes: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
+* These base classes group args according to their purpose
+* The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included and no arguments relevant for deepspeed should be omitted
+* The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted
+* Calculated attributes (decorator '@property') are available as attributes but are not included in the dataclass fields (e.g. NeoXArgs().__dataclass_fields__.items())
+* Refer to docstrings in code for more information
+"""
+
+
+from .arguments import NeoXArgs
diff --git a/benchmarks/sizing/megatron/neox_arguments/arguments.py b/benchmarks/sizing/megatron/neox_arguments/arguments.py
new file mode 100644
index 0000000..39fc569
--- /dev/null
+++ b/benchmarks/sizing/megatron/neox_arguments/arguments.py
@@ -0,0 +1,1323 @@
+# Copyright (c) 2021, EleutherAI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
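+
+# Minimal usage sketch (illustrative only; the config paths are placeholders):
+#
+#   from megatron.neox_arguments import NeoXArgs
+#
+#   neox_args = NeoXArgs.from_ymls(["configs/125M.yml", "configs/local_setup.yml"])
+#   neox_args.build_tokenizer()
+#   ds_config = neox_args.deepspeed_config  # dict serialized for the DeepSpeed launcher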
+ +import base64 +import os +from pathlib import Path +import yaml +import json +import logging +import copy +import torch +import argparse + +from dataclasses import dataclass +from typing import List, Dict +from socket import gethostname + +try: + from typing import Literal, Union +except ImportError: + from typing_extensions import Literal, Union +from deepspeed.launcher.runner import DLTS_HOSTFILE +from megatron.logging import Tee +from megatron.tokenizer import build_tokenizer +from megatron.utils import obtain_resource_pool, expand_attention_types +from .deepspeed_args import NeoXArgsDeepspeedConfig, NeoXArgsDeepspeedRunner +from .neox_args import ( + NeoXArgsModel, + NeoXArgsTokenizer, + NeoXArgsTraining, + NeoXArgsParallelism, + NeoXArgsLogging, + NeoXArgsOther, + NeoXArgsTextgen, + NeoXArgsOptimizer, + NeoXArgsLRScheduler, + ATTENTION_TYPE_CHOICES, +) + +# ZERO defaults by deespeed +# These values should not be changed unless defaults in deepspeed are changed +# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training +ZERO_DEFAULTS = { + "stage": 0, + "allgather_partitions": True, + "reduce_scatter": True, + "allgather_bucket_size": int(5e8), + "overlap_comm": False, + "reduce_scatter": True, + "reduce_bucket_size": int(5e8), + "contiguous_gradients": False, +} + +# NeoX optimizer defaults +OPT_DEFAULT = "Adam" +OPT_PARAMS_DEFAULTS = { + "lr": 0.001, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + "weight_decay": 0, + "freeze_step": 400, + "momentum": 0.0, + "cuda_aware": False, +} + + +AUTOTUNING_ARGS = ( + "train_batch_size", + "train_micro_batch_size_per_gpu", + "gradient_accumulation_steps", + "zero_optimization", + "autotuning", +) + +BASE_CLASSES = [ + NeoXArgsDeepspeedRunner, + NeoXArgsDeepspeedConfig, + NeoXArgsModel, + NeoXArgsLRScheduler, + NeoXArgsOptimizer, + NeoXArgsTokenizer, + NeoXArgsTraining, + NeoXArgsParallelism, + NeoXArgsLogging, + NeoXArgsTextgen, + NeoXArgsOther, +] + +DEEPSPEED_ARG_CLASSES = [NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig] +NEOX_ARG_CLASSES = [i for i in BASE_CLASSES if i not in DEEPSPEED_ARG_CLASSES] + +if "DLTS_HOSTFILE" in os.environ: + DLTS_HOSTFILE = os.environ["DLTS_HOSTFILE"] + + +@dataclass +class NeoXArgs(*BASE_CLASSES): + """ + data class containing all configurations + + NeoXArgs inherits from a number of small configuration classes + """ + + ############################################################################################################################ + # start of instantiation + + def __post_init__(self): + """ + after initialization of default or loaded values + a number of functions are performed in order to + calculate values, assert consistency and do typechecking. 
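+
+        Raises ValueError if keys, types, or values cannot be validated.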
+ """ + if not NeoXArgs.validate_keys(): + raise ValueError( + self.__class__.__name__ + + ".__post_init__() NeoXArgs keys cannot be validated" + ) + + self.enable_logging() + + self.calculate_derived() + + if not self.validate_types(): + raise ValueError( + self.__class__.__name__ + + ".__post_init__() NeoXArgs types cannot be validated" + ) + + if not self.validate_values(): + raise ValueError( + self.__class__.__name__ + + ".__post_init__() NeoXArgs values cannot be validated" + ) + + def build_tokenizer(self): + self.tokenizer = build_tokenizer(self) + + def initialize_tensorboard_writer(self): + if self.tensorboard_dir and self.rank == 0: + try: + from torch.utils.tensorboard import SummaryWriter + + print("> setting tensorboard ...") + self.tensorboard_writer = SummaryWriter(log_dir=self.tensorboard_dir) + except (ModuleNotFoundError, ImportError): + print( + "WARNING: TensorBoard writing requested but is not " + "available (are you using PyTorch 1.1.0 or later and do you have tensorboard installed?), " + "no TensorBoard logs will be written.", + flush=True, + ) + + @classmethod + def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None): + """ + instantiates NeoXArgs while reading values from yml files + + paths_to_yml_files: list of paths to yml files + + overwrite_values: If provided, overwrite any values in the yamls with these values + """ + + print(cls.__name__ + ".from_ymls() " + str(paths_to_yml_files), flush=True) + + # initialize an empty config dictionary to be filled by yamls + config = dict() + config_files = dict() + # iterate of all to be loaded yaml files + for conf_file_name in paths_to_yml_files: + + # load file + with open(conf_file_name) as conf_file: + conf = yaml.load(conf_file, Loader=yaml.FullLoader) + + # check for key duplicates and load values + for conf_key, conf_value in conf.items(): + if conf_key in config: + raise ValueError( + f"Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}" + ) + + conf_key_converted = conf_key.replace( + "-", "_" + ) # TODO remove replace and update configuration files? + config[conf_key_converted] = conf_value + + # load original config files to save unchanged with checkpoint + # saving the original config retains comments + filename = os.path.basename(conf_file_name) + assert ( + filename not in config_files + ), "At least two config files have the same filename. This will result in conflicts when saving out configs with the checkpoint in one single directory. Please use unique names for configs." 
+ config_files[filename] = open(conf_file_name).read() + + # add config file content to neox args to make them accessible in code + # this is used when saving checkpoints + config["config_files"] = config_files + + # Configuration parameters not specified + params_not_in_config = sorted( + list(set(cls.__dataclass_fields__.keys()) - set(config.keys())) + ) + if len(params_not_in_config) > 0: + logging.debug( + cls.__name__ + + ".from_ymls() Configuration parameters not specified (using defaults): " + + ", ".join(params_not_in_config) + ) + + if overwrite_values is not None: + for k, v in overwrite_values.items(): + config[k] = v + + # instantiate class and return + # duplicate values and unrecognized keys are again checked upon instantiation + return cls(**config) + + @classmethod + def from_dict(cls, args_dict: Dict): + """ + instantiates NeoXArgs while reading values from input dict + """ + return cls(**args_dict) + + ############################################################################################################################ + # start of command line args interface + + @classmethod + def consume_deepy_args(cls): + """ + entry point for deepy.py configuring and consuming command line arguments. + + We can use `--wandb_group` / `--wandb_team` to overwrite those args from the command line, otherwise the value from the config is taken. + """ + + parser = argparse.ArgumentParser( + description="GPT-NeoX Configuration", allow_abbrev=False + ) + + group = parser.add_argument_group(title="Training Configuration") + + group.add_argument( + "user_script", + type=str, + help="User script to launch, followed by any required " "arguments.", + ) + + group.add_argument( + "--conf_dir", + "-d", + type=str, + default=None, + help="Directory to prefix to all configuration file paths", + ) + + group.add_argument( + "conf_file", + type=str, + nargs="+", + help="Configuration file path. Multiple files can be provided and will be merged.", + ) + + group = parser.add_argument_group(title="Weights and Biases monitoring args") + + group.add_argument( + "--wandb_group", + type=str, + default=None, + help='Weights & Biases group name - used to group together "runs".', + ) + group.add_argument( + "--wandb_team", + type=str, + default=None, + help="Weights & Biases team name.", + ) + + group = parser.add_argument_group(title="Eval args") + + group.add_argument( + "--eval_tasks", + type=str, + nargs="+", + default=None, + help="Optionally overwrite eval tasks to run for evaluate.py", + ) + group.add_argument( + "--iteration", + type=int, + default=None, + help="Iteration to load checkpoint from in evaluate.py / generate.py. 
If None is provided, uses the latest iteration.", + ) + group.add_argument( + "--eval_results_prefix", + type=str, + default=None, + help="prefix to append to eval results file", + ) + parser.add_argument( + "-H", + "--hostfile", + type=str, + help="Hostfile path (in MPI style) that defines the " + "resource pool available to the job (e.g., " + "worker-0 slots=4)", + ) + group = parser.add_argument_group(title="Generation args") + group.add_argument( + "-i", + "--sample_input_file", + type=str, + default=None, + help="Optionally overwrite `sample_input_file` for generate.py", + ) + group.add_argument( + "-o", + "--sample_output_file", + type=str, + default=None, + help="Optionally overwrite `sample_output_file` for generate.py", + ) + + tuning = parser.add_argument_group(title="DeepSpeed Autotuning") + tuning.add_argument( + "--autotuning", + type=str, + default=None, + choices=("tune", "run"), + help="Use DeepSpeed's autotuning feature to optimize certain hyperparameters. For more details refer to documentation here: https://www.deepspeed.ai/tutorials/autotuning/", + ) + + args_parsed = parser.parse_args() + + # Validate user_script exists + assert os.path.exists( + args_parsed.user_script + ), f"User script could not be found: {args_parsed.user_script}" + + # load config files + conf_files = args_parsed.conf_file + if args_parsed.conf_dir: + conf_files = [os.path.join(args_parsed.conf_dir, f) for f in conf_files] + + # enables us to pass in `125M` instead of `125M.yml` + conf_files = [ + (cf if (cf.endswith(".yml") or cf.endswith(".json")) else cf + ".yml") + for cf in conf_files + ] + + # determine overwrite values + overwrite_values = dict() + for k, v in vars(args_parsed).items(): + if k == "autotuning" and v is not None: + overwrite_values["autotuning_run"] = v + elif k not in ["conf_dir", "conf_file"] and v is not None: + overwrite_values[k] = v + + # load args + neox_args = cls.from_ymls( + paths_to_yml_files=conf_files, overwrite_values=overwrite_values + ) + + if neox_args.use_wandb: + try: + import wandb + + # Check if the W&B group name is configured + if neox_args.wandb_group is None: + # Set a randomized string as group name if no group name is provided + neox_args.wandb_group = wandb.sdk.lib.runid.generate_id() + else: + # Concatenate the W&B group name with a randomized string to ensure uniqueness. + neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id() + except ModuleNotFoundError as e: + if e.name == "wandb": + e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to a boolean false to disable Weights & Biases logging." + raise e + + neox_args.wandb_group += "_" + wandb.util.generate_id() + + neox_args.print() + + return neox_args + + @classmethod + def consume_neox_args(cls, overwrite_values=None): + """ + Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. + + In order not to have any problems with different configs being mismatched across machines, we instead read the .yaml configuration file from the main rank, + then serialize the arguments to a dictionary, which the deepspeed launcher broadcasts to all machines (`--megatron_config`). + + We then instantiate a new NeoXArgs from the dictionary (`.from_dict`). This should ensure args are never inconsistent across machines. 
+ """ + + parser = argparse.ArgumentParser( + description="GPT-NeoX Configuration", allow_abbrev=False + ) + parser.add_argument( + "--megatron_config", + type=str, + default=None, + help="json dict dumped as string in NeoXArgs.get_deepspeed_main_args()", + ) + parser.add_argument( + "--deepspeed_config", + type=str, + default=None, + help="Only need this (at this stage) for autotuning", + ) + args_parsed, _ = parser.parse_known_args() + with open(args_parsed.megatron_config) as jsonfile: + megatron_config = json.load(jsonfile) + if args_parsed.deepspeed_config is not None: + overwrite_values = cls.set_up_autotuning( + args_parsed.deepspeed_config, overwrite_values + ) + if overwrite_values is not None: + megatron_config.update(overwrite_values) + return cls.from_dict(args_dict=megatron_config) + + @staticmethod + def set_up_autotuning(encoded_config, overwrite_values): + config = json.loads(base64.urlsafe_b64decode(encoded_config).decode("utf-8")) + overwrite_values = overwrite_values if overwrite_values else {} + for tuning_param in AUTOTUNING_ARGS: + # TODO: This is for autotuning specifically, may cause surprises for someone with a weird setup + if tuning_param in config: + overwrite_values[tuning_param] = config[tuning_param] + return overwrite_values + + @staticmethod + def convert_key_value_to_command_line_arg(k, v): + if isinstance(v, bool): + if v: + return [f"--{k}"] + else: + return [] + if v is None: + return [] + return [f"--{k}", str(v)] + + def get_extra_deepspeed_args(self): + """ + Sets up the extra arguments for deepspeed. This is done by reading in the `deepspeed_extra_args` dictionary from + the configuration file, and then adding any arguments where values differ from those specified in the dataclass. + """ + neox_args = self.get_parent_class_value_dict( + *self.__class__.__bases__, only_non_defaults=True + ) + + extra_ds_args = dict() + + for key, value in self.deepspeed_extra_args.items(): + # Check to make sure the key is not already changed from defaults, and raise an exception if it is + # This is to prevent users from accidentally writing arguments both in deepspeed_extra_args and in the base level + # of the configuration file + if hasattr(neox_args, key): + raise ValueError( + f"Key {key} is already specified elsewhere. Reading in a different value from the 'deepspeed_extra_args' option in the configuration file will cause undefined behavior." 
+ ) + extra_ds_args[key] = value + + return extra_ds_args + + def get_deepspeed_main_args(self): + + args_list = list() + + if self.autotuning_run is not None: + args_list.extend( + self.convert_key_value_to_command_line_arg( + "autotuning", self.autotuning_run + ) + ) + + # get deepspeed runner args, and only pass them in to deepspeed launcher if they differ from defaults + for key, default_value in NeoXArgsDeepspeedRunner().defaults(): + if key == "autotuning_run": + continue + configured_value = getattr(self, key) + + if key == "force_multi": + if self.deepspeed_slurm or self.deepspeed_mpi: + configured_value = True + if configured_value != default_value: + args_list.extend( + self.convert_key_value_to_command_line_arg(key, configured_value) + ) + + if self.deepspeed_slurm: + comment = getattr(self, "comment") + if comment: + args_list.extend( + self.convert_key_value_to_command_line_arg("comment", comment) + ) + # master_address = os.environ['SLURM_JOB_NODELIST'].split('\n')[0] + # args_list.extend( + # self.convert_key_value_to_command_line_arg('master_addr', master_address) + # ) + + if "DLTS_HOSTFILE" in os.environ: + args_list.extend( + self.convert_key_value_to_command_line_arg( + "hostfile", os.environ["DLTS_HOSTFILE"] + ) + ) + + if "MASTER_ADDR" in os.environ: + args_list.extend( + self.convert_key_value_to_command_line_arg( + "master_addr", os.environ["MASTER_ADDR"] + ) + ) + + if ( + "--include" in args_list or "--exclude" in args_list + ) and "--num_gpus" in args_list: + print( + "WARNING: both --include/--exclude and num_gpus were specified simultaneously - overriding num_gpus with --include/--exclude" + ) + # cannot specify these both simultaneously, remove num_gpus from list + idx = args_list.index("--num_gpus") + # pop twice, once for the arg, once for its value + args_list.pop(idx) + args_list.pop(idx) + + # add user script + args_list.append(self.user_script) + + self.configure_distributed_args() + cwd = Path.cwd() + + # get deepspeed_config + args_list.append("--deepspeed_config") + + if self.autotuning_run is not None: + ds_fp = cwd / Path("ds_config.json") + if self.rank == 0: + with open(ds_fp, mode="w") as ds_file: + json.dump(self.deepspeed_config, ds_file) + args_list.append(str(ds_fp)) + else: + encoded_ds_config = base64.urlsafe_b64encode( + json.dumps(self.deepspeed_config).encode("utf-8") + ).decode("utf-8") + args_list.append(encoded_ds_config) + + megatron_fp = cwd / Path("megatron_config.json") + # get all config values + args_list.append("--megatron_config") + args_list.append(str(megatron_fp)) + neox_args = self.get_parent_class_value_dict( + *self.__class__.__bases__, only_non_defaults=True + ) + if self.rank == 0: + with open(megatron_fp, mode="w") as megafile: + json.dump(neox_args, megafile) + return args_list + + ############################################################################################################################ + # start of calculated properties + + @property + def deepspeed_config(self) -> dict: + """ + returns a dict containing variables within deepspeed config + """ + config = self.get_parent_class_value_dict_extra_ds( + NeoXArgsDeepspeedConfig, only_non_defaults=True + ) + return config + + @property + def deepspeed_runner(self) -> dict: + """ + returns variables within deepspeed runner + """ + return self.get_parent_class_value_dict(NeoXArgsDeepspeedRunner) + + @property + def megatron_config(self) -> dict: + """ + returns variables within megatron args + """ + return self.get_parent_class_value_dict(*NEOX_ARG_CLASSES) 
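+
+    # Illustrative relationship between the config views (the key names below are
+    # examples only):
+    #   self.deepspeed_config -> {"train_batch_size": ..., "fp16": {...}, ...}
+    #   self.megatron_config  -> {"num_layers": ..., "hidden_size": ..., ...}
+    #   self.all_config       -> the union of every parent dataclass's fields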
+ + @property + def all_config(self) -> dict: + """ + returns variables of all args + """ + return self.get_parent_class_value_dict(*BASE_CLASSES) + + def get_parent_class_value_dict( + self, *parent_classes, only_non_defaults=False + ) -> dict: + """ + takes a sequence of parent classes and returns corresponding values (with defaults set) + """ + # TODO no Nones or non-defaults + result = dict() + for parent in parent_classes: + for key, default_value in parent().defaults(): + if key in ["tokenizer", "tensorboard_writer", "adlr_autoresume_object"]: + continue + if only_non_defaults: + value = getattr(self, key) + if value == default_value: + continue + result[key] = getattr(self, key) + return result + + def get_parent_class_value_dict_extra_ds( + self, *parent_classes, only_non_defaults=False + ) -> dict: + """ + Takes a sequence of parent classes and returns corresponding values (with defaults set). + Also adds in any extra deepspeed arguments that are specified in the configuration file. + + Args: + parent_classes: sequence of parent classes + only_non_defaults: if True, only returns values that differ from defaults + + Returns: + dict of arguments and values + + """ + # TODO no Nones or non-defaults + result = dict() + for parent in parent_classes: + for key, default_value in parent().defaults(): + if key in [ + "tokenizer", + "tensorboard_writer", + "adlr_autoresume_object", + "deepspeed_extra_args", + ]: + continue + if only_non_defaults: + value = getattr(self, key) + if value == default_value: + continue + result[key] = getattr(self, key) + + if self.deepspeed_extra_args is not None: + extra_ds_args = self.get_extra_deepspeed_args() + result.update(extra_ds_args) + + return result + + @property + def params_dtype(self): + """ + returns the datatype on the basis of configured precision + """ + if self.precision == "fp16": + return torch.half + elif self.precision == "bfloat16": + return torch.bfloat16 + else: + return torch.float + + ############################################################################################################################ + # start of logging and output + + def enable_logging(self): + """ + enable Tee logs based on the configured logdir + """ + if self.log_dir: + os.makedirs(self.log_dir, exist_ok=True) + hostname = gethostname() + file_prefix = os.path.join(self.log_dir, hostname) + Tee(file_prefix + "_stdout.txt", err=False) + Tee(file_prefix + "_stderr.txt", err=True) + + def print(self): + """Print arguments.""" + if self.rank == 0 or self.rank is None: + print("-------------------- arguments --------------------", flush=True) + str_list = [] + for arg in vars(self): + # add arg + value + dots = "." * (32 - len(arg)) + value = getattr(self, arg) + print_str = " {} {} {}".format(arg, dots, value) + + # add info 'default or updated' + field_def = self.__dataclass_fields__.get(arg) + if field_def is not None: + default_info = ( + "default" if value == field_def.default else "updated" + ) + else: + default_info = "" + dots = "." 
* (64 - len(print_str)) + print_str += dots + str_list.append({"print_str": print_str, "default_info": default_info}) + + for arg in sorted( + sorted(str_list, key=lambda x: x["print_str"].lower()), + key=lambda x: x["default_info"], + reverse=True, + ): + print(arg["print_str"] + arg["default_info"], flush=True) + print("---------------- end of arguments ----------------", flush=True) + + ############################################################################################################################ + # start of calculations and derived values + + def configure_distributed_args(self): + """ + Configures distributed training arguments from local variables set by deepspeed launcher. + """ + if self.deepspeed_mpi: + from deepspeed.comm import mpi_discovery + + mpi_discovery() + + if self.deepspeed_slurm: + os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"] + os.environ["RANK"] = os.environ["SLURM_PROCID"] + os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"] + + self.update_value("local_rank", int(os.getenv("LOCAL_RANK", "0"))) + self.update_value("rank", int(os.getenv("RANK", "0"))) + self.update_value("world_size", int(os.getenv("WORLD_SIZE", "1"))) + + if self.rank == 0: + print( + self.__class__.__name__ + + ".configure_distributed_args() using world size: {} and model-parallel size: {} ".format( + self.world_size, self.model_parallel_size + ), + flush=True, + ) + + @staticmethod + def calculate_batch_parameters( + dp_world_size, train_batch=None, micro_batch=None, grad_acc=None + ): + # all values are provided nothing needs to be set + if train_batch is not None and micro_batch is not None and grad_acc is not None: + return train_batch, micro_batch, grad_acc + + # gradient_accumulation_steps needs to be set + elif train_batch is not None and micro_batch is not None: + grad_acc = train_batch // micro_batch + grad_acc //= dp_world_size + + # micro_batch_per_gpu needs to be set + elif train_batch is not None and grad_acc is not None: + micro_batch = train_batch // dp_world_size + micro_batch //= grad_acc + + # train_batch_size needs to be set + elif micro_batch is not None and grad_acc is not None: + train_batch = micro_batch * grad_acc + train_batch *= dp_world_size + + # gradient_accumulation_steps and micro_batch_per_gpus is set + elif train_batch is not None: + grad_acc = 1 + micro_batch = train_batch // dp_world_size + + # train_batch_size and gradient_accumulation_step is set + elif micro_batch is not None: + train_batch = micro_batch * dp_world_size + grad_acc = 1 + + # either none of the three parameters are provided or just gradient_accumulation_step is provided + else: + assert ( + False + ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" + return int(train_batch), int(micro_batch), int(grad_acc) + + @staticmethod + def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): + + assert ( + train_batch > 0 + ), f"Train batch size: {train_batch} has to be greater than 0" + + assert ( + micro_batch > 0 + ), f"Micro batch size per gpu: {micro_batch} has to be greater than 0" + + assert ( + grad_acc > 0 + ), f"Gradient accumulation steps: {grad_acc} has to be greater than 0" + + assert train_batch == micro_batch * grad_acc * dp_world_size, ( + f"Check batch related parameters. 
train_batch_size is not equal" + " to micro_batch_per_gpu * gradient_acc_step * world_size \n" + f"{train_batch} != {micro_batch} * {grad_acc} * {dp_world_size}" + ) + + def calculate_derived(self): + """ + Derives additional configuration values necessary for training from the current config + """ + + # number of gpus + # Get number of GPUs param or hostfile to determine train_batch_size + global_num_gpus = getattr(self, "global_num_gpus", None) + if global_num_gpus is None: + if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE): + hostfile_path = self.hostfile or DLTS_HOSTFILE + resources = obtain_resource_pool( + hostfile_path, self.include or "", self.exclude or "" + ) + if self.num_nodes is not None and self.num_nodes > 0: + resources = { + k: resources[k] + for k in list(resources.keys())[: self.num_nodes] + } + global_num_gpus = sum(map(len, resources.values())) + if self.num_gpus is not None and self.num_gpus > 0: + global_num_gpus = self.num_gpus * len(resources) + else: + global_num_gpus = torch.cuda.device_count() + self.update_value("global_num_gpus", global_num_gpus) + + logging.info( + self.__class__.__name__ + + ".calculate_derived() " + + f"Total number of GPUs determined to be: {global_num_gpus}" + ) + + # get world size in the model/pipe parallel case, the actual `world size` deepspeed uses is the size of the + # data-parallel group, or (num_gpus / mp_size) / pp_size + pp_size = self.pipe_parallel_size + pp_size = pp_size if pp_size >= 1 else 1 + mp_size = self.model_parallel_size + mp_size = mp_size if mp_size >= 1 else 1 + self.update_value("model_parallel_size", mp_size) + + # pp_size and mp_size are only used here to compute dp world size and nowhere else. + dp_world_size = (global_num_gpus / pp_size) / mp_size + if not (dp_world_size % 1 == 0): + error_message = ( + self.__class__.__name__ + + ".calculate_derived() " + + f"(global_num_gpus / pp_size) / mp_size [({global_num_gpus} / {pp_size}) / {mp_size}] must be a whole number" + ) + logging.error(error_message) + raise AssertionError(error_message) + + # Automatically derive train_batch_size = train_micro_batch_size_per_gpu*global_num_gpus*gradient_accumulation_steps + ( + train_batch_size, + train_micro_batch_size_per_gpu, + gradient_accumulation_steps, + ) = self.calculate_batch_parameters( + dp_world_size=dp_world_size, + train_batch=self.train_batch_size, + micro_batch=self.train_micro_batch_size_per_gpu, + grad_acc=self.gradient_accumulation_steps, + ) + self.check_batch_parameters( + dp_world_size=dp_world_size, + train_batch=train_batch_size, + micro_batch=train_micro_batch_size_per_gpu, + grad_acc=gradient_accumulation_steps, + ) + self.update_values( + { + # batch size params + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu, + "gradient_accumulation_steps": gradient_accumulation_steps, + "batch_size": train_micro_batch_size_per_gpu, + # duplicate items + "gas": self.gradient_accumulation_steps, + "clip_grad": self.gradient_clipping, + } + ) + + # derive steps where checkpoint should be saved + if self.checkpoint_factor or self.extra_save_iters: + if self.extra_save_iters: + save_iters = set(self.extra_save_iters) + else: + save_iters = set() + + step = self.checkpoint_factor # don't save step 0 or 1 + while step < self.train_iters: + save_iters.add(step) + if self.checkpoint_scale == "log": + step *= self.checkpoint_factor + elif self.checkpoint_scale == "linear": + step += self.checkpoint_factor + + save_iters = list(save_iters) + 
save_iters.sort() + + self.update_values( + { + "save_iters": save_iters, + } + ) + + # derive precision + fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts" + if self.fp16 and self.fp16.get("enabled", False): + if self.precision is None: + self.update_value("precision", "fp16") + else: + assert self.precision == "fp16", fp16_conflict + + if self.precision == "fp16": + if isinstance(self.fp16, dict) and len(self.fp16) > 0: + fp16_args = copy.deepcopy(self.fp16) + fp16_args["enabled"] = True + else: + fp16_args = {"type": "fp16", "enabled": True} + self.update_value("fp16", fp16_args) + elif self.precision == "bfloat16": + bf_config = {"bf16": {"enabled": True}} + if self.deepspeed_extra_args is None: + self.update_value("deepspeed_extra_args", bf_config) + else: + extra_args = copy.deepcopy(self.deepspeed_extra_args) + extra_args.update(bf_config) + self.update_value("deepspeed_extra_args", extra_args) + else: + self.update_value("precision", "fp32") + + # zero optimization + if self.zero_optimization is None: + self.zero_optimization = copy.deepcopy( + ZERO_DEFAULTS + ) # a dict is overwritten and not updated key by key + try: + stage = self.zero_optimization["stage"] + if stage in (0, 1, 2, 3): + self.update_values( + { + "zero_stage": self.zero_optimization.get( + "stage", ZERO_DEFAULTS["stage"] + ), + "zero_reduce_scatter": self.zero_optimization.get( + "reduce_scatter", ZERO_DEFAULTS["reduce_scatter"] + ), + "zero_contiguous_gradients": self.zero_optimization.get( + "contiguous_gradients", + ZERO_DEFAULTS["contiguous_gradients"], + ), + "zero_reduce_bucket_size": self.zero_optimization.get( + "reduce_bucket_size", ZERO_DEFAULTS["reduce_bucket_size"] + ), + "zero_allgather_bucket_size": self.zero_optimization.get( + "allgather_bucket_size", + ZERO_DEFAULTS["allgather_bucket_size"], + ), + } + ) + else: + assert ( + self.autotuning is not None + ), f"Zero Stage must be an integer unless you are doing autotuning, not {stage}" + except KeyError as ke: + print(f"Zero Optimization config: {self.zero_optimization}") + raise ke + + # optimizer and scheduler + opt_params = self.optimizer or { + "type": OPT_DEFAULT, + "params": OPT_PARAMS_DEFAULTS, + } + self.update_values( + { + "optimizer_type": opt_params.get("type", OPT_DEFAULT), + "lr": opt_params["params"].get("lr", OPT_PARAMS_DEFAULTS["lr"]), + } + ) + + if self.optimizer_type.lower() == "onebitadam": + # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args + # for all other optimizers, the scheduling is handled by megatron + self.scheduler = { + "type": "WarmupDecayLR", # for now this is the only ds scheduler offering decay + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": self.lr, + "warmup_num_steps": int(self.train_iters * self.warmup), + "total_num_steps": self.lr_decay_iters or self.train_iters, + }, + } + + # Fp16 loss scaling. 
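+        # Dynamic loss scaling is enabled only when no static `loss_scale` is configured.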
+ self.update_value("dynamic_loss_scale", self.loss_scale is None) + + # Update 'is pipe parallel' flag + # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with + # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) + + # Attention config + if self.attention_config is None: + self.update_value("attention_config", [[["global"], self.num_layers]]) + self.update_value( + "attention_config", + expand_attention_types(self.attention_config, self.num_layers), + ) + assert ( + len(self.attention_config) == self.num_layers + ), "Length of attention config list must equal num_layers" + for item in self.attention_config: + assert ( + item in ATTENTION_TYPE_CHOICES + ), f"Attention type {item} not recognized" + if "gmlp" in self.attention_config or "amlp" in self.attention_config: + assert ( + not self.partition_activations + ), "GMLP Blocks are not compatible with partition activations" + + # Sparsity config + if self.sparsity_config is None: + # Can't have a default value as an empty dict so need to set it here + self.update_value("sparsity_config", {}) + + # Adding equal dataset weights if none are provided + if self.train_data_paths and (self.train_data_weights is None): + self.train_data_weights = [1.0] * len(self.train_data_paths) + if self.valid_data_paths and (self.valid_data_weights is None): + self.valid_data_weights = [1.0] * len(self.valid_data_paths) + if self.test_data_paths and (self.test_data_weights is None): + self.test_data_weights = [1.0] * len(self.test_data_paths) + + # if a sample input file is provided, default text_gen_type type to input-file + if self.text_gen_type is None: + if self.sample_input_file: + self.update_value("text_gen_type", "input-file") + else: + self.update_value("text_gen_type", "unconditional") + + ############################################################################################################################ + # start of validation functions + + @classmethod + def validate_keys(cls): + """ + test that there are no duplicate arguments + """ + source_classes = list(cls.__bases__) + defined_properties = dict() + + for source_class in source_classes: + source_vars = list(source_class.__dataclass_fields__) + for item in source_vars: + if item in defined_properties.keys(): + logging.error( + f"({cls.__name__}) duplicate of item: {item}, in class {source_class.__name__} and {defined_properties[item]}" + ) + return False + else: + defined_properties[item] = source_class.__name__ + return True + + def validate_values(self): + # the current codebase assumes running with deepspeed only + if not self.deepspeed: + return False + + # learning rate + if self.lr is None: + error_message = self.__class__.__name__ + ".validate_values() lr is None" + logging.error(error_message) + raise ValueError(error_message) + return False + + # required arguments + required_args = [ + "num_layers", + "hidden_size", + "num_attention_heads", + "max_position_embeddings", + ] + for req_arg in required_args: + if getattr(self, req_arg) is None: + error_message = ( + self.__class__.__name__ + + ".validate_values() " + + req_arg + + " is None." + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + # Checks. 
+ if self.hidden_size % self.num_attention_heads != 0: + error_message = ( + self.__class__.__name__ + + ".validate_values() hidden_size must be divisible by num_attention_heads" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + if self.seq_length is not None: + if not (self.max_position_embeddings >= self.seq_length): + error_message = ( + self.__class__.__name__ + + ".validate_values() max_position_embeddings must be bigger or equal seq_length" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + if not (self.min_lr <= self.lr): + error_message = ( + self.__class__.__name__ + + ".validate_values() min_lr must be smaller or equal lr" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + if ( + self.save is not None + and self.checkpoint_factor is None + and self.extra_save_iters is None + ): + error_message = ( + self.__class__.__name__ + + ".validate_values() checkpoint_factor or extra_save_iters must be defined if save is defined" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + # Parameters sharing does not work with torch DDP. + if (self.num_unique_layers is not None) and (self.num_layers is not None): + + if not (self.num_unique_layers <= self.num_layers): + error_message = ( + self.__class__.__name__ + + ".validate_values() num-unique-layers must be smaller or equal num_layers" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + if not (self.num_layers % self.num_unique_layers == 0): + error_message = ( + self.__class__.__name__ + + ".validate_values() num-layers should be divisible by num-unique-layers" + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + if self.fp16_lm_cross_entropy and self.precision != "fp16": + error_message = ( + self.__class__.__name__ + + ".validate_values() lm cross entropy in fp16 only support in fp16 mode." + ) + logging.error(error_message) + raise ValueError(error_message) + return False + + # assert that if one of train/test/valid_data_path are provided, data_path should not be + has_separate_path = [ + data_path is not None + for data_path in [ + self.train_data_paths, + self.valid_data_paths, + self.test_data_paths, + ] + ] + if all(has_separate_path): + assert self.data_path is None, ( + "Please provide *either* `data_path` or `train/valid/test_data_path` " + "in args " + ) + + # assert that if one of train/test/valid_data_path are provided, all should be + assert_error_mess = ( + "One or more of train/valid/test data_path are not provided:\n\t" + ) + assert_error_mess += "\n\t".join( + [ + f"{name} data paths: {data_path}," + for name, data_path in [ + ["train", self.train_data_paths], + ["valid", self.valid_data_paths], + ["test", self.test_data_paths], + ] + ] + ) + assert any(has_separate_path) == all(has_separate_path), assert_error_mess + + # assert that if train / valid / test data path(s) and weights are provided, that the paths and the weights should be equal length + if self.train_data_paths is not None: + assert len(self.train_data_paths) == len(self.train_data_weights) + if self.valid_data_paths is not None: + assert len(self.valid_data_paths) == len(self.valid_data_weights) + if self.test_data_paths is not None: + assert len(self.test_data_paths) == len(self.test_data_weights) + + return True + + def validate_types(self): + """ + At runtime, checks types are actually the type specified. 
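+        Literal fields are checked against their accepted choices (case-insensitively
+        for strings), Union fields against their member types, and the special value
+        "auto" is accepted for any field when autotuning is enabled.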
+ """ + for field_name, field_def in self.__dataclass_fields__.items(): + actual_value = getattr(self, field_name) + if actual_value is None: + continue # we allow for some values not to be configured + + if self.autotuning is not None and actual_value == "auto": + continue + + actual_type = type(actual_value) + if actual_type != field_def.type: + if ( + actual_type == int and field_def.type == float + ): # floats should be able to be configured as ints + continue + + # for typing.Literal (i.e a list of choices) - checks that actual value is in accepted values + elif field_def.type.__origin__ == Literal: + accepted_values = field_def.type.__args__ + if actual_value in accepted_values: + continue + elif type(actual_value) == str: + # case insensitive checking + lowercase_accepted_values = [ + i.lower() for i in accepted_values if isinstance(i, str) + ] + if actual_value.lower() in lowercase_accepted_values: + continue + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'" + ) + return False + elif field_def.type.__origin__ == Union: + accepted_types = field_def.type.__args__ + if actual_type in accepted_types: + continue + else: + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: '{actual_type}' not in {accepted_types}" + ) + return False + + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: '{actual_type}' instead of '{field_def.type}'" + ) + return False + + # validate deepspeed dicts + for field_name in ["optimizer", "scheduler"]: + value = getattr(self, field_name) + if isinstance( + value, dict + ): # dict is checked above, only fields are checked here + if "type" in value: + if not isinstance(value["type"], str): + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: key 'type' must be a string" + ) + return False + else: + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: must contain key 'type'" + ) + return False + if "params" in value: + if not isinstance(value["params"], dict): + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: key 'params' must be a dict" + ) + return False + else: + logging.error( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: must contain key 'params'" + ) + return False + + for field_name in ["fp16", "amp", "flops_profiler"]: + value = getattr(self, field_name) + if isinstance(value, dict): + if not "enabled" in value: + error_message = ( + self.__class__.__name__ + + ".validate_types() " + + f"{field_name}: must contain key 'enabled'" + ) + logging.error(error_message) + return False + + return True diff --git a/benchmarks/sizing/megatron/neox_arguments/deepspeed_args.py b/benchmarks/sizing/megatron/neox_arguments/deepspeed_args.py new file mode 100644 index 0000000..15b35e4 --- /dev/null +++ b/benchmarks/sizing/megatron/neox_arguments/deepspeed_args.py @@ -0,0 +1,362 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass + +try: + from .template import NeoXArgsTemplate +except ImportError: + from template import NeoXArgsTemplate + +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + + +@dataclass +class NeoXArgsDeepspeedConfig(NeoXArgsTemplate): + """ + Args for deepspeed config + Every argument included here will be included in deepspeed config json + As of Mar 8 2023, up to date compared to https://www.deepspeed.ai/docs/config-json/ + """ + + deepspeed: bool = True + """boolean flag to enable DeepSpeed (Always True)""" + + train_batch_size: int = None + """ + The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. + """ + + train_micro_batch_size_per_gpu: int = None + """ + Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON. + """ + + gradient_accumulation_steps: int = 1 + """ + Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON. + """ + + optimizer: dict = None + """ + dict containing the keys type and params + + type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. + + params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam). + """ + + scheduler: dict = None + """ + dict containing the keys type and params + + type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. + + params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. + """ + + fp32_allreduce: bool = False + """ + During gradient averaging perform allreduce with 32 bit values + """ + + prescale_gradients: bool = False + """ + Scale gradients before doing allreduce + """ + + gradient_predivide_factor: float = 1.0 + """ + Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs + """ + + sparse_gradients: bool = False + """ + Enable sparse compression of torch.nn.Embedding gradients. + """ + + # ---FP16 Training Options--- + + fp16: dict = None + """ + Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package. 
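+
+    An illustrative config (values are examples only; see the DeepSpeed docs
+    linked below for the full schema):
+
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        }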
+ + Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#fp16-training-options + """ + + bf16: dict = None + """ + Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options + """ + + # ---Automatic Mixed Precision (AMP) Training Options--- + + amp: dict = None + """ + Configuration for using automatic mixed precision (AMP) training that leverages NVIDIA’s Apex AMP package. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options + """ + + gradient_clipping: float = 1.0 + """ + Enable gradient clipping with provided value + """ + + # ---ZeRO Optimization Options--- + + zero_optimization: dict = None + """ + Configuration for using ZeRO optimization. + + Multi-level dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#zero-optimization-options + """ + + # ---Logging Options--- + + curriculum_learning: dict = None + """""" + + curriculum_seqlen: int = 0 + """ + Internal var for tracking the current seqlen + """ + + steps_per_print: int = 10 + """ + Print train loss every N steps. + """ + + wall_clock_breakdown: bool = False + """ + Enable timing of the latency of forward/backward/update training phases. + """ + + dump_state: bool = False + """ + Print out state information of DeepSpeed object after initialization. + """ + + # ---FLOPS Profiler Options--- + + flops_profiler: dict = None + """ + Configuration for using FLOPS profiler. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler + """ + + # ---Communication Options--- + + communication_data_type: bool = None + """ + During gradient averaging, perform communication with selected data type. By default it will be determined by selected regime + """ + + # ---Autotuning Options--- + autotuning: dict = None + """ + Configuration for using autotuning. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#autotuning + """ + + # ---Activation Checkpointing Options--- + + activation_checkpointing: dict = None + """ + Configuration for using activation checkpointing. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#activation-checkpointing + """ + + # ---Sparse Attention Options--- + + sparse_attention: dict = None + """ + Configuration for using sparse attention. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#sparse-attention + + """ + + # ---Data Efficiency Options--- + + data_efficiency: dict = None + """ + Configuration for using data efficiency. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-efficiency + """ + + # ---Monitoring Module Options--- + + tensorboard: dict = None + """ + Configuration for using tensorboard. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv + """ + + wandb: dict = None + """ + Configuration for using wandb. + """ + + csv_monitor: dict = None + """ + Configuration for using csv_monitor. 
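+
+    An illustrative config (the key names here are assumptions based on the
+    DeepSpeed monitoring docs and should be checked against your DeepSpeed version):
+
+        "csv_monitor": {
+            "enabled": true,
+            "output_path": "logs/ds_csv/",
+            "job_name": "train_run"
+        }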
+ """ + + # ---Elastic Training Options--- + + elasticity: dict = None + """ + Configuration for using elastic training. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02 + """ + + # ---Communication Logging Options--- + + comms_logger: dict = None + """ + Configuration for using communication logger. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#communication-logging + """ + + # ---Compression Options--- + + compression_training: dict = None + """ + Configuration for using compression training. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#compression + """ + + # ---Checkpointing Options--- + + checkpoint: dict = None + """ + Configuration for using checkpointing. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#checkpoint-options + """ + + # ---Data Type Options--- + + data_types: dict = None + """ + Configuration for using data types. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-type-options + """ + + # ---EXTRA ARGUMENTS--- + + deepspeed_extra_args: dict = None + """ + Dictionary of extra arguments to be included in the yaml config file. This can be used for any argument not included in the above list. + """ + + autotuning: dict = None + """Dictionary as described in DeepSpeed autotuning documentation: https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning""" + + +@dataclass +class NeoXArgsDeepspeedRunner(NeoXArgsTemplate): + """ + Args for deepspeed runner (deepspeed.launcher.runner). + Every argument included here will be passed as command line argument to deepspeed.launcher.runner + """ + + hostfile: str = None + """ + list of hostnames / ssh aliases and the number of GPUs per host + + example file contents: + worker-1 slots=4 + worker-2 slots=4 + 127.0.0 slots=4 + 127.0.1 slots=4 + """ + + include: str = None + """ + Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`. + """ + + exclude: str = None + """ + Specify hardware resources to NOT use during execution. Same format as include + """ + + num_nodes: int = -1 + """ + Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all. + """ + + num_gpus: int = None + """ + Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all. + """ + + master_port: int = 29500 + """ + Port used by PyTorch distributed for communication during training. + """ + + master_addr: str = None + """ + IP address of node 0, will be inferred via 'hostname -I' if not specified. + """ + + launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh" + """ + Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. + """ + + force_multi: bool = False + """ + Force multi-node training even if only one node is specified. + """ + + detect_nvlink_pairs: bool = False + """ + If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. 
This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2. + """ + + autotuning_run: str = None + """ + Either "tune", "run", or `None`. + """ + + no_ssh_check: bool = False + """ + If true, overrides the default check where DeepSpeed confirms that the headnode is accessible via ssh. + """ + + force_multi: bool = False + """ + If true, Force multi-node launcher mode, helps in cases where user wants to launch on single remote node. + """ + + comment: str = None + """ + Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard. + """ diff --git a/benchmarks/sizing/megatron/neox_arguments/neox_args.py b/benchmarks/sizing/megatron/neox_arguments/neox_args.py new file mode 100644 index 0000000..de092a3 --- /dev/null +++ b/benchmarks/sizing/megatron/neox_arguments/neox_args.py @@ -0,0 +1,1124 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +from dataclasses import dataclass + +try: + from .template import NeoXArgsTemplate +except ImportError: + from template import NeoXArgsTemplate + +try: + from typing import List, Literal, Union +except ImportError: + from typing_extensions import List, Literal, Union + + +ATTENTION_TYPE_CHOICES = [ + "global", + "local", + "sparse_fixed", + "sparse_variable", + "bigbird", + "bslongformer", + "gmlp", + "amlp", + "flash", +] + + +def get_git_commit_hash(): + """Gets the git commit hash of your current repo (if it exists)""" + try: + git_hash = subprocess.check_output(["git", "describe", "--always"]).strip() + git_hash = git_hash.decode() + except subprocess.CalledProcessError: + git_hash = None + return git_hash + + +@dataclass +class NeoXArgsParallelism(NeoXArgsTemplate): + """ + Parallelism Arguments + """ + + pipe_parallel_size: int = 0 + """ + Number of pipeline parallel stages. Disable with 0. + """ + + model_parallel_size: int = 1 + """ + Size of the model parallelism. + """ + + pipe_partition_method: str = "type:transformer|mlp" + """ + method used to distribute model layers across pipeline stages. Choose from "parameters", which balances the number + of parameters on each pipeline stage, "uniform", which naively balances the number of layers per stage, or + "type:[regex]", which balances layers whose class names match [regex] + """ + + world_size: int = None + """ + Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher + """ + + is_pipe_parallel: bool = False + """ + flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined + according to pipeline parallel size. + """ + + +@dataclass +class NeoXArgsModel(NeoXArgsTemplate): + """ + Model Arguments + """ + + precision: Literal["fp16", "fp32", "bfloat16"] = None + """ + description of the used precision, either one of fp16 or fp32 (and in the future bf16). 
+ """ + + num_layers: int = None + """ + Number of transformer layers. + """ + + hidden_size: int = None + """ + Transformer hidden size. + """ + + num_attention_heads: int = None + """ + Number of transformer attention heads. + """ + + seq_length: int = None + """ + Maximum sequence length to process. + """ + + max_position_embeddings: int = None + """ + Maximum number of position embeddings to use. This is the size of position embedding. + """ + + norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm" + """ + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + """ + + layernorm_epsilon: float = 1.0e-5 + """ + Layer norm epsilon. + """ + + rms_norm_epsilon: float = 1.0e-8 + """ + Root mean squared norm epsilon + """ + + scalenorm_epsilon: float = 1.0e-8 + """ + Scalenorm epsilon + """ + + pos_emb: Literal[ + "learned", "rotary", "sinusoidal", "rpe", "alibi", "none" + ] = "learned" + """ + Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' + """ + + rpe_num_buckets: int = 32 + """ + T5 relative positional encoding number of buckets, default 32. + """ + + rpe_max_distance: int = 128 + """ + T5 relative positional encoding max distance, default 128. + """ + + opt_pos_emb_offset: int = 0 + """ + Learned position embedding offset (only used by OPT, where it should be set to 2). + """ + + no_weight_tying: bool = False + """ + Disables weight tying between embedding weights and final Linear layer + """ + + attention_config: list = None + + """ + Attention configuration for gpt-neox + + The first item in the list specifies the attention type(s), and should be a list of strings. The second item + specifies the number of times to repeat those attention types in the full list. + + attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird] + + So a 12 layer network with only global attention could be specified like: + [[[`global`], 12]] + + or a 12 layer network with alternating global / local like: + [[[`global`, `local`], 6]] + + If none is specified, this defaults to + [[[`global`], n_layers]] + """ + + sparsity_config: dict = None + + """ + Sparsity configuration dict as defined in https://www.deepspeed.ai/docs/config-json/#sparse-attention + + Note that since neox is autoregressive, attention is always "unidirectional" and `horizontal_global_attention` is + always false. + + The main difference between our sparsity config and deepspeed's is that `mode` is ignored - since it is instead + specified in attention_config defining each layer. + + An example config is given below: + "sparse_attention": { + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "num_different_global_patterns": 4, + "num_random_blocks": 0, + "local_window_blocks": [4], + "global_block_indices": [0], + "global_block_end_indices": None, + "num_sliding_window_blocks": 3 + } + """ + + num_unique_layers: int = None + """ + Number of unique transformer layers. num-layers should be divisible by this value. Currently only has an effect when pipe_parallel_size=0. + """ + + param_sharing_style: str = "grouped" + """ + Ordering of the shared parameters. For example, for a num-layers=4 and --num-unique-layers=2, we will have the following ordering for two unique layers 1 and 2-: grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2]. + """ + + make_vocab_size_divisible_by: int = 128 + """ + Pad the vocab size to be divisible by this value. 
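+    For example, with the default value of 128, a vocabulary of 50257 tokens
+    would be padded up to 50304 (the next multiple of 128).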
This is added for computational efficiency reasons. + """ + + activation: Literal[ + "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + ] = "gelu" + """ + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + """ + + scaled_upper_triang_masked_softmax_fusion: bool = False + """ + Enable fusion of query_key_value_scaling time (upper diagonal) masking and softmax. + """ + + scaled_masked_softmax_fusion: bool = False + """ + Enable fusion of query_key_value_scaling general masking and softmax. + """ + + bias_gelu_fusion: bool = False + """ + Enable bias and gelu fusion. + """ + + bias_dropout_fusion: bool = False + """ + Enable bias and dropout fusion. + """ + + fp16_lm_cross_entropy: bool = False + """ + Move the cross entropy unreduced loss calculation for lm head to fp16. + """ + + init_method_std: float = 0.02 + """ + Standard deviation of the zero mean normal distribution used for weight initialization. + """ + + apply_query_key_layer_scaling: bool = False + """ + Scale Q * K^T by 1 / layer-number. If this flag is set, then it will automatically set attention-softmax-in-fp32 to true + """ + + use_cpu_initialization: bool = False + """ + If set, affine parallel weights initialization uses CPU + """ + + attention_softmax_in_fp32: bool = False + """ + Run attention masking and softmax in fp32. + """ + + rotary_pct: float = 1.0 + """ + pct of hidden dims to apply rotary positional embedding to + """ + + rotary_emb_base: int = 10000 + """ + Base for rotary positional embedding + """ + + init_method: Literal[ + "normal", + "scaled_normal", + "orthogonal", + "scaled_orthogonal", + "xavier_uniform", + "xavier_normal", + "wang_init", + "small_init", + ] = "normal" + """ + Init function used on all layers except ff residual outputs - choose from + ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] + """ + + output_layer_init_method: Literal[ + "normal", + "scaled_normal", + "orthogonal", + "scaled_orthogonal", + "xavier_uniform", + "xavier_normal", + "wang_init", + "small_init", + ] = "scaled_normal" + """ + Init function used for ff residual outputs - choose from + ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] + """ + + gmlp_attn_dim: int = 64 + """ + the dimension of the single head self attention in gmlp model (not used in gpt models). + If None - gmlp model doesn't use attention. + """ + + gpt_j_residual: bool = False + """ + If false, we use the conventional residual path: + x = x + attn(ln1(x)) + x = x + mlp(ln2(x)) + Otherwise, we use the residual path from GPT-J, which offers a slight speedup: + x = ln(x) + x = x + attn(x) + mlp(x) + """ + + gpt_j_tied: bool = False + """ + If false, we use + x = x + attn(ln1(x)) + mlp(ln2(x)) + Otherwise, we tie the layer norms + y = ln(x) + x = x + attn(y) + mlp(y) + """ + + use_bias_in_norms: bool = True + """ + If false, norms (e.g. LayerNorm) will not have bias terms + """ + use_bias_in_attn_linear: bool = True + """ + If false, attn_linear (e.g. QKVO) will not have bias terms + """ + + mlp_type: str = "regular" + """ + Types: + regular: Megatron implementation + llama: LLaMA MLP (SiLU-gated MLP) + """ + + soft_prompt_tuning: dict = None + """ + Dictionary configuring the soft prompt tuning parameters. + If enabled, will train *only* the soft prompt, and freezes the rest of the model. 
+ parameters in the dict are: + 'enabled': bool = True # enables soft prompting + 'num_tokens': int = 10 # length of the soft prompt in tokens + 'init_string': str = '' # if provided, initialize the soft prompt with the word embeddings of this string + 'init_range': float = 0.5 # if no init string is provided, initialize the soft prompt with a uniform distribution between -init_range and init_rang + """ + + # Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905) + output_layer_parallelism: Literal["column"] = "column" + + """ + Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) + """ + + +@dataclass +class NeoXArgsOptimizer(NeoXArgsTemplate): + """ + Optimizer Arguments + """ + + optimizer_type: Literal[ + "adam", "onebitadam", "cpu_adam", "cpu_torch_adam", "sm3", "madgrad_wd", "sgd" + ] = "adam" + """ + Type of optimizer to use. Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd'] + NOTE: sgd will use MuSGD from Mup. Mup must be enabled for this optimizer. + """ + + use_bnb_optimizer: bool = False + """ + Whether to enable the bitsandbytes optimizers + """ + + zero_stage: Union[int, List[int], Literal["all"]] = None + """ + Zero Optimizer stage + """ + + zero_reduce_scatter: bool = None + """ + Zero: Uses reduce or reduce scatter instead of allreduce to average gradients + """ + + zero_contiguous_gradients: bool = None + """ + Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. + """ + + zero_reduce_bucket_size: int = None + """ + Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes + """ + + zero_allgather_bucket_size: int = None + """ + Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes + """ + + lr: float = None + """ + Max Learning rate during training + """ + + +@dataclass +class NeoXArgsLRScheduler(NeoXArgsTemplate): + """ + LR Scheduler Arguments + """ + + lr_decay_style: Literal["constant", "linear", "cosine", "exponential"] = "linear" + """ + Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. + """ + + lr_decay_iters: int = None + """ + Number of iterations to decay learning rate over, If None defaults to --train-iters + """ + + min_lr: float = 0.0 + """ + Minimum value for learning rate. The scheduler clips values below this threshold. + """ + + warmup: float = 0.01 + """ + Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). + """ + + override_lr_scheduler: bool = False + """ + Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. + """ + + use_checkpoint_lr_scheduler: bool = False + """ + Use checkpoint to set the values of the scheduler (learning rate, warmup iterations, minimum learning rate, maximum number of iterations, and decay style from checkpoint and ignore input arguments. 
+ """ + + +@dataclass +class NeoXArgsLogging(NeoXArgsTemplate): + """ + Logging Arguments + """ + + use_wandb: bool = None + """Flag indicating if wandb is to be used.""" + + wandb_group: str = None + """Weights and Biases group name - used to group together "runs".""" + + wandb_team: str = None + """Team name for Weights and Biases.""" + + wandb_project: str = "neox" + """wandb project name""" + + wandb_host: str = "https://api.wandb.ai" + """url of the wandb host""" + + wandb_init_all_ranks: bool = False + """Initialize wandb on all ranks.""" + + git_hash: str = get_git_commit_hash() + """current git hash of repository""" + + log_dir: str = None + """ + Directory to save logs to. + """ + + tensorboard_writer = None + """ + initialized tensorboard writer + """ + + tensorboard_dir: str = None + """ + Write TensorBoard logs to this directory. + """ + + log_interval: int = 100 + """ + Interval between logging. + """ + + log_grad_pct_zeros: bool = False + """ + Log the percentage of zeros for the gradient of each parameter to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. + """ + + log_param_norm: bool = False + """ + Log the frob norm of the parameters to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. + """ + + log_grad_norm: bool = False + """ + Log the frob norm of the gradients to wandb / tensorboard (useful for debugging). + (N.B - this will only work with pp = 0 for now, as we don't have access to the gradients of the model because + deepspeed.) + """ + + log_optimizer_states: bool = False + """ + Log the frob norm of the optimizer states to wandb / tensorboard (useful for debugging). + """ + + log_gradient_noise_scale: bool = False + """ + Whether to log the gradient noise scale when training (cf. https://arxiv.org/abs/1812.06162 for explanation) + """ + + gradient_noise_scale_n_batches: int = 5 + """ + Number of batches to accumulate gradients for in the gradient noise scale logger. + """ + + gradient_noise_scale_cpu_offload: bool = False + """ + Whether to offload the buffered gradients to cpu when measuring gradient noise scale. + """ + + +@dataclass +class NeoXArgsOther(NeoXArgsTemplate): + """ + Misc. Arguments + """ + + distributed_backend: str = "nccl" + """ + Which backend to use for distributed training. + """ + + local_rank: int = None + """ + local rank passed from distributed launcher. + """ + + rank: int = None + """ + global rank of process being run (passed in via distributed launcher) + """ + + lazy_mpu_init: bool = False + """ + If set to True, initialize_megatron() skips DDP initialization and returns function to complete it instead. Also turns on use-cpu-initialization flag. This is for external DDP manager. + """ + + short_seq_prob: float = 0.1 + """ + Probability of producing a short sequence. + """ + + eod_mask_loss: bool = False + """ + Mask loss for the end of document tokens. + """ + + adlr_autoresume: bool = False + """ + Enable auto-resume on adlr cluster. + """ + + adlr_autoresume_object = None + """ + imported autoresume + """ + + adlr_autoresume_interval: int = 1000 + """ + Intervals over which check for auto-resume termination signal + """ + + seed: int = 1234 + """ + Random seed used for python, numpy, pytorch, and cuda. 
+ """ + + onnx_safe: bool = False + """ + Use workarounds for known problems with Torch ONNX exporter + """ + + deepscale: bool = False + """ + (Deprecated) enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' + """ + + deepscale_config: str = None + """(Deprecated) deepscale json configuration file.""" + + deepspeed_mpi: bool = False + """ + Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment + """ + + deepspeed_slurm: bool = False + """ + Run via SLURM, this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment + """ + + user_script: str = None + """ + user script to be run + """ + + iteration: int = None + """ + Set during training + """ + + do_train: int = None + """ + Set during training + """ + + do_valid: int = None + """ + Set during training + """ + + do_test: int = None + """ + Set during training + """ + + save_iters: list = None + """ + Set during training + """ + + global_num_gpus: int = None + """ + Set during launching + """ + + +@dataclass +class NeoXArgsTokenizer(NeoXArgsTemplate): + """ + Tokenizer Arguments + """ + + tokenizer_type: Literal[ + "GPT2BPETokenizer", + "HFTokenizer", + "HFGPT2Tokenizer", + "SPMTokenizer", + "CharLevelTokenizer", + "TiktokenTokenizer", + ] = "GPT2BPETokenizer" + """ + Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer", "TiktokenTokenizer"] + """ + + padded_vocab_size: int = None + """ + Total (padded) vocabulary size of tokenizer. Configured after launching of training, + as it's dependent on the parallelism size. + """ + + tokenizer = None + """ + tokenizer object loaded into memory and accessible by other functions + """ + + +@dataclass +class NeoXArgsTraining(NeoXArgsTemplate): + """ + Training Arguments + """ + + data_path: str = None + """ + Path to combined dataset to split. + """ + + use_shared_fs: bool = True + """ + Whether to use a shared filesystem for data loading. If False, local rank 0 on all nodes will preprocess the data, + otherwise only global rank 0 will preprocess the data. This is implemented in megatron/data/gpt2_dataset.py::_build_index_mappings. + """ + + train_data_paths: list = None + """ + List of paths to train datasets. + """ + + test_data_paths: list = None + """ + List of paths to test datasets. + """ + + valid_data_paths: list = None + """ + List of paths to validation datasets. + """ + + train_data_weights: list = None + """ + List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `train_data_paths` + """ + + valid_data_weights: list = None + """ + List of 'weights' that decide how often to sample from each validation dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `valid_data_paths` + """ + + test_data_weights: list = None + """ + List of 'weights' that decide how often to sample from each test dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `test_data_paths` + """ + + weight_by_num_documents: bool = False + """ + If True, Builds dataset weights from a multinomial distribution over groups of data according to the number of + documents in each group. 
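+
+    For example (illustrative), with the default `weighted_sampler_alpha` of 0.3,
+    two groups containing 90 and 10 documents would be sampled with probabilities
+    of roughly 0.66 and 0.34 rather than 0.9 and 0.1.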
+ + WARNING: setting this to True will override any user provided weights + + We sample from a group according to the probability p(L) ∝ |L| ** α, + where p(L) is the probability of sampling from a given group, + |L| is the number of examples in that datapoint, + and α is a coefficient that acts to upsample data from underrepresented groups + + Hence α (`alpha`) allows us to control how much to 'boost' the probability of training on low-resource groups. + + See https://arxiv.org/abs/1911.02116 for more details + """ + + weighted_sampler_alpha: float = 0.3 + """ + Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True. + + when alpha = 1, the probability of sampling from a given group = n_samples / total_samples + as alpha -> 0, the probability of sampling from all groups becomes equal, and number of documents has no effect + as alpha -> inf, the probability of sampling from the groups with *the most samples* -> 1 + """ + + data_impl: str = "infer" + """ + Implementation of indexed datasets. + """ + + mmap_warmup: bool = False + """ + Warm up mmap files. + """ + + save: str = None + """ + Output directory to save checkpoints to. + """ + + config_files: dict = None + """ + Store of original config files mapping config filename to file contents + """ + + load: str = None + """ + Directory containing a model checkpoint. + """ + + checkpoint_validation_with_forward_pass: bool = False + """ + save input and output of a forward pass with the checkpoint and validate after load + """ + + checkpoint_scale: Literal["linear", "log"] = "linear" + """ + How step at which checkpoints are saved should scale. "linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, + while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. + """ + + checkpoint_factor: int = None + """ + Acts as a multiplier on either the "log" or "linear" checkpoint spacing. + + With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at + steps [20, 40, 60, 80, 100]. + + With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at + steps [1, 2, 4, 8, 16, 32, 64, 100]. + + Note that the last checkpoint step is always saved. + """ + + extra_save_iters: list = None + """ + Additional iterations when a checkpoint should be saved. + Must be a list of ints or `None`. + """ + + no_save_optim: bool = False + """ + Do not save current optimizer. + """ + + no_save_rng: bool = False + """ + Do not save current rng state. + """ + + no_load_optim: bool = False + """ + Do not load optimizer when loading checkpoint. + """ + + no_load_rng: bool = False + """ + Do not load rng state when loading checkpoint. + """ + + finetune: bool = False + """ + Load model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint. + """ + + batch_size: int = None + """ + training microbatch size per gpu + """ + + train_iters: int = None + """ + Number of iterations to run for training. + """ + + eval_iters: int = 100 + """ + Number of iterations to run for evaluation validation/test for. + """ + + keep_last_n_checkpoints: int = None + """ + Number of last checkpoints to keep + """ + + eval_interval: int = 1000 + """ + Interval between running evaluation on validation set. 
+ """ + + split: str = "969, 30, 1" + """ + Comma_separated list of proportions for training, validation, and test split. For example the split 90,5,5 will use 90% of data for training, 5% for validation and 5% for test. + """ + + vocab_file: str = None + """ + Path to the vocab file. + """ + + merge_file: str = None + """ + Path to the BPE merge file. + """ + + num_workers: int = 2 + """ + Dataloader number of workers. + """ + + exit_interval: int = None + """ + Exit the program after the iteration is divisible by this value. + """ + + attention_dropout: float = 0.1 + """ + Post attention dropout probability. + """ + + hidden_dropout: float = 0.1 + """ + Dropout probability for hidden state transformer. + """ + + weight_decay: float = 0.01 + """ + Weight decay coefficient for L2 regularization. + """ + + checkpoint_activations: bool = False + """ + Checkpoint activation to allow for training with larger models, sequences, and batch sizes. + """ + + checkpoint_num_layers: int = 1 + """ + Chunk size (number of layers) for checkpointing. + """ + + deepspeed_activation_checkpointing: bool = True + """ + DEPRECATED - TODO: remove + Uses activation checkpointing from deepspeed + """ + + contiguous_checkpointing: bool = False + """ + Contiguous memory checkpointing for activations. + """ + + checkpoint_in_cpu: bool = False + """ + Move the activation checkpoints to CPU. + """ + + synchronize_each_layer: bool = False + """ + does a synchronize at the beginning and end of each checkpointed layer. + """ + + profile_backward: bool = False + """ + Enables backward pass profiling for checkpointed layers. + """ + + partition_activations: bool = False + """ + Partition Activations across GPUs before checkpointing. + """ + + gas: int = None + """gradient_accumulation_steps""" # TODO this is a duplicate, remove? + + clip_grad: float = None + """ + Gradient clipping based on global L2 norm. + """ + + hysteresis: int = 2 + """ + hysteresis for dynamic loss scaling + """ + + dynamic_loss_scale: bool = None + """ + flag indicating whether dynamic loss scale is used + """ + + loss_scale: float = None + """ + Static loss scaling, positive power of 2 + values can improve fp16 convergence. If None, dynamic loss scaling is used. + """ + + loss_scale_window: float = 1000.0 + """ + Window over which to raise/lower dynamic scale. + """ + + min_scale: float = 1.0 + """ + Minimum loss scale for dynamic loss scale. + """ + + char_level_ppl: bool = False + """ + Whether to calculate character level perplexity as well as token level perplexity. (may incur a time cost) + """ + + use_mup: bool = False + """ + Whether to use Microsoft's Mup https://github.com/microsoft/mup + """ + + coord_check: bool = False + """ + Whether to generate a "coord check" plot to verify mup's implementation in neox + """ + + save_base_shapes: bool = False + """ + Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. + """ + + base_shapes_file: str = None + """ + Path to the base shapes to save to/load from + """ + + mup_init_scale: float = 1.0 + """ + Initialization scale: All the parameters are multiplied by this value + """ + + mup_attn_temp: float = 1.0 + """ + Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + """ + + mup_output_temp: float = 1.0 + """ + Output temperature: Reciprocal of the multiplier applied to the input to softmax that + produces the distribution over output tokens. 
+ """ + + mup_embedding_mult: float = 1.0 + """ + Scalar by which we multiply the output of the embedding layer + """ + + mup_rp_embedding_mult: float = 1.0 + """ + Scalar by which we multiply vectors representing relative position + """ + + mup_width_scale: int = 2 + """ + What to scale width by when creating the delta model for mup + """ + + +@dataclass +class NeoXArgsTextgen(NeoXArgsTemplate): + """ + Text Generation arguments + """ + + text_gen_type: str = None + """ + How to generate text/sample the model. + Options: `unconditional`, `input-file`, `interactive` + """ + + temperature: float = 0.0 + """ + exponential scaling output distribution ("higher == more risk") + """ + + top_p: float = 0.0 + """ + Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + """ + + top_k: int = 0 + """ + integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + """ + + return_logits: bool = False + """ + Boolean for whether to return the logits for generated tokens + """ + + maximum_tokens: int = 64 + """ + maximum number of tokens to be generated + """ + + prompt_end: str = "\n" + """ + a single prompt's end. Defaults to newline + """ + + sample_input_file: str = None + """ + Get input from file instead of interactive mode, each line is an input. + """ + + sample_output_file: str = "samples.txt" + """ + Output file + """ + + num_samples: int = 1 + """ + Number of samples to generate unconditionally, defaults to 1 and interactive conditional sampling + """ + + recompute: bool = False + """ + During generation recompute all attention instead of using previously computed keys/values. + Should be set to true for sparse attention models + """ + + eval_results_prefix: str = "" + """ + prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json + """ + + eval_tasks: list = None + """ + Tasks to evaluate on using lm_eval_harness + """ diff --git a/benchmarks/sizing/megatron/neox_arguments/template.py b/benchmarks/sizing/megatron/neox_arguments/template.py new file mode 100644 index 0000000..d021645 --- /dev/null +++ b/benchmarks/sizing/megatron/neox_arguments/template.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +import logging + + +@dataclass +class NeoXArgsTemplate: + def defaults(self): + """ + generator for getting default values. + """ + for key, field_def in self.__dataclass_fields__.items(): + yield key, field_def.default + + def update_value(self, key: str, value): + """ + updates a property value if the key already exists + + Problem: a previously non-existing property can be added to the class instance without error. 
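+
+        Example (illustrative, on any NeoXArgsTemplate subclass instance `args`):
+
+            args.update_value("hidden_size", 4096)   # existing field: value is set
+            args.update_value("hiden_size", 4096)    # unknown key: raises ValueError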
+ """ + if hasattr(self, key): + setattr(self, key, value) + else: + error_message = ( + self.__class__.__name__ + + ".update_value() to be updated property " + + str(key) + + " does not exist" + ) + logging.error(error_message) + raise ValueError(error_message) + + def update_values(self, d): + """ + Updates multiple values in self if the keys already exists + """ + for k, v in d.items(): + self.update_value(k, v) diff --git a/benchmarks/sizing/megatron/optimizers.py b/benchmarks/sizing/megatron/optimizers.py new file mode 100644 index 0000000..8dc1d32 --- /dev/null +++ b/benchmarks/sizing/megatron/optimizers.py @@ -0,0 +1,415 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +from torch.optim import Optimizer + + +class SM3(Optimizer): + """Implements SM3 algorithm. + It has been proposed in `Memory-Efficient Adaptive Optimization`_. + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): coefficient that scale delta before it is applied + to the parameters (default: 0.1) + momentum (float, optional): coefficient used to scale prior updates + before adding. This drastically increases memory usage if + `momentum > 0.0`. This is ignored if the parameter's gradient + is sparse. (default: 0.0) + beta (float, optional): coefficient used for exponential moving + averages (default: 0.0) + eps (float, optional): Term added to square-root in denominator to + improve numerical stability (default: 1e-30) + .. _Memory-Efficient Adaptive Optimization: + https://arxiv.org/abs/1901.11150 + """ + + def __init__(self, params, lr=0.1, momentum=0.0, beta=0.0, eps=1e-30): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {0}".format(lr)) + if not 0.0 <= momentum < 1.0: + raise ValueError("Invalid momentum: {0}".format(momentum)) + if not 0.0 <= beta < 1.0: + raise ValueError("Invalid beta: {0}".format(beta)) + if not 0.0 <= eps: + raise ValueError("Invalid eps: {0}".format(eps)) + + defaults = {"lr": lr, "momentum": momentum, "beta": beta, "eps": eps} + super(SM3, self).__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
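+
+        Example (an illustrative closure; `model`, `criterion`, `x` and `y` are
+        assumed to come from the surrounding training loop):
+
+            def closure():
+                optimizer.zero_grad()
+                loss = criterion(model(x), y)
+                loss.backward()
+                return loss
+
+            loss = optimizer.step(closure)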
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + momentum = group["momentum"] + beta = group["beta"] + eps = group["eps"] + for p in group["params"]: + if p is None: + continue + grad = p.grad + + state = self.state[p] + shape = grad.shape + rank = len(shape) + + # State initialization + if len(state) == 0: + state["step"] = 0 + state["momentum_buffer"] = 0.0 + _add_initial_accumulators(state, grad) + + if grad.is_sparse: + # the update is non-linear so indices must be unique + grad.coalesce() + grad_indices = grad._indices() + grad_values = grad._values() + + # Transform update_values into sparse tensor + def make_sparse(values): + constructor = grad.new + if grad_indices.dim() == 0 or values.dim() == 0: + return constructor().resize_as_(grad) + return constructor(grad_indices, values, grad.size()) + + acc = state[_key(0)] + update_values = _compute_sparse_update( + beta, acc, grad_values, grad_indices + ) + + self._update_sparse_accumulator( + beta, acc, make_sparse(update_values) + ) + + # Add small amount for numerical stability + update_values.add_(eps).rsqrt_().mul_(grad_values) + + update = make_sparse(update_values) + else: + # Get previous accumulators mu_{t-1} + if rank > 1: + acc_list = [state[_key(i)] for i in range(rank)] + else: + acc_list = [state[_key(0)]] + + # Get update from accumulators and gradients + update = _compute_update(beta, acc_list, grad) + + # Update accumulators. + self._update_accumulator(beta, acc_list, update) + + # Add small amount for numerical stability + update.add_(eps).rsqrt_().mul_(grad) + + if momentum > 0.0: + m = state["momentum_buffer"] + update.mul_(1.0 - momentum).add_(m, alpha=momentum) + state["momentum_buffer"] = update.detach() + + p.sub_(update, alpha=group["lr"]) + state["step"] += 1 + return loss + + @staticmethod + def _update_accumulator(beta, acc_list, update): + for i, acc in enumerate(acc_list): + nu_max = _max_reduce_except_dim(update, i) + if beta > 0.0: + torch.max(acc, nu_max, out=acc) + else: + # No need to compare - nu_max is bigger because of grad ** 2 + acc.copy_(nu_max) + + @staticmethod + def _update_sparse_accumulator(beta, acc, update): + nu_max = _max_reduce_except_dim(update.to_dense(), 0).squeeze() + if beta > 0.0: + torch.max(acc, nu_max, out=acc) + else: + # No need to compare - nu_max is bigger because of grad ** 2 + acc.copy_(nu_max) + + +def _compute_sparse_update(beta, acc, grad_values, grad_indices): + # In the sparse case, a single accumulator is used. + update_values = torch.gather(acc, 0, grad_indices[0]) + if beta > 0.0: + update_values.mul_(beta) + update_values.addcmul_(grad_values, grad_values, value=1.0 - beta) + return update_values + + +def _compute_update(beta, acc_list, grad): + rank = len(acc_list) + update = acc_list[0].clone() + for i in range(1, rank): + # We rely on broadcasting to get the proper end shape. + update = torch.min(update, acc_list[i]) + if beta > 0.0: + update.mul_(beta) + update.addcmul_(grad, grad, value=1.0 - beta) + + return update + + +def _key(i): + # Returns key used for accessing accumulators + return "accumulator_" + str(i) + + +def _add_initial_accumulators(state, grad): + # Creates initial accumulators. For a dense tensor of shape (n1, n2, n3), + # then our initial accumulators are of shape (n1, 1, 1), (1, n2, 1) and + # (1, 1, n3). For a sparse tensor of shape (n, *), we use a single + # accumulator of shape (n,). 
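+    # E.g. a dense gradient of shape (1024, 4096) gets accumulators of shape
+    # (1024, 1) and (1, 4096), so memory grows with the sum of the dimensions
+    # rather than their product.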
+ shape = grad.shape + rank = len(shape) + defaults = {"device": grad.device, "dtype": grad.dtype} + acc = {} + + if grad.is_sparse: + acc[_key(0)] = torch.zeros(shape[0], **defaults) + elif rank == 0: + # The scalar case is handled separately + acc[_key(0)] = torch.zeros(shape, **defaults) + else: + for i in range(rank): + acc_shape = [1] * i + [shape[i]] + [1] * (rank - 1 - i) + acc[_key(i)] = torch.zeros(acc_shape, **defaults) + + state.update(acc) + + +def _max_reduce_except_dim(tensor, dim): + # Computes max along all dimensions except the given dim. + # If tensor is a scalar, it returns tensor. + rank = len(tensor.shape) + result = tensor + if rank > 0: + assert dim < rank + for d in range(rank): + if d != dim: + result = result.max(dim=d, keepdim=True).values + return result + + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# modifications - 4/4/2021 @lessw2020 (decay issue spotted by @nestordemeure ) +# weight decay has been implemented AdamW style instead of the original madgrad Adam style. +# in initial image classification testing, this outperformed 0 weight decay or original style weight decay. + +# closure is checked if callable or not since some code passes loss directly, rather than in closure param + +import math +from typing import Collection, TYPE_CHECKING, Any, Callable, Optional + +import torch +import torch.optim +import collections + +if TYPE_CHECKING: + from torch.optim.optimizer import _params_t +else: + _params_t = Any + + +class madgrad_wd(torch.optim.Optimizer): + """ + MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic + Optimization. + + .. _MADGRAD: https://arxiv.org/abs/2101.11075 + + MADGRAD is a general purpose optimizer that can be used in place of SGD or + Adam may converge faster and generalize better. Currently GPU-only. + Typically, the same learning rate schedule that is used for SGD or Adam may + be used. The overall learning rate is not comparable to either method and + should be determined by a hyper-parameter sweep. + + MADGRAD requires less weight decay than other methods, often as little as + zero. Momentum values used for SGD or Adam's beta1 should work here also. + + On sparse problems both weight_decay and momentum should be set to 0. + + Arguments: + params (iterable): + Iterable of parameters to optimize or dicts defining parameter groups. + lr (float): + Learning rate (default: 1e-2). + momentum (float): + Momentum value in the range [0,1) (default: 0.9). + weight_decay (float): + Weight decay, i.e. a L2 penalty (default: 0). + eps (float): + Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). 
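+
+    Example (illustrative; `model`, `criterion`, `inputs` and `targets` are assumed
+    to exist in the surrounding training code):
+
+        optimizer = madgrad_wd(model.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.0)
+        loss = criterion(model(inputs), targets)
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()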
+ """ + + def __init__( + self, + params: _params_t, + lr: float = 1e-2, + momentum: float = 0.9, + weight_decay: float = 0, + eps: float = 1e-6, + ): + if momentum < 0 or momentum >= 1: + raise ValueError(f"Momentum {momentum} must be in the range [0,1]") + if lr <= 0: + raise ValueError(f"Learning rate {lr} must be positive") + if weight_decay < 0: + raise ValueError(f"Weight decay {weight_decay} must be non-negative") + if eps < 0: + raise ValueError(f"Eps must be non-negative") + + defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay) + super().__init__(params, defaults) + + @property + def supports_memory_efficient_fp16(self) -> bool: + return False + + @property + def supports_flat_params(self) -> bool: + return True + + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None and isinstance(closure, collections.Callable): + loss = closure() + + # step counter must be stored in state to ensure correct behavior under + # optimizer sharding + if "k" not in self.state: + self.state["k"] = torch.tensor([0], dtype=torch.long) + k = self.state["k"].item() + + for group in self.param_groups: + eps = group["eps"] + lr = group["lr"] + eps + decay = group["weight_decay"] + momentum = group["momentum"] + + ck = 1 - momentum + lamb = lr * math.pow(k + 1, 0.5) + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + + if "grad_sum_sq" not in state: + state["grad_sum_sq"] = torch.zeros_like(p.data).detach() + state["s"] = torch.zeros_like(p.data).detach() + if momentum != 0: + state["x0"] = torch.clone(p.data).detach() + + if momentum != 0.0 and grad.is_sparse: + raise RuntimeError( + "momentum != 0 is not compatible with sparse gradients" + ) + + grad_sum_sq = state["grad_sum_sq"] + s = state["s"] + + # Apply weight decay - L2 / AdamW style + if decay: + p.data.mul_(1 - lr * decay) + + """ original impl: + if decay != 0: + if grad.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + + grad.add_(p.data, alpha=decay) + """ + + if grad.is_sparse: + grad = grad.coalesce() + grad_val = grad._values() + + p_masked = p.sparse_mask(grad) + grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) + s_masked = s.sparse_mask(grad) + + # Compute x_0 from other known quantities + rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) + x0_masked_vals = p_masked._values().addcdiv( + s_masked._values(), rms_masked_vals, value=1 + ) + + # Dense + sparse op + grad_sq = grad * grad + grad_sum_sq.add_(grad_sq, alpha=lamb) + grad_sum_sq_masked.add_(grad_sq, alpha=lamb) + + rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) + + s.add_(grad, alpha=lamb) + s_masked._values().add_(grad_val, alpha=lamb) + + # update masked copy of p + p_kp1_masked_vals = x0_masked_vals.addcdiv( + s_masked._values(), rms_masked_vals, value=-1 + ) + # Copy updated masked p to dense p using an add operation + p_masked._values().add_(p_kp1_masked_vals, alpha=-1) + p.data.add_(p_masked, alpha=-1) + else: + if momentum == 0: + # Compute x_0 from other known quantities + rms = grad_sum_sq.pow(1 / 3).add_(eps) + x0 = p.data.addcdiv(s, rms, value=1) + else: + x0 = state["x0"] + + # Accumulate second moments + grad_sum_sq.addcmul_(grad, grad, value=lamb) + rms = grad_sum_sq.pow(1 / 
3).add_(eps) + + # Update s + s.data.add_(grad, alpha=lamb) + + # Step + if momentum == 0: + p.data.copy_(x0.addcdiv(s, rms, value=-1)) + else: + z = x0.addcdiv(s, rms, value=-1) + + # p is a moving average of z + p.data.mul_(1 - ck).add_(z, alpha=ck) + + self.state["k"] += 1 + return loss diff --git a/benchmarks/sizing/megatron/text_generation_utils.py b/benchmarks/sizing/megatron/text_generation_utils.py new file mode 100644 index 0000000..5eb9823 --- /dev/null +++ b/benchmarks/sizing/megatron/text_generation_utils.py @@ -0,0 +1,809 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for generating text.""" + +import copy +import json +import os +import time +from typing import List, Union + +import torch +import torch.nn.functional as F + +from megatron import print_rank_0 +from megatron import mpu +from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 + + +def get_batch(neox_args, context_tokens: torch.Tensor): + """ + Generate batch from context tokens. Attention mask and position ids are created. Returned tensors will be on CUDA. + + neox_args: NeoXArgs. + context_tokens: torch tensor with dimensions [batch, context_size] + + returns: tuple of torch tensors (tokens, attention_mask, position_ids) on CUDA + """ + + # Move to GPU. + tokens = context_tokens.contiguous().cuda() + # Get the attention mask and position ids. + attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=neox_args.tokenizer.eod, + eod_mask_loss=neox_args.eod_mask_loss, + ) + return tokens, attention_mask, position_ids + + +def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): + """ + pads context lengths in context_tokens with pad_id to equal neox_args.seq_length, + and returns the padded batch and the new lengths. + + context_tokens: list of lists of tokens + pad_id: int, integer to use as padding token + pad_len: int, context length to be padded; all batch items will be padded to the same length + + returns: tuple of padded context tokens and a list of unpadded token count + """ + + context_lengths = [] + for tokens in context_tokens: + context_length = len(tokens) + if context_length < pad_len: + tokens.extend([pad_id] * (pad_len - context_length)) + elif context_length > pad_len: + raise ValueError("context_length is bigger than to be padded length") + context_lengths.append(context_length) + return context_tokens, context_lengths + + +def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): + """ + Filters the logits using top_k / top_p, filling any filtered vocab items with filter_value (defaults to -inf). 
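+
+    For example, with top_k=2 and logits [1.0, 3.0, 2.0, 0.5] for a single batch item,
+    the entries 1.0 and 0.5 are replaced with filter_value, leaving only the two
+    highest-scoring tokens available for sampling.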
+ + This function has been mostly taken from huggingface conversational ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + + logits: torch.Tensor -> logits of megatron model. + top_k: integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p: float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + + returns: (filtered) logits""" + + if top_k > 0: + # Remove all tokens with a probability less than the + # last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token + # above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + for i in range(sorted_indices.size(0)): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + + return logits + + +def switch(val1, val2, boolean): + """ + replaces items in val1 with items in val2 where boolean = True + """ + boolean = boolean.type_as(val1) + return (1 - boolean) * val1 + boolean * val2 + + +def forward_model(model, model_inputs, is_pipe_parallel=False) -> torch.Tensor: + """ + Runs model.forward(model_inputs) + + We need to create a wrapper for this function because deepspeed pipe parallel modules operate differently to normal models. + + model: a Megatron model. + model_inputs: tuple containing model args + + returns: torch.Tensor containing the logits of the model + """ + # because someone at deepspeed decided pipeline modules couldn't use kwargs, + # we need to forward a pipe model differently to a normal model + if not is_pipe_parallel: + return model.module(model_inputs) + else: + # we need to format inputs this way because: + # a) deepspeed pipeline only accepts iterables + # b) deepspeed pipeline *requires* that you pass in labels for the loss, it's not easy to get around this + # so we wrap the inputs in an iterable, and pad them (because internally, we get labels as inputs[:, 1:] and inputs as inputs[:, :-1]) + model_inputs = iter([{"text": F.pad(model_inputs[0], pad=(0, 1))}]) + + # set num microbatches to 1 at inference time + micro_batches_before = model.micro_batches + model.micro_batches = 1 + + # deepspeed sends metadata across pipeline stages only once in the first step, then assumes it will stay + # constant. In inference, the metadata of the tensors being sent across pipe stages may change, so we need to set + # these two flags in order for deepspeed to send the metadata every step, otherwise torch.distributed hangs + # silently. Fun stuff. 
+ model.first_output_send = True + model.pipe_recv_buf = None + + loss, logits = model.eval_batch(model_inputs, return_logits=True) + model.micro_batches = micro_batches_before + return logits + + +def broadcast_terminate_signal(terminate_runs: int): + """Send signal to all workers to terminate if we've finished the process""" + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast( + terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + return terminate_runs_tensor[0].item() + + +def stop_tokens_in_completion(stop_tokens, context_tokens, batch_index, current_index): + if stop_tokens is None: + return False + res = [] + for token_group in stop_tokens: + context = context_tokens[batch_index, : current_index + 1] + context = context[-len(token_group) :] + if context.shape[0] == token_group.shape[0]: + res.append(all(token_group == context)) + else: + res.append(False) + return any(res) + + +def stream_tokens( + neox_args, + model, + context_tokens: List[List[int]], + eos_token_id: int = None, + maximum_tokens: int = None, + recompute: bool = False, + temperature: float = 0.0, + top_k: int = 0, + top_p: float = 0.0, + stop_tokens=None, +): + """ + iterator producing text completions + + neox_args: NeoXArgs. + model: a Megatron model. + context_tokens: the prompt to complete; unpadded list of lists of tokens ids + context_lengths: lengths of context tokens of dimension [batch]; the context length records for each bach item how many non-padded tokens are provided + eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached + attention_mask: attention mask for megatron model. + position_ids: position ids for positional encoding. + maximum_tokens: maximum number of tokens to be generated; careful! if a batch input is provided maximum_tokens specifies the maximum number of forwards. + longer batch items get less generated tokens. + recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false) + temperature (default 0.0): exponential scaling output distribution ("higher == more risk") + top_k (default 0): integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p (default 0.0): float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. 
+ note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0 + yields: ( + tokens (completions from model), + token_generation_start_index (token index per batch item for the first generated token), + token_generation_end_index (token index per batch item for the last generated token), + logits (logits which are so far computed, zeros otherwise), + is_done (flag for each bach item indicating whether an eod token was generated) + ) + + * each iteration adds a generated token to the context_tokens + * output contains both context_tokens from input and generated tokens + * if batch items have different lengths, the iterator will start at the first completion and return the unchanged input context token otherwise + """ + + model.eval() + + # pad batch in order to allow conversion to tensor + context_tokens, context_lengths = pad_batch( + copy.deepcopy(context_tokens), + pad_id=neox_args.tokenizer.eod, + pad_len=neox_args.seq_length, + ) + + # convert to tensor and broadcast + context_tokens = torch.cuda.LongTensor(context_tokens) + if stop_tokens: + if len(stop_tokens) > 0 and type(stop_tokens[0]) is not list: + stop_tokens = [stop_tokens] + for i in range(0, len(stop_tokens)): + stop_tokens[i] = torch.cuda.LongTensor(stop_tokens[i]) + + # Make sure context tokens + start tokens are the same across all ranks + token_generation_start_index = torch.cuda.LongTensor(context_lengths) + torch.distributed.broadcast( + context_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + token_generation_start_index, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + + # get attention mask / position ids + context_tokens, attention_mask, position_ids = get_batch(neox_args, context_tokens) + + # set variables + eos_token_id = eos_token_id or neox_args.tokenizer.eod + maximum_tokens = maximum_tokens or ( + neox_args.seq_length - token_generation_start_index.max().item() - 1 + ) + batch_size = context_tokens.size(0) + + # get the context_index at which generation is to start + # we start generation at the position where the smallest context ends + token_index_to_generate = token_generation_start_index.min().item() + first_token_index_to_generate = token_index_to_generate + last_token_index_to_generate = min( + neox_args.seq_length + - 1, # never generate more than the model's sequence length + token_index_to_generate + maximum_tokens - 1, + ) + + with torch.no_grad(): + # initialize generation variables + state_is_done = torch.zeros([batch_size]).byte().cuda() + token_generation_end_index = torch.ones([batch_size]).long().cuda() * (-1) + generation_logits = ( + torch.empty(maximum_tokens, neox_args.padded_vocab_size).float().cuda() + ) + + while token_index_to_generate <= last_token_index_to_generate: + if recompute: # recompute all tokens + model_inputs = ( + context_tokens, + position_ids, + attention_mask, + ) + logits = forward_model(model, model_inputs, neox_args.is_pipe_parallel) + if logits is not None: # if pipe parallel, not all ranks return logits + generated_token_logits = logits[ + :, token_index_to_generate - 1, : + ] # [bs, seq, vocab_size] -> [bs, vocab_size] + else: # use kv cache + if token_index_to_generate == first_token_index_to_generate: + tokens_to_use = context_tokens[:, :token_index_to_generate] + positions_to_use = position_ids[:, :token_index_to_generate] + else: + tokens_to_use = context_tokens[:, token_index_to_generate - 1].view( + batch_size, -1 + ) + 
positions_to_use = position_ids[ + :, token_index_to_generate - 1 + ].view(batch_size, -1) + + model_inputs = ( + tokens_to_use, # input_ids + positions_to_use, # position_ids + attention_mask, # attention_mask + ) + + logits = forward_model(model, model_inputs, neox_args.is_pipe_parallel) + if logits is not None: # if pipe parallel, not all ranks return logits + generated_token_logits = ( + logits[:, -1].view(batch_size, -1).contiguous() + ) # [bs, seq, vocab_size] -> [bs, vocab_size] + + if logits is not None: + # sample token id of the to be generated token + if temperature == 0.0 and top_k == 0 and top_p == 0.0: + generated_tokens = torch.argmax( + generated_token_logits, dim=-1 + ).view(-1) + else: + generated_token_logits = generated_token_logits.float() + if temperature > 0.0: + generated_token_logits /= temperature + generated_token_logits = filter_logits( + generated_token_logits, top_k=top_k, top_p=top_p + ) + next_token_log_probs = F.softmax(generated_token_logits, dim=-1) + generated_tokens = torch.multinomial( + next_token_log_probs, num_samples=1 + ).view(-1) + + if neox_args.return_logits: + generation_logits[ + token_index_to_generate - 1 + ] = generated_token_logits[0] + + if neox_args.is_pipe_parallel: + # broadcast generated tokens to pipe parallel group + src_rank = model.grid.stage_to_global(model.num_stages - 1) + generated_tokens = ( + generated_tokens + if logits is not None + else torch.zeros(batch_size, dtype=torch.long).cuda() + ) + torch.distributed.broadcast( + tensor=generated_tokens, + src=src_rank, + group=mpu.get_pipe_parallel_group(), + ) + + # determine if state has started for each batch item + state_started = ( + token_generation_start_index <= token_index_to_generate + ) # check which batch items have been started + + # switch out padding tokens for generated tokens + context_tokens[:, token_index_to_generate] = switch( + context_tokens[:, token_index_to_generate].view(-1), + generated_tokens, + state_started, + ) + + # determine if state has finished for each batch item + state_done = ( + generated_tokens == eos_token_id + ).byte() & state_started.byte() # check which batch items produce an eos_token in the current iteration + state_just_finished = (state_done & ~state_is_done).bool() + state_is_done = state_is_done | state_done + stop_tokens_produced = torch.zeros_like(state_is_done) + for batch_idx, ctx in enumerate(context_tokens): + stop_tokens_produced[batch_idx] = stop_tokens_in_completion( + stop_tokens, context_tokens, batch_idx, token_index_to_generate + ) + state_is_done = state_is_done | stop_tokens_produced + + token_generation_end_index[ + (state_started.byte() & ~state_is_done).bool() + ] = token_index_to_generate + + token_index_to_generate += 1 + + yield context_tokens, token_generation_start_index, token_generation_end_index, generation_logits, state_is_done.bool() + if torch.all(state_is_done): + break + + +def generate_samples_from_prompt( + neox_args, + model, + text: Union[List[str], str], + eos_token_id: int = None, + maximum_tokens: int = 64, + recompute: bool = False, + temperature: float = 0.0, + top_k: int = 0, + top_p: float = 0.0, + stop_tokens=None, +): + """ + Generates samples from raw text and returns them in a dictionary. + + neox_args: NeoXArgs. + model: a Megatron model + text: either a single prompt (str) or a list of prompts (List[str]). 
+ + eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached + maximum_tokens: maximum number of tokens to be generated + + recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false) + + temperature (default 0.0): exponential scaling output distribution ("higher == more risk") + top_k (default 0): integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p (default 0.0): float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0 + + returns: List[dict] -> a list of dicts containing the following fields: + - 'context' (the input) + - 'text' (the completion) + - 'length' (the length of the completion in number of tokens) + - 'finished': + - 'message': a messaged associated with the generation procedure, can be a warning or error + - 'duration_seconds': duration of the generation in seconds + + """ + eos_token_id = eos_token_id or neox_args.tokenizer.eod + + # type check + assert any( + [isinstance(text, str), isinstance(text, list)] + ), "Text should be in string or list form" + if isinstance(text, str): + text = [text] + + input_count = len(text) + input_pos = 0 + + # generate completions + generated_texts = [] + while True: + + start_time = time.time() + # Tokenize text, and check whether we should terminate process + terminate_runs = 0 + if input_pos == input_count: + terminate_runs = 1 + else: + raw_text = text[input_pos] + input_pos += 1 + + if raw_text == "": + context_tokens = [eos_token_id] + else: + context_tokens = neox_args.tokenizer.tokenize(raw_text) + context_length = len(context_tokens) + + if context_length >= (neox_args.seq_length // 2): + print_rank_0( + "\nWarning! Context length", + context_length, + "\nPlease give smaller context (e.g. 
half of the " + "max sequence length)!", + ) + if not is_mp_rank_0(): + context_tokens = neox_args.tokenizer.tokenize("EMPTY TEXT") + context_length = len(context_tokens) + terminate_runs = 0 + + terminate_runs = broadcast_terminate_signal(terminate_runs) + if terminate_runs == 1: + return generated_texts + + for ( + batch_context_tokens, + batch_token_generation_start_index, + batch_token_generation_end_index, + batch_generated_token_logits, + is_done, + ) in stream_tokens( + neox_args=neox_args, + model=model, + context_tokens=[context_tokens], + eos_token_id=eos_token_id, + maximum_tokens=maximum_tokens, + recompute=recompute, + temperature=temperature, + top_k=top_k, + top_p=top_p, + stop_tokens=stop_tokens, + ): + pass # finish generation and use all results below + + batch_context_tokens = batch_context_tokens.cpu().numpy().tolist() + batch_token_generation_start_index = ( + batch_token_generation_start_index.cpu().numpy().tolist() + ) + batch_token_generation_end_index = ( + batch_token_generation_end_index.cpu().numpy().tolist() + ) + batch_is_done = is_done.cpu().numpy().tolist() + + for tokens, start_index, end_index, is_done in zip( + batch_context_tokens, + batch_token_generation_start_index, + batch_token_generation_end_index, + batch_is_done, + ): + + if end_index >= start_index: + generated_tokens = tokens[start_index : end_index + 1] + try: + generated_text = neox_args.tokenizer.detokenize(generated_tokens) + message = None + except KeyError: + generated_text = None + message = "WARNING: generated token which doesn't exist." + else: + generated_text = None + generated_tokens = [] + # this will happen if the first generated token is a stop token or eos token + message = "WARNING: text generation did not start; try different batching or adjust parameters" + if is_mp_rank_0(): + data = { + "context": raw_text, + "text": generated_text, + "length": len(generated_tokens), + "finished": is_done, + "message": message, + "duration_seconds": float(time.time() - start_time), + } + + if neox_args.return_logits: + data["logits"] = batch_generated_token_logits.cpu().numpy().tolist() + + generated_texts.append(data) + + return generated_texts + + +def generate_samples_input_from_file( + neox_args, + model, + input_file, + output_file=None, + eos_token_id: int = None, + maximum_tokens: int = 64, + prompt_end: str = "\n", + recompute: bool = False, + temperature: float = 0.0, + top_k: int = 0, + top_p: float = 0.0, +): + """ + Generates samples from an input file and writes them to an output file. + + Reads prompts from neox_args.sample_input_file and writes completions to neox_args.sample_output_file + + neox_args: NeoXArgs. + model: a Megatron model + + input_file: path to input file. Each line in the input file will be treated as separate prompt. The line break at the end of the line is not included in the prompt. + output_file: file where generation results are to be stored in jsonl format. defaults to input_file+'.output.jsonl' if not defined + + eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached + maximum_tokens: maximum number of tokens to be generated + prompt_end: end of a single input prompt. Defaults to newline character '\n'. Other prompt-end sequences may be useful when generating indent-aware completions (e.g. 
code) + + recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false) + + temperature (default 0.0): exponential scaling output distribution ("higher == more risk") + top_k (default 0): integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p (default 0.0): float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + + note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0 + + + returns: List[dict] -> a list of dicts containing the following fields: + - 'context' (the input) + - 'text' (the completion) + - 'length' (the length of the completion in number of tokens) + - 'finished': + - 'message': a messaged associated with the generation procedure, can be a warning or error + - 'duration_seconds': duration of the generation in seconds + """ + # Read the sample file + print_rank_0( + "generate_samples_input_from_file() loading input from {}".format(input_file) + ) + with open(input_file, "r", encoding="utf-8") as f: + prompts = f.read() + prompts = prompts.split(prompt_end) + prompts = [p.strip() for p in prompts] + prompts = [p for p in prompts if len(p) > 0] + print_rank_0( + "generate_samples_input_from_file() prompts loaded: {}".format(len(prompts)) + ) + + if is_mp_rank_0(): + if output_file is None: + output_file = str(input_file) + ".output.jsonl" + print_rank_0( + "generate_samples_input_from_file() setting default output file to {}".format( + output_file + ) + ) + + print_rank_0("generate_samples_input_from_file() generating...") + generated_texts = generate_samples_from_prompt( + neox_args=neox_args, + model=model, + text=prompts, + eos_token_id=eos_token_id, + maximum_tokens=maximum_tokens, + recompute=recompute, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ) + + if is_mp_rank_0(): + with open(output_file, "w") as f_out: + for item in generated_texts: + f_out.write(json.dumps(item) + "\n") + print_rank_0("generate_samples_input_from_file() done") + return generated_texts + + +def generate_samples_unconditional( + neox_args, + model, + number_of_samples: int = 10, + output_file=None, + eos_token_id: int = None, + maximum_tokens: int = 64, + recompute: bool = False, + temperature: float = 0.0, + top_k: int = 0, + top_p: float = 0.0, +): + """ + Generates samples unconditionially (no prompt) and yields them in a dictionary. + + neox_args: NeoXArgs. + model: a Megatron model + + number_of_samples (default 10): number of unconditional samples to be generated + + output_file: file where generation results are to be stored in jsonl format. no file will be stored if omitted + + eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached + maximum_tokens: maximum number of tokens to be generated + prompt_end: end of a single input prompt. Defaults to newline character '\n'. Other prompt-end sequences may be useful when generating indent-aware completions (e.g. code). 
The interactive mode will reroll the user-input request until the stop-char is met + + recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false) + + temperature (default 0.0): exponential scaling output distribution ("higher == more risk") + top_k (default 0): integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p (default 0.0): float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + + note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0 + + yields: dict containing the following fields: + - 'context' (the input) + - 'text' (the completion) + - 'length' (the length of the completion in number of tokens) + - 'finished': + - 'message': a messaged associated with the generation procedure, can be a warning or error + - 'duration_seconds': duration of the generation in seconds + """ + + print_rank_0("generate_samples_unconditional() generating...") + assert number_of_samples > 0, "number_of_samples must be > 0" + generated_texts = generate_samples_from_prompt( + neox_args=neox_args, + model=model, + text=["" for _ in range(number_of_samples)], + eos_token_id=eos_token_id, + maximum_tokens=maximum_tokens, + recompute=recompute, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ) + + if is_mp_rank_0(): + if output_file is not None: + with open(output_file, "w") as f_out: + for item in generated_texts: + f_out.write(json.dumps(item) + "\n") + print_rank_0("generate_samples_unconditional() done") + return generated_texts + + +def generate_samples_interactive( + neox_args, + model, + maximum_tokens: int = 64, + prompt_end: str = "\n", + eos_token_id: int = None, + recompute: bool = False, + temperature: float = 0.0, + top_k: int = 0, + top_p: float = 0.0, +): + """ + Generates samples unconditionially (no prompt) and yields them in a dictionary. + + neox_args: NeoXArgs. + model: a Megatron model + + maximum_tokens: maximum number of tokens to be generated + eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached + + recompute: flag indicating whether a cache is used for already forwarded tokens (true) or whether all tokens are recomputed at every iteration (false) + + temperature (default 0.0): exponential scaling output distribution ("higher == more risk") + top_k (default 0): integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + top_p (default 0.0): float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. 
+ + note: greedy decoding is used if temperature is 0.0, top_k is 0 and top_p is 0.0 + + yields: dict containing the following fields: + - 'context' (the input) + - 'text' (the completion) + - 'length' (the length of the completion in number of tokens) + - 'finished': + - 'message': a messaged associated with the generation procedure, can be a warning or error + - 'duration_seconds': duration of the generation in seconds + """ + + while True: + model.module.clear_cache() # clear kv cache between batches + torch.distributed.barrier(group=mpu.get_model_parallel_group()) + terminate_runs = 0 + + if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: + os.system("clear") + raw_text = "" + while True: + current_input = input("Context prompt >>> ") + if ( + prompt_end == "\n" + ): # we need to handle '\n' case as 'input' strips it and leads to lines being squashed + raw_text += current_input + break + if prompt_end in current_input: + raw_text += current_input.split(prompt_end)[0] + break + raw_text += ( + current_input + "\n" + ) # re-add newline since we stripped it on input + context_tokens = neox_args.tokenizer.tokenize(raw_text) + if len(context_tokens) == 0: + context_tokens = [neox_args.tokenizer.eod] + context_length = len(context_tokens) + if context_length >= (neox_args.seq_length - 1): + print_rank_0( + "\nContext length" + + str(context_length) + + "\nReached max sequence length!" + ) + terminate_runs = 1 + else: + context_tokens = neox_args.tokenizer.tokenize("EMPTY TEXT") + context_length = len(context_tokens) + + terminate_runs = broadcast_terminate_signal(terminate_runs) + if terminate_runs == 1: + return + for ( + batch_context_tokens, + batch_token_generation_start_index, + batch_token_generation_end_index, + batch_generated_token_logits, + is_done, + ) in stream_tokens( + neox_args=neox_args, + model=model, + context_tokens=[context_tokens], + eos_token_id=eos_token_id, + maximum_tokens=maximum_tokens, + recompute=recompute, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ): + if mpu.get_model_parallel_rank() == 0: + generated_tokens = ( + batch_context_tokens[0] + .cpu() + .numpy() + .tolist()[ + batch_token_generation_start_index[0] + .item() : batch_token_generation_end_index[0] + .item() + + 1 + ] + ) + generated_text = neox_args.tokenizer.detokenize(generated_tokens) + print_rank_0("Generated Text: " + generated_text) + if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: + _ = input("\n") diff --git a/benchmarks/sizing/megatron/tokenizer/__init__.py b/benchmarks/sizing/megatron/tokenizer/__init__.py new file mode 100644 index 0000000..22b0f7b --- /dev/null +++ b/benchmarks/sizing/megatron/tokenizer/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
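+
+# This package re-exports build_tokenizer from tokenizer.py, which selects a tokenizer
+# implementation (GPT-2 BPE, SentencePiece, HF tokenizers, pretrained HF GPT-2,
+# character-level, or tiktoken) based on args.tokenizer_type.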
+ + +from .tokenizer import build_tokenizer diff --git a/benchmarks/sizing/megatron/tokenizer/gpt2_tokenization.py b/benchmarks/sizing/megatron/tokenizer/gpt2_tokenization.py new file mode 100644 index 0000000..f8a1128 --- /dev/null +++ b/benchmarks/sizing/megatron/tokenizer/gpt2_tokenization.py @@ -0,0 +1,368 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for OpenAI GPT.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +import json +import logging +import os +import regex as re +from io import open + +from functools import lru_cache + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + "gpt2": 1024, +} + +VOCAB_NAME = "vocab.json" +MERGES_NAME = "merges.txt" +SPECIAL_TOKENS_NAME = "special_tokens.txt" + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs + ): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. 
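+
+        Illustrative usage (a sketch; assumes either the bundled "gpt2" shortcut or a
+        local directory containing vocab.json and merges.txt):
+            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+            ids = tokenizer.encode("Hello world")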
+ """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join( + pretrained_model_name_or_path, SPECIAL_TOKENS_NAME + ) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info( + "loading special tokens file {}".format(special_tokens_file) + ) + # redirect to the cache, if necessary + try: + from .file_utils import cached_path + + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, + merges_file, + ) + ) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info( + "loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file + ) + ) + logger.info( + "loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file + ) + ) + if ( + pretrained_model_name_or_path + in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP + ): + # if we're using a pretrained model, ensure the tokenizer won't index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path + ] + kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len) + # Instantiate tokenizer. 
+ if special_tokens_file and "special_tokens" not in kwargs: + special_tokens = ( + open(special_tokens_file, encoding="utf-8").read().split("\n")[:-1] + ) + else: + special_tokens = kwargs.pop("special_tokens", []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs + ) + return tokenizer + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + special_tokens=None, + max_len=None, + ): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding="utf-8").read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + + # Should haved added re.IGNORECASE so BPE merges can happen for + # capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + ) + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. + """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict( + (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens) + ) + self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + @lru_cache(maxsize=131072) + def bpe(self, token): + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + return word + + def tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = "".join(self.byte_encoder[ord(b)] for b in token) + else: + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + if isinstance(tokens, str) or ( + sys.version_info[0] == 2 and isinstance(tokens, unicode) + ): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in 
self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). Running this" + " sequence through the model will result in indexing errors".format( + len(ids), self.max_len + ) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = "".join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + "utf-8", errors=self.errors + ) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error( + "Vocabulary path ({}) should be a directory".format(vocab_path) + ) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted( + self.bpe_ranks.items(), key=lambda kv: kv[1] + ): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format( + merge_file + ) + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted( + self.special_tokens.items(), key=lambda kv: kv[1] + ): + if index != token_index: + logger.warning( + "Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format( + special_tokens_file + ) + ) + index = token_index + writer.write(token + "\n") + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/benchmarks/sizing/megatron/tokenizer/tokenizer.py b/benchmarks/sizing/megatron/tokenizer/tokenizer.py new file mode 100644 index 0000000..627770d --- /dev/null +++ b/benchmarks/sizing/megatron/tokenizer/tokenizer.py @@ -0,0 +1,401 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Megatron tokenizers.""" + +from abc import ABC +from abc import abstractmethod + +from tokenizers import Tokenizer +from transformers import GPT2Tokenizer, GPT2TokenizerFast +import numpy as np +import sentencepiece as spm +from typing import List, Union +from .gpt2_tokenization import GPT2Tokenizer + + +def build_tokenizer(args): + """Initialize tokenizer.""" + if args.rank == 0: + print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) + + # Select and instantiate the tokenizer. + if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): + assert args.vocab_file is not None + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type.lower() == "SPMTokenizer".lower(): + assert args.vocab_file is not None + tokenizer = SentencePieceTokenizer(args.vocab_file) + elif args.tokenizer_type.lower() == "HFTokenizer".lower(): + assert args.vocab_file is not None + tokenizer = HFTokenizer(args.vocab_file) + elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower(): + if args.vocab_file is None: + print( + "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer" + ) + tokenizer = HFGPT2Tokenizer(args.vocab_file) + elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower(): + tokenizer = CharLevelTokenizer(vocab_size=512) + elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower(): + assert args.vocab_file is not None + tokenizer = TiktokenTokenizer(args.vocab_file) + else: + raise NotImplementedError( + "{} tokenizer is not " "implemented.".format(args.tokenizer_type) + ) + + # Add vocab size. + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) + + return tokenizer + + +def _vocab_size_with_padding(orig_vocab_size, args): + """Pad vocab size so it is divisible by model parallel size and + still having GPU friendly size.""" + + after = orig_vocab_size + multiple = args.make_vocab_size_divisible_by * args.model_parallel_size + while (after % multiple) != 0: + after += 1 + if args.rank == 0: + print( + " > padded vocab (size: {}) with {} dummy tokens " + "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after), + flush=True, + ) + return after + + +class AbstractTokenizer(ABC): + """Abstract class for tokenizer.""" + + def __init__(self, name): + self.name = name + super().__init__() + + @property + @abstractmethod + def vocab_size(self): + pass + + @property + @abstractmethod + def vocab(self): + """Dictionary from vocab text token to id token.""" + pass + + @property + @abstractmethod + def inv_vocab(self): + """Dictionary from vocab id token to text token.""" + pass + + @abstractmethod + def tokenize(self, text): + pass + + def detokenize(self, token_ids): + raise NotImplementedError( + "detokenizer is not implemented for {} " "tokenizer".format(self.name) + ) + + @property + def cls(self): + raise NotImplementedError( + "CLS is not provided for {} " "tokenizer".format(self.name) + ) + + @property + def sep(self): + raise NotImplementedError( + "SEP is not provided for {} " "tokenizer".format(self.name) + ) + + @property + def pad(self): + raise NotImplementedError( + "PAD is not provided for {} " "tokenizer".format(self.name) + ) + + @property + def eod(self): + raise NotImplementedError( + "EOD is not provided for {} " "tokenizer".format(self.name) + ) + + @property + def mask(self): + raise NotImplementedError( + 
"MASK is not provided for {} " "tokenizer".format(self.name) + ) + + +class _GPT2BPETokenizer(AbstractTokenizer): + """Original GPT2 BPE tokenizer.""" + + def __init__(self, vocab_file, merge_file): + name = "GPT2 BPE" + super().__init__(name) + + self.tokenizer = GPT2Tokenizer( + vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None + ) + self.eod_id = self.tokenizer.encoder["<|endoftext|>"] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class SentencePieceTokenizer(AbstractTokenizer): + """Designed to Integrate SP's Tokenizer.""" + + def __init__(self, vocab_file): + name = "SPM" + super().__init__(name) + + self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file) + self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>") + + @property + def vocab_size(self): + return self.tokenizer.get_piece_size() + + @property + def vocab(self): + return { + self.tokenizer.id_to_piece(idx): idx + for idx in range(self.tokenizer.get_piece_size()) + } + + @property + def inv_vocab(self): + return { + idx: self.tokenizer.id_to_piece(idx) + for idx in range(self.tokenizer.get_piece_size()) + } + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class HFTokenizer(AbstractTokenizer): + """Designed to Integrate HF's Tokenizer library.""" + + def __init__(self, vocab_file): + name = "HFTokenizer" + super().__init__(name) + self.tokenizer = Tokenizer.from_file(vocab_file) + self.eod_id = self.tokenizer.token_to_id("<|endoftext|>") + self.pad_id = self.tokenizer.token_to_id("<|padding|>") + + @property + def vocab_size(self): + return self.tokenizer.get_vocab_size() + + @property + def vocab(self): + return self.tokenizer.get_vocab() + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text: str): + return self.tokenizer.encode(text).ids + + def tokenize_batch(self, text_batch: Union[List[str], str]): + return self.tokenizer.encode_batch(text_batch) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class HFGPT2Tokenizer(AbstractTokenizer): + """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF""" + + def __init__(self, vocab_file=None, fast=True): + name = "HFGPT2Tokenizer" + if fast: + name += "Fast" + super().__init__(name) + if vocab_file is None: + vocab_file = "gpt2" + if fast: + self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file) + else: + self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file) + + self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"}) + self.eod_id = self.tokenizer.eos_token_id + self.pad_id = self.tokenizer.pad_token_id + + @property + def vocab_size(self): + return len(self.tokenizer) + + @property + def vocab(self): + return self.tokenizer.get_vocab() + + @property + def inv_vocab(self): + return self.tokenizer._tokenizer.decoder + + def tokenize(self, text: str): + return self.tokenizer.encode(text) + + def tokenize_batch(self, text_batch: Union[List[str], str]): + if 
isinstance(text_batch, str): + text_batch = [text_batch] + return [self.tokenize(t) for t in text_batch] + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class CharLevelTokenizer(AbstractTokenizer): + """Character Level Tokenizer""" + + def __init__(self, vocab_size): + name = "CharLevelTokenizer" + super().__init__(name) + self._vocab_size = vocab_size + self.eod_id = 0 + self.pad_id = 1 + + def clamp(self, n): + return max(32, min(n, self.vocab_size)) + + @property + def vocab_size(self): + return self._vocab_size + + @property + def vocab(self): + raise NotImplementedError + + @property + def inv_vocab(self): + raise NotImplementedError + + def decode_token(self, token: int): + return str(chr(self.clamp(token))) + + def tokenize(self, text: str): + return list(np.fromstring(text, dtype=np.uint8)) + + def tokenize_batch(self, text_batch: Union[List[str], str]): + if isinstance(text_batch, list): + return [self.tokenize(s) for s in text_batch] + else: + return self.tokenize(text_batch) + + def detokenize(self, token_ids): + return "".join(list(map(self.decode_token, token_ids))) + + @property + def eod(self): + return self.eod_id + + +class TiktokenTokenizer(AbstractTokenizer): + """Tokenizer from OpenAI's tiktoken implementation""" + + def __init__(self, vocab_file): + try: + import tiktoken + except ModuleNotFoundError: + print("Please install tiktoken: (https://github.com/openai/tiktoken)") + raise Exception + + name = "TiktokenTokenizer" + super().__init__(name) + + self.tokenizer = tiktoken.get_encoding(vocab_file) + self.eod_id = self.tokenizer.eot_token + self.pad_id = None + + @property + def vocab_size(self): + return self.tokenizer.n_vocab + + @property + def vocab(self): + raise NotImplementedError( + "TiktokenTokenizer does not implement vocabulary access." + ) + + @property + def inv_vocab(self): + raise NotImplementedError( + "TiktokenTokenizer does not implement vocabulary access. \ + To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ." + ) + + def tokenize(self, text: str): + return self.tokenizer.encode(text) # , allowed_special="all") + + def tokenize_batch(self, text_batch: List[str]): + return self.tokenizer.encode_batch(text_batch, allowed_special="all") + + def detokenize(self, token_ids): + return self.tokenizer.decode(tokens=token_ids, errors="strict") + + @property + def eod(self): + return self.eod_id + + @property + def pad(self): + raise NotImplementedError diff --git a/benchmarks/sizing/megatron/tokenizer/train_tokenizer.py b/benchmarks/sizing/megatron/tokenizer/train_tokenizer.py new file mode 100644 index 0000000..14a8e29 --- /dev/null +++ b/benchmarks/sizing/megatron/tokenizer/train_tokenizer.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Assumes a dataset of jsonl files in the same format as the neox training set. 
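+
+Illustrative invocation (a sketch, using the arguments defined in parse_args below):
+    python train_tokenizer.py --json_input_dir ./data --tokenizer_output_path ./tokenizer.json --vocab_size 52000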
+""" + +from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers +from tokenizers.normalizers import NFKC + +from glob import glob +import os +import json +import argparse + + +def load_jsonl(input_path, quiet=True) -> list: + """ + Read list of objects from a JSON lines file. + """ + data = [] + with open(input_path, "r", encoding="utf-8") as f: + for line in f: + data.append(json.loads(line.rstrip("\n|\r"))) + if not quiet: + print("Loaded {} records from {}".format(len(data), input_path)) + return data + + +def json_iterator(input_dir, text_key="text"): + all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json") + for j in all_jsonls: + data = load_jsonl(j) + for doc in data: + yield doc[text_key] + + +def train_tokenizer( + input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000 +): + """ + Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path` + + :param input_dir: input directory containing jsonl files + :param save_path: path to save tokenizer to + :param tokenizer_type: type of tokenizer to train. + :param vocab_size: int, size of tokenizer's vocab + :return: + """ + + if tokenizer_type == "BPE": + model = models.BPE() + else: + raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented") + tokenizer = Tokenizer(model) + + # Customize pre-tokenization and decoding + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) + tokenizer.normalizer = NFKC() + + # And then train + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"] + ) + tokenizer.train_from_iterator(json_iterator(input_dir), trainer) + + # And Save it + tokenizer.save(save_path, pretty=True) + print(f"Tokenizer saved at {save_path}") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="script for training a multilingual " + "HF tokenizer on CC dumps with upweighting for low resource languages" + ) + parser.add_argument( + "--json_input_dir", + type=str, + help="Path to folder containing tokenizer training data in jsonl format", + ) + parser.add_argument( + "--tokenizer_output_path", + type=str, + help="Path to which your trained tokenizer will be saved (should end in .json)", + ) + parser.add_argument( + "--tokenizer_type", + type=str, + help="type of tokenizer to train, currently only BPE is supported", + choices=["BPE"], + default=["BPE"], + ) + parser.add_argument( + "-v", + "--vocab_size", + help="vocabulary size of tokenizer, default=52k", + type=int, + default=52000, + ) + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + train_tokenizer( + args.json_input_dir, + save_path=args.tokenizer_output_path, + tokenizer_type=args.tokenizer_type, + vocab_size=args.vocab_size, + ) diff --git a/benchmarks/sizing/megatron/training.py b/benchmarks/sizing/megatron/training.py new file mode 100644 index 0000000..2d47f1d --- /dev/null +++ b/benchmarks/sizing/megatron/training.py @@ -0,0 +1,989 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file has been modified from its original version +# + +"""Pretrain utilities.""" +from datetime import datetime +from functools import partial + +import math +import sys + +import torch +import deepspeed +from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler +import numpy as np + +from megatron.utils import ( + Timers, + init_wandb, + get_ltor_masks_and_position_ids, + reduce_losses, +) + +from megatron import print_rank_0, mpu +from megatron.model import ( + GPT2ModelPipe, + SoftEmbedding, + get_params_for_weight_decay_optimization, +) +from megatron.checkpointing import load_checkpoint, save_checkpoint +from megatron.data.data_utils import build_train_valid_test_data_iterators +from megatron.initialize import initialize_megatron +from megatron.learning_rates import AnnealingLR +from megatron.logging import tb_wandb_log, training_log +from megatron.utils import ( + OverflowMonitor, + get_noise_scale_logger, + get_total_params, + CharCounter, +) +from megatron.model.gpt2_model import cross_entropy +from eval_tasks import run_eval_harness + + +def mup_weights_reinit(neox_args, model): + def has_method(o, name): + return callable(getattr(o, name, None)) + + for layer in model.modules(): + # This normally would happen in set_base_shapes if we actually were able to use the MuReadout class + if hasattr(layer, "mup_rescale_parameters") and layer.mup_rescale_parameters: + layer._rescale_parameters() + + if has_method(layer, "mup_reinitialize_weights"): + layer.mup_reinitialize_weights(neox_args) + + +def save_base_shapes(neox_args, base_shapes, use_cache): + + # Instantiation of the base model fails in the init function (init_functions.py) because we haven't called set_base_shapes on it at this point, so disable it temporarily here + neox_args.use_mup = False + + base_model = GPT2ModelPipe( + neox_args=neox_args, + num_tokentypes=0, + parallel_output=True, + topology=mpu.get_topology(), + use_cache=use_cache, + ) + + if not neox_args.is_pipe_parallel: + base_model = base_model.to_sequential() + + try: + import mup + except ModuleNotFoundError: + print("Please install mup https://github.com/microsoft/mup") + raise Exception + + base_shapes = mup.get_shapes(base_model) + + del base_model + + old_hidden_size = neox_args.hidden_size + neox_args.hidden_size = neox_args.hidden_size * neox_args.mup_width_scale + + delta_model = GPT2ModelPipe( + neox_args=neox_args, + num_tokentypes=0, + parallel_output=True, + topology=mpu.get_topology(), + use_cache=use_cache, + ) + + if not neox_args.is_pipe_parallel: + delta_model = delta_model.to_sequential() + + delta_shapes = mup.get_shapes(delta_model) + + # change back + neox_args.use_mup = True + neox_args.hidden_size = old_hidden_size + + save_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + print(f"saving base shapes at {save_shapes}") + mup.make_base_shapes(base_shapes, delta_shapes, savefile=save_shapes) + print(f"base shapes saved...exiting") + sys.exit(1) + + +def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator): + from megatron.mup_substitute import 
get_coord_data + from mup.coord_check import plot_coord_data + + def lazy_model(hidden_size): + def gen(): + old_hidden_size = neox_args.hidden_size + neox_args.hidden_size = hidden_size + + model, optimizer, _ = setup_model_and_optimizer( + neox_args=neox_args, use_cache=False + ) + + neox_args.hidden_size = old_hidden_size + + return model + + return gen + + models = {} + + # Hidden size needs to be divisible by num attention heads + for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)): + models[hidden_size] = lazy_model(hidden_size) + + neox_args.use_mup = True + df_up = get_coord_data( + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True + ) + neox_args.use_mup = False + df_sp = get_coord_data( + neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False + ) + + plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg") + plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg") + + print_rank_0("Saved coord check plots... exiting") + sys.exit(1) + + +def pretrain(neox_args): + """Main training program. + + This function will run the following in the order provided: + 1) initialize Megatron. + 2) setup model, optimizer and lr schedule + 3) call train_val_test_data_provider to get train/val/test datasets. + 4) train the model. + + Arguments: + neox_args: an instance of NeoXArgs containing the configuration for pretrain + + """ + # setup logging and timers + init_wandb(neox_args=neox_args) + timers = Timers( + use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer + ) + + # Initialize and get arguments, timers, and Tensorboard writer. + initialize_megatron(neox_args=neox_args) + + # Model, optimizer, and learning rate. + timers("model and optimizer").start() + model, optimizer, lr_scheduler = setup_model_and_optimizer( + neox_args=neox_args, use_cache=False, iteration=neox_args.iteration + ) + timers("model and optimizer").stop() + + # Data stuff. + timers("train/valid/test data iterators").start() + ( + train_data_iterator, + valid_data_iterator, + test_data_iterator, + ) = build_train_valid_test_data_iterators(neox_args=neox_args) + timers("train/valid/test data iterators").stop() + + if neox_args.use_mup and neox_args.coord_check: + mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator) + + # Print setup timing. 
+ print_rank_0("done with setups ...") + timers.log(["model and optimizer", "train/valid/test data iterators"]) + print_rank_0("training ...") + + iteration = neox_args.iteration + if neox_args.do_train and neox_args.train_iters > 0: + # edge case: save step 0 checkpoint if requested and we're starting from step 0 + if neox_args.save and 0 in neox_args.save_iters and iteration == 0: + save_checkpoint( + neox_args=neox_args, + iteration=iteration, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + iteration = train( + neox_args=neox_args, + timers=timers, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + train_data_iterator=train_data_iterator, + valid_data_iterator=valid_data_iterator, + ) + + if neox_args.do_valid: + prefix = "the end of training for val data" + evaluate_and_print_results( + neox_args=neox_args, + prefix=prefix, + forward_step_func=forward_step, + data_iterator=valid_data_iterator, + model=model, + iteration=iteration, + verbose=False, + timers=timers, + ) + + if neox_args.save and iteration != 0: + save_checkpoint( + neox_args=neox_args, + iteration=iteration, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + if neox_args.do_test: + # Run on test data. + prefix = "the end of training for test data" + evaluate_and_print_results( + neox_args=neox_args, + prefix=prefix, + forward_step_func=forward_step, + data_iterator=test_data_iterator, + model=model, + iteration=iteration, + verbose=True, + timers=timers, + chart_name="test", + ) + + +def _get_batch(neox_args, tokenizer, keys, data, datatype): + """Support function for get_batch / get_batch pipe (to avoid code repetition)""" + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and position ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=neox_args.tokenizer.eod, + eod_mask_loss=neox_args.eod_mask_loss, + ) + + return tokens, labels, loss_mask, attention_mask, position_ids + + +def get_batch(neox_args, data_iterator): + """Generate a batch""" + + # Items and their type. + keys = ["text"] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + return _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + + +def get_batch_pipe(data, neox_args, curr_scheduler=None): + """A modification of get_batch() to work with the latest batch instead of an iterator.""" + # Items and their type. 
+ keys = ["text"] + datatype = torch.int64 + + tokens, labels, loss_mask, attention_mask, position_ids = _get_batch( + neox_args, neox_args.tokenizer, keys, data, datatype + ) + if curr_scheduler is not None: + # iteration + 1 to align with how/when DeepSpeed updates the buffers + curriculum_seqlen = curr_scheduler.update_difficulty(neox_args.iteration + 1) + if curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # input_ids, position_ids, labels have size [batch size, seqlen] + # input_ids = input_ids[:, :curriculum_seqlen].contiguous() + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, :curriculum_seqlen].contiguous() + if loss_mask is not None: + loss_mask = loss_mask[:, :curriculum_seqlen].contiguous() + # attention_mask has size [1, 1, seqlen, seqlen] + attention_mask = attention_mask[ + :, :, :curriculum_seqlen, :curriculum_seqlen + ].contiguous() + + # unpack data + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def forward_step( + data_iterator, model, neox_args, timers, return_logits=False, is_train=False +): + """Forward step.""" + if neox_args.is_pipe_parallel: + return model.eval_batch(data_iterator, return_logits=return_logits) + + # Get the batch. + if timers is not None: + timers("batch generator").start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) + + if timers is not None: + timers("batch generator").stop() + + outputs = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if ( + is_train + and neox_args.curriculum_learning + and neox_args.curriculum_seqlen < neox_args.seq_length + ): + loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() + labels = labels[:, : neox_args.curriculum_seqlen].contiguous() + loss = cross_entropy( + outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy + ) + if return_logits: + return loss, outputs + return loss + + +def get_model(neox_args, use_cache=False): + """Build the model.""" + + # Build model on cpu. + print_rank_0("building GPT2 model ...") + + # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below. + # If mup isn't being used anyways, this has no effect. 
+ old_use_mup = neox_args.use_mup + neox_args.use_mup = False + model = GPT2ModelPipe( + neox_args=neox_args, + num_tokentypes=0, + parallel_output=True, + topology=mpu.get_topology(), + use_cache=use_cache, + ) + + ### soft prompt tuning stuff ### + if neox_args.soft_prompt_tuning is not None and neox_args.soft_prompt_tuning.get( + "enabled", False + ): + soft_prompt = SoftEmbedding( + neox_args, + wte=getattr(model, "0").word_embeddings, + n_tokens=neox_args.soft_prompt_tuning.get("n_tokens", 10), + init_string=neox_args.soft_prompt_tuning.get("init_string", ""), + init_range=neox_args.soft_prompt_tuning.get("init_range", 0.5), + ) + model.insert_layers( + layers=soft_prompt, idx=1 + ) # insert the soft prompt layer directly after the word embeddings + + # freeze everything but the soft prompt + for name, param in model.named_parameters(): + if not "soft_embedding" in name: + param.requires_grad = False + + if not neox_args.is_pipe_parallel: + # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training + model = model.to_sequential() + + neox_args.use_mup = old_use_mup + + if neox_args.use_mup: + try: + import mup + except ModuleNotFoundError: + print("Please install mup https://github.com/microsoft/mup") + raise Exception + + base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}" + + if neox_args.save_base_shapes: + save_base_shapes(neox_args, base_shapes, use_cache) + + mup.set_base_shapes(model, base_shapes) + + # Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute + mup_weights_reinit(neox_args, model) + + if neox_args.deepspeed: + # DeepSpeed handles CUDA, FP16, and DDP components. + return model + else: + raise ValueError("Must be using deepspeed to run neox") + + +def get_optimizer(model, neox_args): + """Set up the optimizer.""" + if neox_args.no_load_optim: + return None, None + + if neox_args.optimizer is None: + print_rank_0( + f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)" + ) + exit() + # Build parameter groups (weight decay and non-decay). + param_groups = get_params_for_weight_decay_optimization(model, neox_args) + print_rank_0( + f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}' + ) + + # Add model parallel attribute if it is not set. + for param_group in param_groups: + for param in param_group["params"]: + if not hasattr(param, "model_parallel"): + param.model_parallel = False + + # Filter out params that don't require a grad (for soft prompt tuning, etc.) + _param_groups = [] + for param_group in param_groups: + trainable_params = [p for p in param_group["params"] if p.requires_grad] + param_group["params"] = trainable_params + _param_groups.append(param_group) + param_groups = _param_groups + + # If we're using mup, then the optimizer must be adam or sgd + assert not neox_args.use_mup or ( + neox_args.optimizer_type.lower() == "adam" + or neox_args.optimizer_type.lower() == "sgd" + ), f"If use_mup == True, you must specify either the adam or sgd optimizers. 
You passed: {neox_args.optimizer_type.lower()}" + + if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]: + if neox_args.optimizer == "cpu_torch_adam": + cpu_adam_optimizer = torch.optim.Adam + else: + from deepspeed.ops.adam import DeepSpeedCPUAdam + + cpu_adam_optimizer = DeepSpeedCPUAdam + optimizer = cpu_adam_optimizer( + param_groups, + weight_decay=neox_args.weight_decay, + **neox_args.optimizer["params"], + ) + elif neox_args.optimizer_type.lower() == "onebitadam": + assert neox_args.deepspeed + optimizer = None + # onebitadam needs to be instantiated within the deepspeed engine to work :| + elif neox_args.optimizer_type.lower() == "sm3": + from .optimizers import SM3 + + optimizer = SM3(param_groups, **neox_args.optimizer["params"]) + elif neox_args.optimizer_type.lower() == "madgrad_wd": + from .optimizers import madgrad_wd + + optimizer = madgrad_wd( + param_groups, + weight_decay=neox_args.weight_decay, + **neox_args.optimizer["params"], + ) + elif neox_args.optimizer_type.lower() == "adam": + # Use Adam + if neox_args.use_mup: + try: + from mup import MuAdam + + adam_optimizer = MuAdam + except ModuleNotFoundError: + print("Please install mup https://github.com/microsoft/mup") + raise Exception + else: + if neox_args.use_bnb_optimizer: + try: + import bitsandbytes as bnb + + adam_optimizer = bnb.optim.Adam8bit + except ModuleNotFoundError: + print( + "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes." + ) + raise Exception + else: + try: + # default to apex as it's slightly faster + from apex.optimizers import FusedAdam as Adam + except ImportError: + # if apex isn't installed, use deepspeed's FusedAdam + print( + "WARNING: APEX not installed - defaulting to deepspeed's fused adam" + ) + from deepspeed.ops.adam import FusedAdam as Adam + adam_optimizer = Adam + optimizer = adam_optimizer( + param_groups, + weight_decay=neox_args.weight_decay, + **neox_args.optimizer["params"], + ) + elif neox_args.optimizer_type.lower() == "sgd": + try: + from mup import MuSGD + except ModuleNotFoundError: + print("Please install mup https://github.com/microsoft/mup") + raise Exception + optimizer = MuSGD( + param_groups, + weight_decay=neox_args.weight_decay, + **neox_args.optimizer["params"], + ) + else: + raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized") + + if neox_args.deepspeed: + # fp16 wrapper is not required for DeepSpeed. + return optimizer, param_groups + else: + raise ValueError("Must be using deepspeed to run neox") + + +def get_learning_rate_scheduler(optimizer, neox_args): + """Build the learning rate scheduler.""" + if neox_args.no_load_optim: + # TODO: this should be configured as a separate arg + return None + if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam": + print_rank_0( + "WARNING: onebitadam requires the lr scheduler be built by deepspeed - " + "Make sure one is added to your deepspeed config" + ) + return None + + # Add linear learning rate scheduler. 
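+ # Decay is scheduled over lr_decay_iters if given, otherwise over the full train_iters;
+ # warmup covers a fraction of that span (e.g. warmup=0.01 with 20000 iters -> 200 warmup steps).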
+ if neox_args.lr_decay_iters is not None: + num_iters = neox_args.lr_decay_iters + else: + num_iters = neox_args.train_iters + num_iters = max(1, num_iters) + init_step = 0 + warmup_iter = neox_args.warmup * num_iters + lr_scheduler = AnnealingLR( + optimizer, + start_lr=neox_args.lr, + warmup_iter=warmup_iter, + total_iters=num_iters, + decay_style=neox_args.lr_decay_style, + last_iter=init_step, + min_lr=neox_args.min_lr, + use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, + override_lr_scheduler=neox_args.override_lr_scheduler, + use_mup=neox_args.use_mup, + ) + + return lr_scheduler + + +def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): + """Setup model and optimizer.""" + model = get_model(neox_args=neox_args, use_cache=use_cache) + optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) + lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) + + if neox_args.deepspeed: + print_rank_0("DeepSpeed is enabled.") + if neox_args.no_load_optim: + assert optimizer is None + _model_params = None + _lr_scheduler = None + else: + _model_params = param_groups if optimizer is None else None + _lr_scheduler = lr_scheduler + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + args=neox_args, + lr_scheduler=_lr_scheduler, + dist_init_required=False, + model_parameters=_model_params, + # Need to remove the below so that it doesn't conflict with --deepspeed_config required by autotuning + # config_params=neox_args.deepspeed_config, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) + model.total_params = get_total_params(model.module) + print_rank_0(f' > total params: {"{:,}".format(model.total_params)}') + + if neox_args.is_pipe_parallel: + model.set_has_attention_mask(True) + if neox_args.curriculum_learning: + curr_scheduler = CurriculumScheduler(neox_args.curriculum_learning) + if iteration is not None and iteration > 0: + curr_scheduler.update_difficulty(iteration) + else: + curr_scheduler = None + model.set_batch_fn( + partial( + get_batch_pipe, neox_args=neox_args, curr_scheduler=curr_scheduler + ) + ) + else: + raise ValueError("Must be using deepspeed to run neox") + + if neox_args.load is not None: + neox_args.iteration = load_checkpoint( + neox_args=neox_args, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + iteration=iteration, + ) + print_rank_0( + f"Loading checkpoint and starting from iteration {neox_args.iteration}" + ) + else: + neox_args.iteration = 0 + + return model, optimizer, lr_scheduler + + +def backward_step(neox_args, timers, optimizer, model, loss): + """Backward step.""" + + # Backward pass. + timers("backward-backward").start() + if neox_args.deepspeed: + model.backward(loss) + else: + raise ValueError("Must be using deepspeed to run neox") + timers("backward-backward").stop() + + if neox_args.deepspeed: + # DeepSpeed backward propagation already addressed all reduce communication. + # Reset the timer to avoid breaking timer logs below. 
+ timers("backward-allreduce").reset() + else: + raise ValueError("Must be using deepspeed to run neox") + + +def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): + """Single training step.""" + + # Pipeline parallelism schedules forward/backward/step + if neox_args.is_pipe_parallel: + reduced_loss = train_step_pipe( + neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator + ) + else: + losses = [] + for _ in range(neox_args.gradient_accumulation_steps): + # Forward model for one step. + timers("forward").start() + loss = forward_step( + neox_args=neox_args, + timers=timers, + data_iterator=data_iterator, + model=model, + is_train=True, + ) + timers("forward").stop() + losses.append(loss) + # Calculate gradients, reduce across processes, and clip. + timers("backward").start() + backward_step( + neox_args=neox_args, + timers=timers, + optimizer=optimizer, + model=model, + loss=loss, + ) + timers("backward").stop() + # Update parameters. + timers("optimizer").start() + if neox_args.deepspeed: + model.step() + else: + raise ValueError("Must be using deepspeed to run neox") + timers("optimizer").stop() + reduced_loss = { + "lm_loss": reduce_losses(losses).mean() + } # reduces losses across machines for logging + + if neox_args.precision == "fp16" and model.optimizer.overflow: + skipped_iter = 1 + else: + skipped_iter = 0 + + return reduced_loss, skipped_iter + + +def train_step_pipe(neox_args, timers, model, data_iterator): + """Single training step with DeepSpeed's pipeline parallel engine.""" + + assert neox_args.deepspeed + loss = model.train_batch(data_iter=data_iterator) + loss_dict = {"lm_loss": loss} + # Don't break Megatron's timers because we changed code paths. + for t in [ + "forward", + "backward", + "allreduce", + "optimizer", + "batch generator", + "data loader", + ]: + timers(t).reset() + return loss_dict + + +def train( + neox_args, + timers, + model, + optimizer, + lr_scheduler, + train_data_iterator, + valid_data_iterator, +): + """Train the model function.""" + + # Turn on training mode which enables dropout. + model.train() + + # Tracking loss. + total_loss_dict = {} + + # Iterations. + iteration = neox_args.iteration + + timers("interval time").start() + report_memory_flag = True + + # get noise scale logger (if neox_args.log_gradient_noise_scale is True) + noise_scale_logger = get_noise_scale_logger(neox_args) + + # to monitor if we've skipped many iterations in a row and trigger an early exit + overflow_monitor = OverflowMonitor(optimizer) + while iteration < neox_args.train_iters: + loss_dict, skipped_iter = train_step( + neox_args=neox_args, + timers=timers, + data_iterator=train_data_iterator, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + iteration += 1 + neox_args.iteration = iteration + if neox_args.precision == "fp16": + overflow_monitor.check(skipped_iter) # check for repeated overflow + if neox_args.log_gradient_noise_scale: # log noise scale if applicable + noise_scale_logger.update() + + # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you + # may have no tunable parameters on a specific rank + if optimizer.param_groups: + lr = optimizer.param_groups[0].get("lr", 0) + else: + lr = 0 + + # Logging. 
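+ # training_log writes the loss/lr/timing metrics (stdout, wandb, tensorboard) and hands back an
+ # updated report_memory_flag so GPU memory stats are not re-printed on every iteration.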
+ report_memory_flag = training_log( + neox_args=neox_args, + timers=timers, + loss_dict=loss_dict, + total_loss_dict=total_loss_dict, + learning_rate=lr, + iteration=iteration, + loss_scale=optimizer.cur_scale if neox_args.precision == "fp16" else None, + report_memory_flag=report_memory_flag, + skipped_iter=skipped_iter, + model=model, + optimizer=optimizer, + noise_scale_logger=noise_scale_logger, + ) + + # Checkpointing + if neox_args.save and iteration in neox_args.save_iters: + save_checkpoint( + neox_args=neox_args, + iteration=iteration, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + # Evaluation + if ( + neox_args.eval_interval + and iteration % neox_args.eval_interval == 0 + and neox_args.do_valid + ): + prefix = "iteration {}".format(iteration) + evaluate_and_print_results( + neox_args=neox_args, + prefix=prefix, + forward_step_func=forward_step, + data_iterator=valid_data_iterator, + model=model, + iteration=iteration, + verbose=False, + timers=timers, + ) + + if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: + torch.distributed.barrier() + time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + rank = torch.distributed.get_rank() + print_rank_0( + "rank: {} | time: {} | exiting the program at iteration {}".format( + rank, time_str, iteration + ) + ) + sys.exit() + + return iteration + + +def evaluate( + neox_args, forward_step_fn, data_iterator, model, verbose=False, timers=None +): + """Evaluation. + neox_args: NeoX Arguments + forward_step_fn: function with args `neox_args, timers, + data_iterator & model that will run a forward pass on the model + data_iterator: Iterator that iterates over batches of data. Should return data in the form: + {'text': np.array([tokens], dtype=np.int64)} + where the size of the array is the model's context size + 1 + (`get_batch` transforms it into inputs / labels) + """ + # Turn on evaluation mode which disables dropout. 
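+ # (gradients are separately disabled below with torch.no_grad(), so no autograd graph is built)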
+ model.eval() + losses = [] + if neox_args.char_level_ppl: + data_iterator = CharCounter(data_iterator, neox_args.tokenizer) + + with torch.no_grad(): + iteration = 0 + while iteration < neox_args.eval_iters: + iteration += 1 + if verbose and iteration % neox_args.log_interval == 0: + print_rank_0( + "Evaluating iter {}/{}".format(iteration, neox_args.eval_iters) + ) + + # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s + # to be consistent with deepspeed's pipe parallel engine + # since pipe parallel already takes gas into account - default to 1 here if pipe parallel is true + for _ in range( + 1 + if neox_args.is_pipe_parallel + else neox_args.gradient_accumulation_steps + ): + # Forward evaluation + loss = forward_step_fn( + model=model, + data_iterator=data_iterator, + neox_args=neox_args, + timers=timers, + ) + losses.append(loss) + + # When contiguous memory optimizations are enabled, the buffers + # allocated by the optimizations are deallocated during backward pass + # in the absence of backward pass the buffers should be reset after each + # forward pass + if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing: + deepspeed.checkpointing.reset() + + # reduces losses across processes for logging & run eval harness tasks + eval_results = {"lm_loss": reduce_losses(losses).mean().item()} + eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"]) + + if neox_args.char_level_ppl: + # calculate character level perplexity, if specified + # if neox_args.char_level_ppl: + # unwrap the data_iterator + tokens_per_char = data_iterator.tokens_per_char() + print_rank_0(f"Counting chars took {data_iterator.total_time} seconds") + + data_iterator = data_iterator.data_iterator + eval_results["lm_loss_char_lvl_ppl"] = math.exp( + eval_results["lm_loss"] * tokens_per_char + ) + + if neox_args.eval_tasks: + eval_results.update( + run_eval_harness( + model, forward_step_fn, neox_args, eval_tasks=neox_args.eval_tasks + ).get("results") + ) + # Move model back to the train mode. + model.train() + return eval_results + + +def evaluate_and_print_results( + neox_args, + prefix, + forward_step_func, + data_iterator, + model, + iteration, + verbose=False, + timers=None, + chart_name="validation", +): + """Helper function to evaluate and dump results on screen.""" + total_loss_dict = evaluate( + neox_args=neox_args, + forward_step_fn=forward_step_func, + data_iterator=data_iterator, + model=model, + verbose=verbose, + timers=timers, + ) + string = f" {chart_name} results at {prefix} | " + for k, v in total_loss_dict.items(): + if isinstance(v, dict): + for k2, v2 in v.items(): + k3 = "_".join([k, k2]) + string += f"{k3} value: {v2:.6E} | " + tb_wandb_log( + f"{chart_name}/{k3}", + v2, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + else: + string += f"{k} value: {v:.6E} | " + tb_wandb_log( + f"{chart_name}/{k}", + v, + iteration, + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + ) + + length = len(string) + 1 + print_rank_0("-" * length) + print_rank_0(string) + print_rank_0("-" * length) diff --git a/benchmarks/sizing/megatron/utils.py b/benchmarks/sizing/megatron/utils.py new file mode 100644 index 0000000..ebb08f9 --- /dev/null +++ b/benchmarks/sizing/megatron/utils.py @@ -0,0 +1,482 @@ +# Copyright (c) 2021, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. 
+# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""General utilities.""" +import os +import sys +import re +import time +import socket +from typing import Dict, List + +import requests + +try: + import wandb +except ModuleNotFoundError: + pass + +import torch + +from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion + +from megatron import print_rank_0 +from megatron import mpu + +from collections import deque + + +def reduce_losses(losses): + """Reduce a tensor of losses across all GPUs.""" + reduced_losses = torch.cat([loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(reduced_losses) + reduced_losses = reduced_losses / torch.distributed.get_world_size() + return reduced_losses + + +def report_memory(name): + """Simple GPU memory report.""" + mega_bytes = 1024.0 * 1024.0 + string = name + " memory (MB)" + string += " | allocated: {}".format(torch.cuda.memory_allocated() / mega_bytes) + string += " | max allocated: {}".format( + torch.cuda.max_memory_allocated() / mega_bytes + ) + string += " | reserved: {}".format(torch.cuda.memory_reserved() / mega_bytes) + string += " | max reserved: {}".format( + torch.cuda.max_memory_reserved() / mega_bytes + ) + print_rank_0(string) + + +def get_attn_mask(seq_length, device): + """ + Get triangular attention mask for a given sequence length / device. + """ + # lower triangular attention mask + mask = torch.tril(torch.ones((1, seq_length, seq_length), device=device)).view( + 1, 1, seq_length, seq_length + ) + + # convert to binary + return mask < 0.5 + + +def get_ltor_masks_and_position_ids( + data, + eod_token, + eod_mask_loss=False, +): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + attention_mask = get_attn_mask( + seq_length=seq_length, + device=data.device, + ) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. 
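+ # Position ids are simply 0..seq_length-1, broadcast across the batch via expand.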
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + + return attention_mask, loss_mask, position_ids + + +def local_rank(): + """Local rank of process""" + local_rank = os.environ.get("LOCAL_RANK") + + if local_rank is None: + local_rank = os.environ.get("SLURM_LOCALID") + + if local_rank is None: + print( + "utils.local_rank() environment variable LOCAL_RANK not set, defaulting to 0", + flush=True, + ) + local_rank = 0 + return int(local_rank) + + +def is_bnb_available(): + """True if bitsandbytes optimizers are available""" + return importlib.util.find_spec("bitsandbytes") is not None + + +def is_local_main(): + """True if is the local main process""" + return local_rank() == 0 + + +def is_mp_rank_0(): + """True if mp rank == 0""" + return mpu.get_model_parallel_rank() == 0 + + +def get_wandb_api_key(neox_args): + """Get Weights and Biases API key from ENV or .netrc file. Otherwise return None""" + if "WANDB_LOCAL" in os.environ: + return "LOCAL" + if "WANDB_API_KEY" in os.environ: + return os.environ["WANDB_API_KEY"] + + wandb_token = requests.utils.get_netrc_auth(neox_args.wandb_host) + + if wandb_token is not None: + return wandb_token[1] + + +def init_wandb(neox_args): + # Wandb. (one worker per machine) + if neox_args.use_wandb == False: + return + + if not neox_args.wandb_init_all_ranks: + use_wandb = is_local_main() and ( + get_wandb_api_key(neox_args=neox_args) is not None + ) + neox_args.update_value("use_wandb", use_wandb) + if neox_args.use_wandb: + group_name = neox_args.wandb_group + name = f"{socket.gethostname()}-{local_rank()}" if group_name else None + try: + wandb.init( + project=neox_args.wandb_project, + group=group_name, + name=name, + save_code=False, + force=False, + entity=neox_args.wandb_team, + ) + except wandb.UsageError as e: + neox_args.update_value("use_wandb", False) + print(e) + print( + "Skipping wandb. Execute `wandb login` on local or main node machine to enable.", + flush=True, + ) + wandb.config.update(neox_args.all_config) + + +def obtain_resource_pool( + hostfile_path, include_arg, exclude_arg +) -> Dict[str, List[int]]: + """ + Get dict of `resource_pool[hostname] = [list of GPU ranks]` using hostfile, include and exclude args. + Modified from: `deepspeed.launcher.runner.main` + """ + resource_pool = fetch_hostfile(hostfile_path) + if not resource_pool: + resource_pool = {} + device_count = torch.cuda.device_count() + if device_count == 0: + raise RuntimeError("Unable to proceed, no GPU resources available") + resource_pool["localhost"] = device_count + + active_resources = parse_inclusion_exclusion( + resource_pool, include_arg, exclude_arg + ) + return active_resources + + +def natural_sort(l): + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", key)] + return sorted(l, key=alphanum_key) + + +def ddb(rank=0): + """ + Distributed Debugger that will insert a py debugger on rank `rank` and + pause all other distributed processes until debugging is complete. 
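+ Call it from every rank at the point of interest: only rank `rank` drops into pdb, while the
+ remaining ranks wait at the barrier until debugging finishes.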
+ :param rank: + """ + if torch.distributed.get_rank() == rank: + from pdb import Pdb + + pdb = Pdb(skip=["torch.distributed.*"]) + pdb.set_trace(sys._getframe().f_back) + torch.distributed.barrier() + + +class Timer: + """Timer.""" + + def __init__(self, name): + self.name_ = name + self.elapsed_ = 0.0 + self.started_ = False + self.start_time = time.time() + + def start(self): + """Start the timer.""" + assert not self.started_, "timer has already been started" + torch.cuda.synchronize() + self.start_time = time.time() + self.started_ = True + + def stop(self): + """Stop the timer.""" + assert self.started_, "timer is not started" + torch.cuda.synchronize() + self.elapsed_ += time.time() - self.start_time + self.started_ = False + + def reset(self): + """Reset timer.""" + self.elapsed_ = 0.0 + self.started_ = False + + def elapsed(self, reset=True): + """Calculate the elapsed time.""" + started_ = self.started_ + # If the timing in progress, end it first. + if self.started_: + self.stop() + # Get the elapsed time. + elapsed_ = self.elapsed_ + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if started_: + self.start() + return elapsed_ + + +class Timers: + """Group of timers.""" + + def __init__(self, use_wandb, tensorboard_writer): + self.timers = {} + self.use_wandb = use_wandb + self.tensorboard_writer = tensorboard_writer + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = Timer(name) + return self.timers[name] + + def write(self, names, iteration, normalizer=1.0, reset=False): + """Write timers to a tensorboard writer""" + # currently when using add_scalars, + # torch.utils.add_scalars makes each timer its own run, which + # pollutes the runs list, so we just add each as a scalar + assert normalizer > 0.0 + for name in names: + value = self.timers[name].elapsed(reset=reset) / normalizer + + if self.tensorboard_writer: + self.tensorboard_writer.add_scalar(f"timers/{name}", value, iteration) + + if self.use_wandb: + wandb.log({f"timers/{name}": value}, step=iteration) + + def log(self, names, normalizer=1.0, reset=True): + """Log a group of timers.""" + assert normalizer > 0.0 + string = "time (ms)" + for name in names: + elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer + string += " | {}: {:.2f}".format(name, elapsed_time) + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(string, flush=True) + else: + print(string, flush=True) + + +def expand_attention_types(attention_config, num_layers): + """ + Expands an `attention_config` list in the following format: + + [ + [['attention_type_1', ..., `attention_type_n`], 12] + ] + + to a flattened list of length `num_layers`. + + :param params_list: + :return: + """ + # if only strings are found in the config, we assume it's already expanded + if all([isinstance(i, str) for i in attention_config]): + return attention_config + newlist = [] + for item in attention_config: + # instead of specifying a number - we can specify 'all' to extend this pattern across all layers + if item[1] == "all": + assert num_layers % len(item[0]) == 0, ( + f"Number of layers ({num_layers}) is not divisible by the length " + f"of pattern: {item[0]}" + ) + return item[0] * (num_layers // len(item[0])) + for _ in range(item[1]): + newlist.extend(item[0]) + return newlist + + +class OverflowMonitor: + + """ + Checks if the past n iterations have been skipped due to overflow, and exits + training if that happens. 
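+ By default the run is aborted after n=50 consecutive skipped iterations.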
+ """ + + def __init__(self, optimizer, n=50): + self.optimizer = optimizer + self.n = n + self.history = deque(maxlen=n) + + def check(self, skipped): + self.history.append(skipped) + if ( + self.optimizer.overflow + and len(self.history) == self.n + and all(self.history) + ): + raise Exception( + f"Skipped {self.n} iterations in a row due to Overflow - Exiting training." + ) + + +def get_noise_scale_logger(neox_args): + if neox_args.log_gradient_noise_scale: + if neox_args.zero_stage >= 1: + raise NotImplementedError( + "Gradient Noise Scale logging does not work with zero stage 2+, as the " + "gradients are distributed across ranks." + ) + noise_scale_logger = GradientNoiseScale( + model=model, + batch_size_small=neox_args.train_batch_size, + n_batches=neox_args.gradient_noise_scale_n_batches, + cpu_offload=neox_args.gradient_noise_scale_cpu_offload, + neox_args=neox_args, + mpu=mpu, + ) + else: + noise_scale_logger = None + return noise_scale_logger + + +def get_total_params(model): + # Print number of parameters. + if mpu.get_data_parallel_rank() == 0: + params = sum([p.nelement() for p in model.parameters()]) + print( + " > number of parameters on model parallel rank {}: {}".format( + mpu.get_model_parallel_rank(), params + ), + flush=True, + ) + else: + params = 0 + + total_n_parameters = torch.tensor([params]).cuda(torch.cuda.current_device()) + torch.distributed.all_reduce(total_n_parameters) + total_n_parameters = total_n_parameters.item() + return total_n_parameters + + +def setup_for_inference_or_eval( + use_cache=True, + overwrite_values=None, +): + """ + Initializes the model for evaluation or inference (doesn't load optimizer states, etc.) from command line args. + + use_cache: bool + Whether to use key value caching in inference. + overwrite_values: dict + Optional Values to overwrite in the model config. + """ + + from megatron.neox_arguments import NeoXArgs + from megatron.initialize import initialize_megatron + from megatron.training import setup_model_and_optimizer + + _overwrite_values = { + "checkpoint_activations": False, + "partition_activations": False, + "no_load_optim": True, + "zero_optimization": None, # disable zero optimization (won't be used in inference, and loading zero optimizer can cause errors) + } + if overwrite_values: + _overwrite_values.update(overwrite_values) + neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values) + neox_args.configure_distributed_args() + neox_args.build_tokenizer() + + if neox_args.load is None: + raise ValueError("`load` parameter must be supplied to load a model`") + + # initialize wandb + init_wandb(neox_args=neox_args) + + # initialize megatron + initialize_megatron(neox_args) + + # set up model and load checkpoint. 
+ model, _, _ = setup_model_and_optimizer( + neox_args=neox_args, + use_cache=use_cache, + iteration=neox_args.iteration, + ) # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed + print_rank_0("Finished loading model") + + model.module.inference_mode(use_cache=use_cache) + return model, neox_args + + +class CharCounter: + """ + Wraps the data_iterator to count the number of characters in a batch + """ + + def __init__(self, data_iterator, tokenizer): + self.tokenizer = tokenizer + self.data_iterator = data_iterator + self.char_count = 0 + self.batch_count = 0 + self.token_count = 0 + self.total_time = 0 + + def tokens_per_char(self): + return self.token_count / self.char_count + + def __iter__(self): + return self + + def __next__(self): + start = time.time() + batch = self.data_iterator.__next__() + for b in batch["text"]: + self.token_count += len(b) + self.char_count += len(self.tokenizer.detokenize(b.tolist())) + self.batch_count += 1 + end = time.time() + self.total_time += end - start + return batch diff --git a/benchmarks/sizing/megatron_wrapper.py b/benchmarks/sizing/megatron_wrapper.py new file mode 100644 index 0000000..37b3153 --- /dev/null +++ b/benchmarks/sizing/megatron_wrapper.py @@ -0,0 +1,115 @@ +import contextlib +from dataclasses import dataclass,asdict +import os +import torch + +import megatron + + +@dataclass +class Arguments: + precision: str = "fp16" + #fp16 : bool = True + #bf16 : bool = False + apply_query_key_layer_scaling : bool = True + attention_softmax_in_fp32 : bool = False + scaled_masked_softmax_fusion : bool = True + attention_dropout : float = 0.0 + #kv_channels : int = 128 + num_attention_heads : int = 8 + hidden_size : int = 1024 + rank : int = 0 + local_rank : int = 0 + distributed_backend : str = "nccl" + world_size : int = 1 + model_parallel_size : int = 1 + pipe_parallel_size : int = 1 + global_num_gpus : int = 1 + virtual_pipeline_model_parallel_size = None + pipeline_model_parallel_split_rank = None + #no_async_tensor_model_parallel_allreduce : bool = True + seq_length : int = 2048 + train_micro_batch_size_per_gpu : int = 2 + #train_batch_size : int = 2048 + #gradient_accumulation_steps: int = None + use_cpu_initialization : bool = False + params_dtype = torch.float16 + #ffn_hidden_size : int = 4096 + num_layers : int = 2 + bias_gelu_fusion : bool = True + #openai_gelu : bool = False + onnx_safe = None + #apply_residual_connection_post_layernorm : bool = False + #fp32_residual_connection : bool = False + bias_dropout_fusion : bool = True + layernorm_epsilon : float = 1e-5 + hidden_dropout : float = 0.0 + fp16_lm_cross_entropy : bool = False + init_method_std : float = 0.02 + padded_vocab_size : int = 51200 + max_position_embeddings : int = 2048 #originally 1024 + activations_checkpoint_method = None + checkpoint_num_layers : int = 1 + #distribute_checkpointed_activations : bool = False + #no_persist_layer_norm : bool = False + #DDP_impl : str = "local" + #accumulate_allreduce_grads_in_fp32 : bool = False + #use_contiguous_buffers_in_local_ddp : bool = True + optimizer_type : str = "adam" + lr : float = 0.00015 + weight_decay : float = 0.01 + #adam_beta1 : float = 0.9 + #adam_beta2 : float = 0.999 + #adam_eps : float = 1e-08 + loss_scale = None + #initial_loss_scale : int = 4294967296 + #min_loss_scale : float = 1.0 + loss_scale_window : int = 1000 + hysteresis : int = 2 + clip_grad : float = 1.0 + #log_num_zeros_in_grad : bool = False + train_iters : int = 20000 + lr_decay_iters : int = 20000 + lr_decay_style 
: str = "cosine" + #train_batch_size : int = 512 + #lr_warmup_fraction : float = 0.01 + #min_lr : float = 1e-05 + use_checkpoint_lr_scheduler : bool = False + override_lr_scheduler : bool = False + load = None + save : str = None + seed : int = 1234 + #data_parallel_random_init : bool = False + + +def initialize_megatron(configuration): + with open("/dev/null", 'w') as f: + with contextlib.redirect_stdout(f): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "6000" + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + args = get_megatron_args(configuration, override_tensor_mp_size=True) + #megatron.global_vars._GLOBAL_ARGS = args + neox_args = megatron.NeoXArgs.from_dict(asdict(args)) + megatron.initialize._initialize_distributed(neox_args=neox_args) + megatron.initialize._set_random_seed(neox_args.seed) + #megatron.initialize._compile_dependencies() + + +def get_megatron_args(configuration, override_tensor_mp_size=False): + (microbatch_size, hidden_size, (tensor_mp_size, pipeline_mp_size, dp_size), num_attention_heads,vocab_size,seq_length,train_batch_size) = configuration + args = Arguments() + args.params_dtype = torch.half + if not override_tensor_mp_size: + args.tensor_model_parallel_size = tensor_mp_size + args.train_micro_batch_size_per_gpu = microbatch_size + args.hidden_size = hidden_size + args.ffn_hidden_size = 4 * args.hidden_size + args.num_attention_heads = num_attention_heads + args.kv_channels = args.hidden_size // args.num_attention_heads + args.padded_vocab_size=vocab_size + args.attention_config = [[["flash"], 0]] + #megatron.global_vars._GLOBAL_ARGS = args + neox_args = megatron.NeoXArgs.from_dict(asdict(args)) + return neox_args diff --git a/benchmarks/sizing/mm_flops.py b/benchmarks/sizing/mm_flops.py new file mode 100644 index 0000000..96d0ff9 --- /dev/null +++ b/benchmarks/sizing/mm_flops.py @@ -0,0 +1,51 @@ +import time +import torch +import sys +import numpy as np +import argparse + +from utils import benchmark_mm + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + m_group = parser.add_mutually_exclusive_group(required=True) + m_group.add_argument("-m", nargs="+", type=int, help='The first dimension of the GEMM, enter any number of arguments') + m_group.add_argument("--m_range", nargs='+', type=int, help="The first dimension of the GEMM, [start,stop,step]") + + n_group = parser.add_mutually_exclusive_group(required=True) + n_group.add_argument("-n", nargs="*", type=int, help='The shared dimension of the GEMM, enter any number of arguments') + n_group.add_argument("--n_range", nargs='+', type=int, help="The shared dimension of the GEMM, [start,stop,step]") + + k_group = parser.add_mutually_exclusive_group(required=True) + k_group.add_argument("-k", nargs="*", type=int, help='The last dimension of the GEMM, enter any number of arguments') + k_group.add_argument("--k_range", nargs='+', type=int, help="The last dimension of the GEMM, [start,stop,step]") + + parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each GEMM') + parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') + parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") + parser.add_argument("--output_file", type=str, default="../results/mm.out") + args = parser.parse_args() + + m = args.m + n = args.n + k = args.k + + if m is None: + start,stop,step = args.m_range + m = np.arange(start,stop,step) 
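+ # Range arguments are expanded with np.arange, so the stop value is exclusive:
+ # e.g. `--n_range 1024 2048 256` benchmarks n in [1024, 1280, 1536, 1792].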
+ if n is None: + start,stop,step = args.n_range + n = np.arange(start,stop,step) + if k is None: + start,stop,step = args.k_range + k = np.arange(start,stop,step) + + # set cuda device + torch.cuda.set_device(f"cuda:{args.cuda_device}") + + # loop through all sizes to benchmark + with open(args.output_file, 'w') as sys.stdout: + for M in m: + for N in n: + for K in k: + benchmark_mm(M, N, K, args.num_iterations, args.num_warmup_iterations) \ No newline at end of file diff --git a/benchmarks/sizing/plotting/convert_to_csv.py b/benchmarks/sizing/plotting/convert_to_csv.py new file mode 100644 index 0000000..30a9b45 --- /dev/null +++ b/benchmarks/sizing/plotting/convert_to_csv.py @@ -0,0 +1,192 @@ +import argparse +from pylab import * +import pandas as pd + +def read_transformer_logfile(logfile_name): + all_values = [] + value_labels = ["num_attention_heads", "hidden_size", "train_micro_batch_size_per_gpu", "seq_length", "vocab_size", "train_batch_size", "tensor_mp_size", "pipeline_mp_size", "dp_size"] + with open(logfile_name, 'r') as f: + reading_estimate = False + for line in f: + line = line.strip() + if line == "Estimate": + reading_estimate = True + elif line == "Actual": + reading_estimate = False + match = re.match(r'num_attention_heads: (\d+), hidden_size: (\d+), ' + r'train_micro_batch_size_per_gpu: (\d+), seq_length: (\d+), ' + r'vocab_size: (\d+), train_batch_size: (\d+), ' + r'tensor_mp_size: (\d+), pipeline_mp_size: (\d+), ' + r'dp_size: (\d+)', line) + + if match is not None: + values = {} + for i in range(1, 10): + values[value_labels[i-1]] = int(match.group(i)) + all_values.append(values) + + match = re.match(r'Throughput \(in TFLOP/s\) for qkv_transform \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_key_value_query_transform"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for attention_score \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_key_query_prob"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for attention_over_value \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_prob_times_values"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for attention_dropout \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_dropout"] = throughput + + match = re.match(r'Elapsed time for attention_softmax \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_softmax"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for attention_linear_projection \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["attention_linear_projection"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for mlp_h_to_4h \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["mlp_h_to_4h"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for mlp_4h_to_h \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["mlp_4h_to_h"] = throughput + + match = 
re.match(r'Elapsed time for mlp_fused_gelu \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["mlp_fused_gelu"] = throughput + + match = re.match(r'Elapsed time for transformer_add_bias_dropout \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["transformer_add_bias_dropout"] = throughput + + match = re.match(r'Elapsed time for transformer_layer_norm \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["transformer_layer_norm"] = throughput + + match = re.match(r'Throughput \(in TFLOP/s\) for logit_block \((.*)\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(2)) + if reading_estimate: + all_values[-1]["logit_block"] = throughput + + match = re.match(r'Attention duration \(in seconds\): (\d+\.\d+)', line) + if match is not None: + duration = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_attention_duration"] = duration + else: + all_values[-1]["actual_attention_duration"] = duration + + match = re.match(r'Attention throughput \(in TFLOP/s\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_attention_throughput"] = throughput + else: + all_values[-1]["actual_attention_throughput"] = throughput + + match = re.match(r'MLP duration \(in seconds\): (\d+\.\d+)', line) + if match is not None: + duration = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_mlp_duration"] = duration + else: + all_values[-1]["actual_mlp_duration"] = duration + + match = re.match(r'MLP throughput \(in TFLOP/s\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_mlp_throughput"] = throughput + else: + all_values[-1]["actual_mlp_throughput"] = throughput + + match = re.match(r'Transformer duration \(in seconds\): (\d+\.\d+)', line) + if match is not None: + duration = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_duration"] = duration + else: + all_values[-1]["actual_duration"] = duration + match = re.match(r'Transformer throughput \(in TFLOP/s\): (\d+\.\d+)', line) + if match is not None: + throughput = float(match.group(1)) + if reading_estimate: + all_values[-1]["estimated_throughput"] = throughput + else: + all_values[-1]["actual_throughput"] = throughput + return all_values + +def read_mm_logfile(logfile_name): + throughputs = [] + with open(logfile_name, 'r') as f: + for line in f: + line = line.strip() + match = re.match(r'Throughput \(in TFLOP/s\) for (\d+)x(\d+)x(\d+): (\d+\.\d+)', line) + if match is not None: + m, n, k = int(match.group(1)), int(match.group(2)), int(match.group(3)) + throughput = float(match.group(4)) + throughputs.append({'m': m, 'n': n, 'k': k, + 'throughput': throughput}) + return throughputs + +def read_bmm_logfile(logfile_name): + throughputs = [] + with open(logfile_name, 'r') as f: + for line in f: + line = line.strip() + match = re.match(r'Throughput \(in TFLOP/s\) for bmm \((\d+)x(\d+)x(\d+)x(\d+)\): (\d+\.\d+)', line) + if match is not None: + b, m, n, k = int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)) + throughput = float(match.group(5)) + throughputs.append({'b': b, 'm': m, 'n': n, 'k': k, + 'throughput': throughput}) + return throughputs + +def 
to_pandas(filename): + all_values_transformer = read_transformer_logfile(filename) + all_values_mm = read_mm_logfile(filename) + all_values_bmm = read_bmm_logfile(filename) + if len(all_values_transformer) > 0: + df = pd.DataFrame(all_values_transformer) + elif len(all_values_mm) > 0: + df = pd.DataFrame(all_values_mm) + else: + df = pd.DataFrame(all_values_bmm) + return df + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--file_name", type=str, help="Input log file") + parser.add_argument("--output_file", type=str, help="Output csv file") + args = parser.parse_args() + df = to_pandas(args.file_name) + df.to_csv(args.output_file, index=False) \ No newline at end of file diff --git a/benchmarks/sizing/plotting/transformer_figures.ipynb b/benchmarks/sizing/plotting/transformer_figures.ipynb new file mode 100644 index 0000000..2961494 --- /dev/null +++ b/benchmarks/sizing/plotting/transformer_figures.ipynb @@ -0,0 +1,55 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Figures of Transformer Components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f9547c2", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from convert_to_csv import to_pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "ffde8279", + "metadata": {}, + "outputs": [], + "source": [ + "df = to_pandas('../results/mm.out')\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/benchmarks/sizing/results/bmm.out b/benchmarks/sizing/results/bmm.out new file mode 100644 index 0000000..731dd46 --- /dev/null +++ b/benchmarks/sizing/results/bmm.out @@ -0,0 +1,6 @@ +Elapsed time for bmm (128x1024x1024x1024): 0.0012 +Throughput (in TFLOP/s) for bmm (128x1024x1024x1024): 227.295 +-------------------------------------------------------------------------------- +Elapsed time for bmm (128x1024x2048x1024): 0.0023 +Throughput (in TFLOP/s) for bmm (128x1024x2048x1024): 243.258 +-------------------------------------------------------------------------------- diff --git a/benchmarks/sizing/results/mm.out b/benchmarks/sizing/results/mm.out new file mode 100644 index 0000000..9c43458 --- /dev/null +++ b/benchmarks/sizing/results/mm.out @@ -0,0 +1,12 @@ +Elapsed time for 1024x1024x1024: 0.000 +Throughput (in TFLOP/s) for 1024x1024x1024: 59.919 +-------------------------------------------------------------------------------- +Elapsed time for 1024x1280x1024: 0.000 +Throughput (in TFLOP/s) for 1024x1280x1024: 68.985 +-------------------------------------------------------------------------------- +Elapsed time for 1024x1536x1024: 0.000 +Throughput (in TFLOP/s) for 1024x1536x1024: 52.429 +-------------------------------------------------------------------------------- +Elapsed time for 1024x1792x1024: 0.000 +Throughput (in TFLOP/s) for 1024x1792x1024: 63.276 +-------------------------------------------------------------------------------- diff --git a/benchmarks/sizing/results/transformer.out b/benchmarks/sizing/results/transformer.out new file mode 100644 index 
0000000..f9063a3 --- /dev/null +++ b/benchmarks/sizing/results/transformer.out @@ -0,0 +1,33 @@ +num_attention_heads: 16, hidden_size: 4096, train_micro_batch_size_per_gpu: 4, seq_length: 2048, vocab_size: 51200, train_batch_size: 256, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1 + + +Estimate +-------- +Elapsed time for qkv_transform (4x4096x12288, b=2048): 0.0034 +Throughput (in TFLOP/s) for qkv_transform (4x4096x12288, b=2048): 241.616 +Elapsed time for attention_score (64x2048x256x2048): 0.0011 +Throughput (in TFLOP/s) for attention_score (64x2048x256x2048): 130.182 +Elapsed time for attention_over_value (64x2048x2048x256): 0.0006 +Throughput (in TFLOP/s) for attention_over_value (64x2048x2048x256): 218.952 +Elapsed time for attention_dropout (4x16x2048x2048): 0.0012 +Elapsed time for attention_softmax (4x16x2048x2048): 0.0076 +Elapsed time for attention_linear_projection (4x4096x4096, b=2048): 0.0012 +Throughput (in TFLOP/s) for attention_linear_projection (4x4096x4096, b=2048): 234.852 +Elapsed time for mlp_h_to_4h (4x4096x16384, b=2048): 0.0046 +Throughput (in TFLOP/s) for mlp_h_to_4h (4x4096x16384, b=2048): 241.128 +Elapsed time for mlp_fused_gelu (2048x4x16384): 0.0005 +Elapsed time for mlp_4h_to_h (4x16384x4096, b=2048): 0.0043 +Throughput (in TFLOP/s) for mlp_4h_to_h (4x16384x4096, b=2048): 252.764 +Elapsed time for transformer_add_bias_dropout (2048x4x4096): 0.0003 +Elapsed time for transformer_layer_norm (2048x4x4096): 0.0001 +Elapsed time for logit_block (4x51200x4096, b=2048): 0.0135 +Throughput (in TFLOP/s) for logit_block (4x51200x4096, b=2048): 255.380 + +Attention duration (in seconds): 0.0285 +Attention throughput (in TFLOP/s): 48.167 +MLP duration (in seconds): 0.0094 +MLP throughput (in TFLOP/s): 234.186 +Transformer duration (in seconds): 0.0388 +Transformer throughput (in TFLOP/s): 91.991 +Transformer - MLP - Attention (in seconds): 0.0009 +======================================================================================================================== diff --git a/benchmarks/sizing/run_all.sh b/benchmarks/sizing/run_all.sh new file mode 100644 index 0000000..368ba1e --- /dev/null +++ b/benchmarks/sizing/run_all.sh @@ -0,0 +1,4 @@ +python bmm_flops.py +python logit.py +python mm_flops.py +python transformer_flops.py \ No newline at end of file diff --git a/benchmarks/sizing/transformer_flops.py b/benchmarks/sizing/transformer_flops.py new file mode 100644 index 0000000..6287994 --- /dev/null +++ b/benchmarks/sizing/transformer_flops.py @@ -0,0 +1,295 @@ +import time +import torch +import os +import numpy as np +import megatron_wrapper +import megatron +from megatron.model.transformer import ParallelSelfAttention, ParallelMLP, ParallelTransformerLayer +from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func +import sys +from utils import * +import argparse +from megatron.model import LayerNorm +from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes +from megatron.model.transformer import ParallelSelfAttention, ParallelMLP, ParallelTransformerLayer +from megatron.model.transformer import bias_dropout_add_fused_train +from megatron.model.activations import bias_gelu_impl +from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func +from megatron.model.word_embeddings import Embedding + +# benchmarks the individual components of the transformer. 
Will only be used if --blocks is specified and will only benchmark the specified blocks +def benchmark_transformer_from_mm_and_bmm(args, configuration, seq_length, global_batch_size, num_iterations, num_warmup_iterations): + + (microbatch_size, hidden_size, (tensor_mp_size, pipeline_mp_size, dp_size), num_attention_heads,vocab_size,seq_length,train_batch_size) = configuration + print("\n\nEstimate") + print("--------") + elapsed_attention_time = 0.0 + elapsed_mlp_time = 0.0 + elapsed_add_bias_dropout_time = 0.0 + elapsed_layer_norm_time = 0.0 + + if 'qkv_transform' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_mm_b( + microbatch_size, hidden_size, + 3 * hidden_size // tensor_mp_size, + 'qkv_transform', + seq_length, num_iterations, num_warmup_iterations) + if 'attention_score' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_bmm( + microbatch_size * num_attention_heads // tensor_mp_size, + seq_length, hidden_size // num_attention_heads, + seq_length, 'attention_score', + num_iterations, num_warmup_iterations) + if 'attention_over_value' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_bmm( + microbatch_size * num_attention_heads // tensor_mp_size, + seq_length, seq_length, hidden_size // num_attention_heads, + 'attention_over_value', + num_iterations, num_warmup_iterations) + if 'dropout' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_dropout( + (microbatch_size, num_attention_heads // tensor_mp_size, seq_length, seq_length), + 'attention_dropout', + num_iterations, num_warmup_iterations) + if 'softmax' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_softmax( + (microbatch_size, num_attention_heads // tensor_mp_size, seq_length, seq_length), + seq_length, 'attention_softmax', + num_iterations, num_warmup_iterations) + if 'attention_linear_projection' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_mm_b( + microbatch_size, hidden_size // tensor_mp_size, + hidden_size, 'attention_linear_projection', + seq_length, + num_iterations, num_warmup_iterations) + if 'mlp_h_to_4h' in args.blocks or 'all' in args.blocks: + elapsed_mlp_time += benchmark_mm_b( + microbatch_size, hidden_size, + 4 * hidden_size // tensor_mp_size, 'mlp_h_to_4h', + seq_length, + num_iterations, num_warmup_iterations) + if 'gelu' in args.blocks or 'all' in args.blocks: + elapsed_mlp_time += benchmark_fused_gelu( + (seq_length, microbatch_size, 4 * hidden_size // tensor_mp_size), + (4 * hidden_size // tensor_mp_size,), + 'mlp_fused_gelu', num_iterations, num_warmup_iterations) + if 'mlp_4h_to_h' in args.blocks or 'all' in args.blocks: + elapsed_mlp_time += benchmark_mm_b( + microbatch_size, 4 * hidden_size // tensor_mp_size, + hidden_size, 'mlp_4h_to_h', + seq_length, + num_iterations, num_warmup_iterations) + if 'add_bias_dropout' in args.blocks or 'all' in args.blocks: + elapsed_add_bias_dropout_time = 2 * benchmark_add_bias_dropout( + (seq_length, microbatch_size, hidden_size), + 'transformer_add_bias_dropout', + num_iterations, num_warmup_iterations) + if 'layer_norm' in args.blocks or 'all' in args.blocks: + elapsed_layer_norm_time = 2 * benchmark_layer_norm( + (seq_length, microbatch_size, hidden_size), + hidden_size, + 'transformer_layer_norm', + num_iterations, num_warmup_iterations) + if 'logit_block' in args.blocks or 'all' in args.blocks: + elapsed_attention_time += benchmark_mm_b( + microbatch_size, vocab_size, + hidden_size,
'logit_block', + seq_length, + num_iterations, num_warmup_iterations) + + elapsed_total_time = elapsed_attention_time + elapsed_mlp_time + elapsed_add_bias_dropout_time + \ + elapsed_layer_norm_time + + num_attention_floating_point_operations = \ + (4 * microbatch_size * seq_length * hidden_size / tensor_mp_size) * ( + 2 * hidden_size + seq_length) + num_mlp_floating_point_operations = \ + 16 * microbatch_size * seq_length * hidden_size * hidden_size / tensor_mp_size + num_total_floating_point_operations = num_attention_floating_point_operations + \ + num_mlp_floating_point_operations + attention_throughput = num_attention_floating_point_operations / (elapsed_attention_time * 10**12) + mlp_throughput = num_mlp_floating_point_operations / (elapsed_mlp_time * 10**12) + total_throughput = num_total_floating_point_operations / (elapsed_total_time * 10**12) + + print() + for (elapsed_time, throughput, label) in \ + zip([elapsed_attention_time, elapsed_mlp_time, elapsed_total_time], + [attention_throughput, mlp_throughput, total_throughput], + ["Attention", "MLP", "Transformer"]): + print(f"{label} duration (in seconds): {elapsed_time:.4f}") + print(f"{label} throughput (in TFLOP/s): {throughput:.3f}") + print("Transformer - MLP - Attention (in seconds): " + f"{(elapsed_total_time - elapsed_attention_time - elapsed_mlp_time):.4f}") + + num_microbatches_in_pipeline = global_batch_size // (microbatch_size * dp_size) + pipeline_bubble_fraction = (pipeline_mp_size - 1) / num_microbatches_in_pipeline + elapsed_time *= (1 + pipeline_bubble_fraction) + # Throughput if considering pipeline bubble. + throughput = num_total_floating_point_operations / (elapsed_time * 10**12 / 10**3) + +# benchmarks the entire transformer using megatron +def benchmark_transformer(c_args,configuration, seq_length, global_batch_size, num_iterations,num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + (microbatch_size, hidden_size, + (tensor_mp_size, pipeline_mp_size, dp_size), num_attention_heads,vocab_size,seq_length,train_batch_size) = configuration + print("\n\nActual") + print("------") + + args = megatron_wrapper.get_megatron_args(configuration) + fn_args = [megatron.model.init_functions.init_method_normal(args.init_method_std), + megatron.model.init_functions.init_method_normal(args.init_method_std)] + init_method = megatron.model.init_functions.init_method_normal(args.init_method_std) + if c_args.use_flash: + args.attention_config=["flash","global"] + attention_layer = ParallelSelfAttention(args,attention_mask_func=attention_mask_func, init_method=init_method,output_layer_init_method=init_method, layer_number=0).half().to("cuda") + mlp_layer = ParallelMLP(args,init_method=init_method,output_layer_init_method=init_method).half().to("cuda") + transformer_layer = ParallelTransformerLayer(args,attention_mask_func=attention_mask_func,init_method=init_method,output_layer_init_method=init_method,layer_number=0).half().to("cuda") + inp = torch.randn((args.seq_length, args.batch_size, args.hidden_size)).half().to("cuda") + attention_mask = torch.tril(torch.ones( + (1, args.seq_length, args.seq_length), device="cuda")).view( + 1, 1, args.seq_length, args.seq_length) + attention_mask = attention_mask < 0.5 + + num_embedding_floating_point_operations = \ + (2*vocab_size -1) * seq_length * microbatch_size * hidden_size + num_attention_floating_point_operations = \ + (4 * microbatch_size * seq_length * hidden_size / tensor_mp_size) * ( + 2 * hidden_size + seq_length) 
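+    # How the FLOP counts above and below are derived (forward-pass GEMMs only, per layer,
+    # per tensor-parallel rank, with b = microbatch_size, s = seq_length, h = hidden_size):
+    # the QKV projection costs 2*b*s*h*(3h), Q*K^T and attention-over-values cost
+    # 2*b*s^2*h each, and the output projection costs 2*b*s*h^2, giving
+    # 8*b*s*h^2 + 4*b*s^2*h = 4*b*s*h*(2h + s) for attention. The MLP count below is
+    # 2*b*s*h*(4h) + 2*b*s*(4h)*h = 16*b*s*h^2. Both are divided by tensor_mp_size.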
+ num_mlp_floating_point_operations = \ + 16 * microbatch_size * seq_length * hidden_size * hidden_size / tensor_mp_size + num_total_floating_point_operations = num_attention_floating_point_operations + \ + num_mlp_floating_point_operations + + for layer, label, need_attention_mask, num_floating_point_operations in \ + zip([ attention_layer, mlp_layer, transformer_layer], + [ "Attention", "MLP", "Transformer"], + [ True, False, True], + [num_attention_floating_point_operations, num_mlp_floating_point_operations, + num_total_floating_point_operations]): + layer.train() + + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + if need_attention_mask: + out = layer(inp, attention_mask) + torch.cuda.empty_cache() + else: + out = layer(inp) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 # get to seconds from milliseconds + + throughput = num_floating_point_operations / (elapsed_time * 10**12) + print(f"{label} duration (in seconds): {elapsed_time:.4f}") + print(f"{label} throughput (in TFLOP/s): {throughput:.3f}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + h_group = parser.add_mutually_exclusive_group(required=True) + h_group.add_argument("--hidden_size", nargs="+", type=int, help='The hidden dimension, enter any number of arguments') + h_group.add_argument("--hidden_size_range", nargs='+', type=int, help="The hidden dimension, [start,stop,step]") + + a_group = parser.add_mutually_exclusive_group(required=True) + a_group.add_argument("--num_attention_heads", nargs="+", type=int, help='The number of attention heads, enter any number of arguments') + a_group.add_argument("--num_attention_heads_range", nargs='+', type=int, help="The number of attention heads, [start,stop,step]") + + v_group = parser.add_mutually_exclusive_group(required=True) + v_group.add_argument("--vocab_size", nargs="+", type=int, help='The vocabulary size, enter any number of arguments') + v_group.add_argument("--vocab_size_range", nargs='+', type=int, help="The vocabulary size, [start,stop,step]") + + s_group = parser.add_mutually_exclusive_group(required=True) + s_group.add_argument("--seq_length", nargs="+", type=int, help='The sequence length, enter any number of arguments') + s_group.add_argument("--seq_length_range", nargs='+', type=int, help="The sequence length, [start,stop,step]") + + b_group = parser.add_mutually_exclusive_group(required=True) + b_group.add_argument("--microbatch_size", nargs="+", type=int, help='The microbatch size, enter any number of arguments') + b_group.add_argument("--microbatch_size_range", nargs='+', type=int, help="The microbatch size, [start,stop,step]") + + gb_group = parser.add_mutually_exclusive_group(required=True) + gb_group.add_argument("--global_batch_size", nargs="+", type=int, help='The global batch size, enter any number of arguments') + gb_group.add_argument("--global_batch_size_range", nargs='+', type=int, help="The global batch size, [start,stop,step]") + + t_group = parser.add_mutually_exclusive_group(required=True) + t_group.add_argument("--tensor_mp_size", nargs="+", type=int, help='The tensor parallel size, enter any number of arguments') + t_group.add_argument("--tensor_mp_size_range", nargs='+', type=int, help="The tensor parallel size, [start,stop,step]") + + parser.add_argument("--blocks", nargs="+", type=str, help='The transformer 
blocks to benchmark, enter "all" or any number of [qkv_transform, attention_score, \ + attention_over_value, attention_linear_projection, mlp_h_to_4h, mlp_4h_to_h, logit_block, layer_norm, dropout, add_bias_dropout, softmax, gelu]') + + parser.add_argument("--use_flash", action="store_true", help="Use flash attention") + parser.add_argument("--num_iterations", type=int, default=200, help='The number of iterations used to benchmark each BMM') + parser.add_argument("--num_warmup_iterations", type=int, default=50, help='The number of warmup iterations') + parser.add_argument("--cuda_device", type=int, default=0, help="The cuda device to run the benchmark on") + parser.add_argument("--output_file", type=str, default="../results/transformer.out") + args = parser.parse_args() + + h = args.hidden_size + a = args.num_attention_heads + v = args.vocab_size + s = args.seq_length + t = args.tensor_mp_size + b = args.microbatch_size + global_batch_size = args.global_batch_size + + if h is None: + start,stop,step = args.hidden_size_range + h = np.arange(start,stop,step) + if a is None: + start,stop,step = args.num_attention_heads_range + a = np.arange(start,stop,step) + if v is None: + start,stop,step = args.vocab_size_range + v = np.arange(start,stop,step) + if s is None: + start,stop,step = args.seq_length_range + s = np.arange(start,stop,step) + if t is None: + start,stop,step = args.tensor_mp_size_range + t = np.arange(start,stop,step) + if b is None: + start,stop,step = args.microbatch_size_range + b = np.arange(start,stop,step) + if global_batch_size is None: + start,stop,step = args.global_batch_size_range + global_batch_size = np.arange(start,stop,step) + + torch.cuda.set_device(f"cuda:{args.cuda_device}") + with open(args.output_file, 'w') as sys.stdout: + + configurations = [] + for train_batch_size in global_batch_size: + for seq_length in s: + for tensor_mp_size in t: + for num_attention_heads in a: + for hidden_size in h: + for microbatch_size in b: + for vocab_size in v: + configurations.append((microbatch_size, hidden_size, + (tensor_mp_size, 1, 1), num_attention_heads,vocab_size,seq_length,train_batch_size)) + megatron_wrapper.initialize_megatron(configurations[0]) + for configuration in configurations: + (microbatch_size, hidden_size, + (tensor_mp_size, pipeline_mp_size, dp_size), num_attention_heads,vocab_size,seq_length,train_batch_size) = configuration + label = {'num_attention_heads': num_attention_heads, + 'hidden_size': hidden_size, + 'train_micro_batch_size_per_gpu': microbatch_size, + 'seq_length': seq_length, + 'vocab_size': vocab_size, + 'train_batch_size': train_batch_size, + 'tensor_mp_size': tensor_mp_size, + 'pipeline_mp_size': pipeline_mp_size, + 'dp_size': dp_size} + label_str = ", ".join([f"{k}: {v}" for (k, v) in label.items()]) + print(label_str) + if args.blocks is None: + benchmark_transformer(args,configuration, seq_length, train_batch_size, args.num_iterations, args.num_warmup_iterations) + else: + benchmark_transformer_from_mm_and_bmm(args,configuration, seq_length, train_batch_size, args.num_iterations, args.num_warmup_iterations) + print("=" * 120) diff --git a/benchmarks/sizing/utils.py b/benchmarks/sizing/utils.py new file mode 100644 index 0000000..3abd5b1 --- /dev/null +++ b/benchmarks/sizing/utils.py @@ -0,0 +1,181 @@ +import torch +import numpy as np +from megatron.model import LayerNorm +from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes +from megatron.model.transformer import ParallelSelfAttention, ParallelMLP, 
ParallelTransformerLayer +from megatron.model.transformer import bias_dropout_add_fused_train +from megatron.model.activations import bias_gelu_impl +from megatron.model.gpt2_model import gpt2_attention_mask_func as attention_mask_func +from megatron.model.word_embeddings import Embedding + +def display(shape): + return "x".join([str(dim) for dim in shape]) + +# Benchmark of a basic GEMM +def benchmark_mm(m, n, k, num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn(m, n).half().to("cuda") + B = torch.randn(n, k).half().to("cuda") + C = torch.empty(m, k).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + torch.mm(A, B, out=C) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {m}x{n}x{k}: {elapsed_time:.3f}") + print(f"Throughput (in TFLOP/s) for {m}x{n}x{k}: {(2 * m * n * k) / (elapsed_time * 10**12):.3f}") + print("-" * 80) + return elapsed_time + +# Benchmark of a GEMM with a single batched operator +def benchmark_mm_b(m, n, k, label, b, num_iterations,num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + B = torch.randn((k, n)).half().to("cuda") + if b is None: + A = torch.randn((m, n)).half().to("cuda") + C = torch.empty((m, k)).half().to("cuda") + b = 1 + else: + A = torch.randn((b, m, n)).half().to("cuda") + C = torch.empty((b, m, k)).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + torch.nn.functional.linear(A, B, out=C) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({m}x{n}x{k}, b={b}): {elapsed_time :.4f}") + print(f"Throughput (in TFLOP/s) for {label} ({m}x{n}x{k}, b={b}): " + f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}") + return elapsed_time + +def benchmark_bmm(b, m, n, k, label,num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn((b, m, n)).half().to("cuda") + B = torch.randn((b, n, k)).half().to("cuda") + C = torch.empty((b, m, k)).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + torch.bmm(A, B, out=C) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({b}x{m}x{n}x{k}): {elapsed_time :.4f}") + print(f"Throughput (in TFLOP/s) for {label} ({b}x{m}x{n}x{k}): " + f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}") + return elapsed_time + +def benchmark_dropout(A_dim, label, num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn(A_dim).half().to("cuda") + dropout = torch.nn.Dropout(0.5).to("cuda") + + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + 
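+            # Timing pattern used throughout this file: CUDA events bracket the op, and
+            # torch.cuda.synchronize() below ensures the GPU work has finished before
+            # start.elapsed_time(end) (reported in milliseconds) is read.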
start.record() + dropout(A) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}") + return elapsed_time + +def benchmark_softmax(scores_shape, seq_length, label, num_iterations,num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + scores = torch.randn(scores_shape).half().to("cuda") + attention_mask = torch.tril(torch.ones( + (1, seq_length, seq_length), device="cuda")).view( + 1, 1, seq_length, seq_length) + attention_mask = attention_mask < 0.5 + softmax = FusedScaleMaskSoftmax( + True, False, + SoftmaxFusionTypes.none, #attentionmasktype.padding=1,True + attention_mask_func, True, 1) + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + softmax(scores, attention_mask) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({display(scores_shape)}): {elapsed_time :.4f}") + return elapsed_time + +def benchmark_fused_gelu(A_dim, b_dim, label, num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn(A_dim).half().to("cuda") + b = torch.randn(b_dim).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + bias_gelu_impl(A, b) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}") + return elapsed_time + +def benchmark_layer_norm(A_dim, normalized_shape, label, num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn(A_dim).half().to("cuda") + layer_norm = LayerNorm(normalized_shape).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + layer_norm(A) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}") + return elapsed_time + +def benchmark_add_bias_dropout(shape, label, num_iterations, num_warmup_iterations): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + A = torch.randn(shape).half().to("cuda") + bias = torch.randn(shape).half().to("cuda") + residue = torch.randn(shape).half().to("cuda") + times = np.zeros(num_iterations+num_warmup_iterations) + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + bias_dropout_add_fused_train(A, bias, residue, 0.0) + end.record() + torch.cuda.synchronize() + times[i] = start.elapsed_time(end) + times = times[num_warmup_iterations:] + elapsed_time = np.amax(times)/1000 + print(f"Elapsed time for {label} ({display(shape)}): {elapsed_time :.4f}") + return elapsed_time
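+
+if __name__ == "__main__":
+    # Hypothetical smoke test, not used by the benchmark scripts themselves: time the same
+    # 1024x1024x1024 GEMM that appears as the first entry of results/mm.out, with 200 timed
+    # iterations after 50 warmup iterations.
+    benchmark_mm(1024, 1024, 1024, num_iterations=200, num_warmup_iterations=50)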