apple · apoorvtintin · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 24, 2024
@@ -1176,7 +1176,7 @@ def create_device_mesh(
     # Check if the devices are part of a multi-granule configuration.
     # <https://github.com/google/jax/blob/b81b79c1b0d2ec/jax/experimental/mesh_utils.py#L313>
     device_platform = devices[0].platform
-    attr = "process_index" if device_platform != "tpu" else "slice_index"
+    attr = "process_index" if device_platform == "gpu" else "slice_index"
     is_multi_granule_env = hasattr(devices[0], attr)
     if not all(el.platform == device_platform for el in devices):
         raise NotImplementedError(f"Not all devices had platform: {device_platform}.")
@@ -1193,6 +1193,10 @@ def create_device_mesh(
         logging.warning("Falling back to ICI-only mesh on GPU, performance may be reduced.")
         return build_standard_mesh(mesh_shape, devices=devices)
 
+    # Neuron also only uses the standard mesh.
+    if device_platform == "neuron":
+        return build_standard_mesh(mesh_shape, devices=devices)
+
     # We only break the first device axis (the least communication intensive) across granules.
     assert (
         ici_mesh_shape[0] % num_granules == 0

@@ -13,7 +13,9 @@
 import math
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
+import jax
 import jax.numpy as jnp
+import numpy as np
 import tensorflow as tf
 from jax.sharding import PartitionSpec
 
@@ -267,12 +269,17 @@ def model_config(
         batch_axis_names=batch_axis_names,
         seq_axis_names="seq",
     )
+
+    device_platform = np.asarray(jax.devices())[0].platform
+    # Trainium will have FSDP support soon; for now use ZeRO-3 (a one-element tuple, not a str).
+    fsdp_axis_names = ("expert", "fsdp", "seq") if device_platform != "neuron" else ("data",)
+
     cfg.dtype = jnp.float32
     # Shard some FFN and attention weights over multiple axes.
     set_double_shard_weights_config(
         cfg.decoder.transformer.layer,
         batch_axis_names=batch_axis_names,
-        fsdp_axis_names=("expert", "fsdp", "seq"),
+        fsdp_axis_names=fsdp_axis_names,
         tp_axis_names="model",
         seq_axis_names=("seq",),
     )

@@ -103,7 +103,6 @@ def get_trainer_kwargs(
         num_kv_heads = 8
 
     rope_theta = ROPE_THETA[version]
-
     # dict() is more readable here.
     # pylint: disable=use-dict-literal
     if model_size == "test":
@@ -167,6 +166,10 @@ def get_trainer_kwargs(
                     "gpu-(p5.48xlarge|p4de.24xlarge)-(256|512|1024)",
                     mesh_shape_from_axes(data=-1, fsdp=8),
                 ),
+                (
+                    "neuron-(trn1.32xlarge|trn1n.32xlarge)-(32|64|256|512|1024|2048)",
+                    mesh_shape_from_axes(data=-1, model=8),
+                ),
             ),
         )
     elif model_size == "70B":