Neuron support in Axlearn #566

Open

wants to merge 4 commits into base: main
Changes from 3 commits
4 changes: 4 additions & 0 deletions axlearn/common/utils.py
@@ -1193,6 +1193,10 @@ def create_device_mesh(
        logging.warning("Falling back to ICI-only mesh on GPU, performance may be reduced.")
        return build_standard_mesh(mesh_shape, devices=devices)

    # Neuron also uses only the standard mesh.
    if device_platform == "neuron":
        return build_standard_mesh(mesh_shape, devices=devices)

    # We only break the first device axis (the least communication intensive) across granules.
    assert (
        ici_mesh_shape[0] % num_granules == 0
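For readers unfamiliar with the helper: the standard mesh simply arranges the flat device list into the requested logical shape, with no granule-aware axis splitting. Below is a minimal sketch of that behavior using JAX's mesh_utils; build_standard_mesh_sketch is an illustrative name, and AXLearn's actual build_standard_mesh may differ in detail.

import jax
from jax.experimental import mesh_utils

def build_standard_mesh_sketch(mesh_shape, *, devices=None):
    # Arrange the flat device list into the requested logical mesh shape.
    # This mirrors the fallback path taken for Neuron above: no attempt is
    # made to split an axis across granules (slices/nodes).
    if devices is None:
        devices = jax.devices()
    return mesh_utils.create_device_mesh(mesh_shape, devices=devices)

On a single trn1.32xlarge host this would, for example, arrange 32 NeuronCores into a (4, 8) mesh.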
9 changes: 8 additions & 1 deletion axlearn/experiments/text/gpt/common.py
@@ -11,8 +11,10 @@
"""

import math
import numpy as np
from typing import Dict, List, Optional, Sequence, Tuple, Union

import jax
import jax.numpy as jnp
import tensorflow as tf
from jax.sharding import PartitionSpec
@@ -267,12 +269,17 @@ def model_config(
        batch_axis_names=batch_axis_names,
        seq_axis_names="seq",
    )

    device_platform = np.asarray(jax.devices())[0].platform
Contributor:
jax.devices() during config building may be an unexpected dependency on global state -- should we take a platform arg or similar?

@apoorvtintin (Author), Jul 24, 2024:
We could change it, but I followed the pattern already used here:

devices = jax.devices()

Please let me know if the platform flag is necessary, and I can add it. Thanks!
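As an illustration of the reviewer's suggestion (not code from this PR), a platform argument could default to the current jax.devices() behavior while letting callers pass the platform explicitly; infer_device_platform is a hypothetical helper name:

from typing import Optional

import jax

def infer_device_platform(platform: Optional[str] = None) -> str:
    # Prefer an explicitly supplied platform (e.g. from a config or flag) so
    # that config building does not depend on global device state; otherwise
    # fall back to inspecting the first available device, as the PR does today.
    if platform is not None:
        return platform
    return jax.devices()[0].platform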

    # Neuron uses ZeRO-3 style sharding: weights are also sharded over the data axis.
    fsdp_axis_names = (
        ("expert", "fsdp", "seq")
        if device_platform != "neuron"
        else ("data", "expert", "fsdp", "seq")
    )

    cfg.dtype = jnp.float32
    # Shard some FFN and attention weights over multiple axes.
    set_double_shard_weights_config(
        cfg.decoder.transformer.layer,
        batch_axis_names=batch_axis_names,
-        fsdp_axis_names=("expert", "fsdp", "seq"),
+        fsdp_axis_names=fsdp_axis_names,
        tp_axis_names="model",
        seq_axis_names=("seq",),
    )
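To make the effect of the fsdp_axis_names switch concrete: including "data" means a weight is partitioned across the data-parallel axis as well, rather than replicated over it. The following is a standalone sketch, assuming 8 local devices (e.g. XLA_FLAGS=--xla_force_host_platform_device_count=8) and illustrative axis names and shapes rather than AXLearn's actual mesh:

import jax
import numpy as np
from jax.experimental import mesh_utils
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(mesh_utils.create_device_mesh((4, 2)), axis_names=("data", "fsdp"))
w = np.zeros((1024, 4096), dtype=np.float32)

# Default behavior: shard dim 0 over "fsdp" only; replicate across "data".
w_fsdp = jax.device_put(w, NamedSharding(mesh, P("fsdp", None)))

# Neuron (ZeRO-3 style): shard dim 0 over "data" and "fsdp" together.
w_zero3 = jax.device_put(w, NamedSharding(mesh, P(("data", "fsdp"), None)))

print(w_fsdp.sharding)   # each device holds a 512 x 4096 shard, replicated 4x over data
print(w_zero3.sharding)  # each device holds a 128 x 4096 shard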
6 changes: 5 additions & 1 deletion axlearn/experiments/text/gpt/fuji.py
@@ -83,6 +83,7 @@ class Version(enum.Enum):
    },
}

TRN_MODEL_AXIS_SIZE = 8

def get_trainer_kwargs(
    model_size: str,
@@ -103,7 +104,6 @@ def get_trainer_kwargs(
        num_kv_heads = 8

    rope_theta = ROPE_THETA[version]

    # dict() is more readable here.
    # pylint: disable=use-dict-literal
    if model_size == "test":
@@ -167,6 +167,10 @@ def get_trainer_kwargs(
"gpu-(p5.48xlarge|p4de.24xlarge)-(256|512|1024)",
mesh_shape_from_axes(data=-1, fsdp=8),
),
(
"neuron-(trn1.32xlarge|trn1n.32xlarge)-(32|64|256|512|1024|2048)",
mesh_shape_from_axes(data=-1, model=TRN_MODEL_AXIS_SIZE),
Contributor:
How does model=8 compare to fsdp=8? Usually we find fsdp to be more efficient.

Contributor:
Might also be worth listing the step times for different configurations, similar to the other mesh rules.

Reply:

> How does model=8 compare to fsdp=8? Usually we find fsdp to be more efficient.

I am launching an fsdp=8 job with 8 nodes. The job is currently blocked on AWS capacity; I hope to have data to share by Friday.

The previous response from AWS was that FSDP is slower due to higher communication overhead.

Reply:

Tensor parallelism (model) is more performant on the trn1 architecture.

                ),
            ),
        )
    elif model_size == "70B":
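For intuition about the neuron mesh rule above: model is pinned to TRN_MODEL_AXIS_SIZE and data=-1 absorbs the remaining devices at runtime. Here is a small, self-contained sketch of that arithmetic (not the AXLearn implementation); CORES_PER_TRN1_NODE and resolved_neuron_mesh are illustrative names, and the 32 NeuronCores per trn1.32xlarge node is an assumption stated in the comment below:

TRN_MODEL_AXIS_SIZE = 8
CORES_PER_TRN1_NODE = 32  # assumption: 16 Trainium chips x 2 NeuronCores each

def resolved_neuron_mesh(num_nodes: int) -> dict:
    # data=-1 in mesh_shape_from_axes means "use whatever devices remain
    # after the explicitly sized axes".
    total_devices = num_nodes * CORES_PER_TRN1_NODE
    return {"data": total_devices // TRN_MODEL_AXIS_SIZE, "model": TRN_MODEL_AXIS_SIZE}

print(resolved_neuron_mesh(1))   # {'data': 4, 'model': 8}
print(resolved_neuron_mesh(16))  # {'data': 64, 'model': 8}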