
Commit 14054b4

upgrade to jax 0.4.34 (apple#817)

* upgrade jax to 0.4.34
* add workaround for change to jax cluster autodetection

1 parent 4b559c5 commit 14054b4

File tree

6 files changed: +31 -16 lines changed

  CHANGELOG.md
  axlearn/common/learner_test.py
  axlearn/common/optimizers.py
  axlearn/common/update_transformation_test.py
  axlearn/common/utils_spmd.py
  pyproject.toml

CHANGELOG.md

Lines changed: 5 additions & 0 deletions

@@ -1,5 +1,10 @@
 # Change Log

+## 0.1.4
+
+* Changes
+  * Upgrade Jax from 0.4.33 to 0.4.34.
+
 ## 0.1.3

 * Changes

axlearn/common/learner_test.py

Lines changed: 1 addition & 1 deletion

@@ -1219,7 +1219,7 @@ def test_learner_masking(test_self):
         pre-existing `CompositeLearner` implementation.

         """
-        updates = axlearn.common.update_transformation_test.mock_updates()
+        updates = axlearn.common.update_transformation_test.mock_updates(state_param_none=False)

         param_keys = updates.opt_params.keys()
         state_keys = updates.inplace_updates.keys()

axlearn/common/optimizers.py

Lines changed: 11 additions & 3 deletions

@@ -544,11 +544,13 @@ def update_fn(updates: NestedTensor, state: AddDecayedWeightsState, params: Nest
         lr_scale = lr**learning_rate_exponent

         param_scales = _weight_decay_scales(params, per_param_scale=per_param_scale)
+        f = lambda g, p, s: g + weight_decay * lr_scale * p.value * s
         updates = jax.tree.map(
-            lambda g, p, s: g + weight_decay * lr_scale * p.value * s,
+            lambda x, y, z: None if x is None else f(x, y, z),
             updates,
             params,
             param_scales,
+            is_leaf=lambda x: x is None,
         )
         if learning_rate_exponent is None:
             updated_state = state
@@ -1882,9 +1884,10 @@ def _smoothed_updates(
         # First compute raw updates.
         raw_updates, pps_tree = _split_update_results(
             jax.tree.map(
-                lambda g, s: _raw_updates(grad=g, pps=s),
+                lambda g, s: None if g is None else _raw_updates(grad=g, pps=s),
                 grads,
                 state.pps,
+                is_leaf=lambda x: x is None,
             )
         )
         # Clip raw updates if necessary.
@@ -1966,7 +1969,12 @@ def _update2(u: Tensor, param: OptParam):
             context.add_summary("weight_decay_rate", weight_decay * schedule_scale)
             return -schedule_scale * updates_with_wd

-        updates2 = jax.tree.map(lambda u, p: _update2(u, param=p), updates, params)
+        updates2 = jax.tree.map(
+            lambda u, p: None if u is None else _update2(u, param=p),
+            updates,
+            params,
+            is_leaf=lambda x: x is None,
+        )
         return updates2, optax.safe_int32_increment(step)

         # Stage 1.
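
For readers skimming the diff: the recurring `is_leaf=lambda x: x is None` pattern is needed because `jax.tree.map` treats `None` as an empty subtree by default, so an updates tree containing `None` entries (masked-out parameters) no longer matches the structure of a params tree that still holds real values at those positions. Treating `None` as a leaf and short-circuiting it in the mapped function restores the intended behavior. A minimal sketch with toy names, not the axlearn code:

import jax

grads = {"w": 2.0, "frozen": None}   # a masked-out update is None
params = {"w": 10.0, "frozen": 5.0}  # params still exist everywhere

# Default behavior: None is an empty subtree, so the two structures
# mismatch and this raises a ValueError.
try:
    jax.tree.map(lambda g, p: g + p, grads, params)
except ValueError:
    pass

# The pattern used in the diff: treat None as a leaf and pass it through.
out = jax.tree.map(
    lambda g, p: None if g is None else g + p,
    grads,
    params,
    is_leaf=lambda x: x is None,
)
assert out == {"w": 12.0, "frozen": None}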

axlearn/common/update_transformation_test.py

Lines changed: 5 additions & 7 deletions

@@ -166,9 +166,11 @@ def mock_params() -> Nested[Tensor]:
     )


-def mock_updates() -> axlearn.common.update_transformation.Updates:
+def mock_updates(state_param_none: bool = True) -> axlearn.common.update_transformation.Updates:
     """Create an updates object with various semi-reasonable values."""
     model_params = mock_params()
+    if state_param_none:
+        model_params["state"] = None
     opt_params = jax.tree.map(
         lambda p: OptParam(
             value=p,
@@ -197,6 +199,7 @@ def test_param_values(self):
         updates = mock_updates()
         actual = updates.param_values()
         expected = mock_params()
+        expected["state"] = None
         chex.assert_trees_all_equal_structs(actual, expected)
         self.assertNestedAllClose(actual, expected)
@@ -218,12 +221,7 @@ def test_param_specs(self):
                 weight_decay_scale=0.1,
             )
         ),
-        state=ParameterSpec(
-            shape=(2,),
-            dtype=jnp.int32,
-            factorization=FactorizationSpec([None]),
-            weight_decay_scale=0.1,
-        ),
+        state=None,
         more_state=ParameterSpec(
             shape=(3,),
             dtype=jnp.int32,
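
Note on the fixture change: once `model_params["state"]` is set to `None`, the `jax.tree.map` that builds `opt_params` needs no special handling, because a single-tree map simply skips `None` (an empty subtree) and carries it through to the output. A standalone illustration with toy values, not the fixture itself:

import jax

model_params = {"weight": 1.0, "state": None, "more_state": 2.0}

# The mapped function is applied only to the two real leaves;
# "state" passes through unchanged as None.
opt_params = jax.tree.map(lambda p: p * 2, model_params)
assert opt_params == {"weight": 2.0, "state": None, "more_state": 4.0}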

axlearn/common/utils_spmd.py

Lines changed: 4 additions & 0 deletions

@@ -87,6 +87,10 @@ def setup(
             num_processes=num_processes,
             process_id=process_id,
         )
+        if jax_backend == "gpu":
+            # jax 0.4.34 introduced a change to cluster auto-detection behavior; supplying
+            # the local_device_ids arg allows us to maintain the expected behavior.
+            init_kwargs["local_device_ids"] = list(range(8))

         jax.distributed.initialize(**init_kwargs)
         _jax_distributed_initialized = True
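
For context, `jax.distributed.initialize` accepts a `local_device_ids` argument; passing it explicitly pins each process to its local GPUs instead of relying on the auto-detection path that changed in jax 0.4.34. A hedged sketch of the surrounding logic (the `init_distributed` helper name is illustrative, mirroring `setup()` above; the count of 8 devices per process comes from the diff):

import jax

def init_distributed(jax_backend: str, coordinator_address: str, num_processes: int, process_id: int):
    # Hypothetical helper mirroring utils_spmd.setup().
    init_kwargs = dict(
        coordinator_address=coordinator_address,
        num_processes=num_processes,
        process_id=process_id,
    )
    if jax_backend == "gpu":
        # Explicit local device ids opt out of the cluster
        # auto-detection behavior that changed in jax 0.4.34.
        init_kwargs["local_device_ids"] = list(range(8))
    jax.distributed.initialize(**init_kwargs)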

pyproject.toml

Lines changed: 5 additions & 5 deletions

@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

 [project]
 name = "axlearn"
-version = "0.1.3"
+version = "0.1.4"
 description = "AXLearn"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -23,8 +23,8 @@ core = [
     "absl-py==2.1.0",
     "chex==0.1.86", # chex 0.1.86 is required for jax 0.4.25.
     "importlab==0.7", # breaks pytype on 0.8
-    "jax==0.4.33",
-    "jaxlib==0.4.33",
+    "jax==0.4.34",
+    "jaxlib==0.4.34",
     "nltk==3.7", # for text preprocessing
     "optax==0.1.7", # optimizers (0.1.0 has known bugs).
     "portpicker",
@@ -101,7 +101,7 @@ gcp = [
 # Note: Specify -f https://storage.googleapis.com/jax-releases/libtpu_releases.html during install.
 tpu = [
     "axlearn[gcp]",
-    "jax[tpu]==0.4.33", # must be >=0.4.19 for compat with v5p.
+    "jax[tpu]==0.4.34", # must be >=0.4.19 for compat with v5p.
 ]
 # Vertex AI tensorboard. TODO(markblee): Merge with `gcp`.
 vertexai_tensorboard = [
@@ -125,7 +125,7 @@ dataflow = [
 # GPU custom kernel dependency.
 gpu = [
     "triton==2.1.0",
-    "jax[cuda12_pip]==0.4.33",
+    "jax[cuda12]==0.4.34",
 ]
 # Open API inference.
 open_api = [
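
(The `cuda12_pip` extra was replaced by the plain `cuda12` extra in recent jax releases, hence the rename in the `gpu` dependency group alongside the version bump.)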
