@@ -265,7 +265,7 @@ def _add_table_variable(
         table_specs: Sequence[embedding_spec.TableSpec],
         num_shards: int,
         add_slot_variables: bool,
-    ) -> embedding.EmbeddingVariables:
+    ) -> tuple[keras.Variable, tuple[keras.Variable, ...] | None]:
         stacked_table_spec = typing.cast(
             embedding_spec.StackedTableSpec, table_specs[0].stacked_table_spec
         )
@@ -334,7 +334,7 @@ def _add_table_variable(
             slot_initializers, slot_variables
         )

-        return embedding.EmbeddingVariables(table_variable, slot_variables)
+        return table_variable, slot_variables

     @keras_utils.no_automatic_dependency_tracking
     def _sparsecore_init(
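Note on the two hunks above: the helper now returns a plain `(table_variable, slot_variables)` tuple instead of the library's `EmbeddingVariables` wrapper, which is why later hunks index `[0]` for the table. A minimal sketch of the new shape, with illustrative values only:

import keras

# Illustrative stand-ins for the real table and optimizer slot variables.
table_variable = keras.Variable(initializer="zeros", shape=(8, 4))
slot_variables = (keras.Variable(initializer="zeros", shape=(8, 4)),)

# _add_table_variable now returns this plain tuple; slot_variables may be
# None when add_slot_variables=False.
table_and_slots = (table_variable, slot_variables)
assert table_and_slots[0] is table_variable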
@@ -441,8 +441,8 @@ def sparsecore_build(
         )

         # Collect all stacked tables.
-        table_specs = embedding.get_table_specs(feature_specs)
-        table_stacks = jte_table_stacking.get_table_stacks(table_specs)
+        table_specs = embedding_utils.get_table_specs(feature_specs)
+        table_stacks = embedding_utils.get_table_stacks(table_specs)

         # Create variables for all stacked tables and slot variables.
         with sparsecore_distribution.scope():
@@ -515,8 +515,10 @@ def _sparsecore_symbolic_preprocess(
         del inputs, weights, training

         # Each stacked-table gets a ShardedCooMatrix.
-        table_specs = embedding.get_table_specs(self._config.feature_specs)
-        table_stacks = jte_table_stacking.get_table_stacks(table_specs)
+        table_specs = embedding_utils.get_table_specs(
+            self._config.feature_specs
+        )
+        table_stacks = embedding_utils.get_table_stacks(table_specs)
         stacked_table_specs = {
             stack_name: stack[0].stacked_table_spec
             for stack_name, stack in table_stacks.items()
@@ -580,18 +582,12 @@ def _sparsecore_preprocess(
         )

         layout = self._sparsecore_layout
-        print(f"-->{layout=}")
         mesh = layout.device_mesh.backend_mesh
-        print(f"-->{mesh=}")
         global_device_count = mesh.devices.size
-        print(f"-->{global_device_count=}")
         local_device_count = mesh.local_mesh.devices.size
-        print(f"{local_device_count=}")
         num_sc_per_device = jte_utils.num_sparsecores_per_device(
             mesh.devices.item(0)
         )
-        print(f"-->{num_sc_per_device=}")
-        print(f"-->{jax.process_count()=}")

         preprocessed, stats = embedding_utils.stack_and_shard_samples(
             self._config.feature_specs,
@@ -600,51 +596,44 @@ def _sparsecore_preprocess(
             global_device_count,
             num_sc_per_device,
         )
-        print(f"-->{stats=}")

         if training:
             # Synchronize input statistics across all devices and update the
             # underlying stacked tables specs in the feature specs.
+            prev_stats = embedding_utils.get_stacked_table_stats(
+                self._config.feature_specs
+            )
+
+            # Take the maximum with existing stats.
+            stats = keras.tree.map_structure(max, prev_stats, stats)
+
+            # Flatten the stats so we can more efficiently transfer them
+            # between hosts. We use jax.tree because we will later need to
+            # unflatten.
+            flat_stats, stats_treedef = jax.tree.flatten(stats)

-            # Aggregate stats across all processes/devices via pmax.
+            # In the case of multiple local CPU devices per host, we need to
+            # replicate the stats to placate JAX collectives.
             num_local_cpu_devices = jax.local_device_count("cpu")
-            print(f"-->{num_local_cpu_devices=}")
-
-            def pmax_aggregate(x: Any) -> Any:
-                if not hasattr(x, "ndim"):
-                    x = np.array(x)
-                jax.debug.print("--> x.shape={}", x.shape)
-                tiled_x = np.tile(x, (num_local_cpu_devices, *([1] * x.ndim)))
-                jax.debug.print("--> tiled_x.shape={}", tiled_x.shape)
-                return jax.pmap(
-                    lambda y: jax.lax.pmax(y, "all_cpus"),  # type: ignore[no-untyped-call]
-                    axis_name="all_cpus",
-                    backend="cpu",
-                )(tiled_x)[0]
-
-            full_stats = jax.tree.map(pmax_aggregate, stats)
-
-            # Check if stats changed enough to warrant action.
-            stacked_table_specs = embedding.get_stacked_table_specs(
-                self._config.feature_specs
+            tiled_stats = np.tile(
+                np.array(flat_stats, dtype=np.int32), (num_local_cpu_devices, 1)
             )
-            changed = any(
-                np.max(full_stats.max_ids_per_partition[stack_name])
-                > spec.max_ids_per_partition
-                or np.max(full_stats.max_unique_ids_per_partition[stack_name])
-                > spec.max_unique_ids_per_partition
-                or (
-                    np.max(full_stats.required_buffer_size_per_sc[stack_name])
-                    * num_sc_per_device
-                )
-                > (spec.suggested_coo_buffer_size_per_device or 0)
-                for stack_name, spec in stacked_table_specs.items()
+
+            # Aggregate variables across all processes/devices.
+            max_across_cpus = jax.pmap(
+                lambda x: jax.lax.pmax(  # type: ignore[no-untyped-call]
+                    x, "all_cpus"
+                ),
+                axis_name="all_cpus",
+                backend="cpu",
             )
+            flat_stats = max_across_cpus(tiled_stats)[0].tolist()
+            stats = jax.tree.unflatten(stats_treedef, flat_stats)

             # Update configuration and repeat preprocessing if stats changed.
-            if changed:
-                embedding.update_preprocessing_parameters(
-                    self._config.feature_specs, full_stats, num_sc_per_device
+            if stats != prev_stats:
+                embedding_utils.update_stacked_table_stats(
+                    self._config.feature_specs, stats
                 )

                 # Re-execute preprocessing with consistent input statistics.
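The new aggregation path above flattens the stats, replicates them across the host's local CPU devices, and reduces with a pmapped `jax.lax.pmax` so every process ends up with the same maxima. A self-contained sketch of the same pattern on toy stats (single-host here; the collective also spans processes when JAX's distributed runtime is initialized):

import jax
import numpy as np

# Toy per-table stats standing in for the flattened input statistics.
stats = {"table_a": {"max_ids": 7, "buffer_size": 128}}
flat_stats, treedef = jax.tree.flatten(stats)

# One copy per local CPU device so the pmapped collective has a shard
# for every participating device.
num_local_cpu_devices = jax.local_device_count("cpu")
tiled = np.tile(np.array(flat_stats, dtype=np.int32), (num_local_cpu_devices, 1))

# Element-wise maximum across all devices (and processes) on the CPU backend.
max_across_cpus = jax.pmap(
    lambda x: jax.lax.pmax(x, "all_cpus"),
    axis_name="all_cpus",
    backend="cpu",
)
flat_max = max_across_cpus(tiled)[0].tolist()
print(jax.tree.unflatten(treedef, flat_max))  # same structure as `stats`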
@@ -729,8 +718,8 @@ def _sparsecore_set_tables(self, tables: Mapping[str, ArrayLike]) -> None:

         config = self._config
         num_table_shards = config.mesh.devices.size * config.num_sc_per_device
-        table_specs = embedding.get_table_specs(config.feature_specs)
-        sharded_tables = jte_table_stacking.stack_and_shard_tables(
+        table_specs = embedding_utils.get_table_specs(config.feature_specs)
+        sharded_tables = embedding_utils.stack_and_shard_tables(
             table_specs,
             tables,
             num_table_shards,
@@ -749,8 +738,8 @@ def _sparsecore_set_tables(self, tables: Mapping[str, ArrayLike]) -> None:
         # Assign stacked table variables to the device values.
         keras.tree.map_structure_up_to(
             device_tables,
-            lambda embedding_variables,
-            table_value: embedding_variables.table.assign(table_value),
+            lambda table_and_slot_variables,
+            table_value: table_and_slot_variables[0].assign(table_value),
             self._table_and_slot_variables,
             device_tables,
         )
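For context on `keras.tree.map_structure_up_to` as used above: traversal stops at the depth of the first (shallow) structure, so each `(table, slots)` entry from `self._table_and_slot_variables` reaches the lambda as a whole tuple, and `[0]` picks out the table variable. A small sketch with made-up values:

import keras

# Shallow structure: one leaf per stacked table.
device_tables = {"stack_a": 1.0, "stack_b": 10.0}
# Deep structure: (table, slot_variables) pairs under the same keys.
table_and_slot_vars = {"stack_a": (2.0, (0.0,)), "stack_b": (3.0, None)}

# The lambda receives each pair intact because traversal only descends as
# deep as `device_tables`.
result = keras.tree.map_structure_up_to(
    device_tables,
    lambda pair, value: pair[0] * value,
    table_and_slot_vars,
    device_tables,
)
print(result)  # {'stack_a': 2.0, 'stack_b': 30.0}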
@@ -761,19 +750,17 @@ def _sparsecore_get_embedding_tables(self) -> dict[str, ArrayLike]:

         config = self._config
         num_table_shards = config.mesh.devices.size * config.num_sc_per_device
-        table_specs = embedding.get_table_specs(config.feature_specs)
+        table_specs = embedding_utils.get_table_specs(config.feature_specs)

         # Extract only the table variables, not the gradient slot variables.
         table_variables = {
-            name: jax.device_get(embedding_variables.table.value)
-            for name, embedding_variables in (
-                self._table_and_slot_variables.items()
-            )
+            name: jax.device_get(table_and_slots[0].value)
+            for name, table_and_slots in self._table_and_slot_variables.items()
         }

         return typing.cast(
             dict[str, ArrayLike],
-            jte_table_stacking.unshard_and_unstack_tables(
+            embedding_utils.unshard_and_unstack_tables(
                 table_specs, table_variables, num_table_shards
             ),
         )