Support uneven heterogeneous sharding for the inference sharded tensor pool

faran928 · facebook-github-bot · commit 3f268a10db3c · 2025-11-10T20:45:29.000-08:00
Summary:
A few changes in the diff:
1. Support sharding the tensor pool proportionally, based on the memory capacity of each rank.
2. Use block_bucketize_sparse_features_inference to return bucket_mapping, which can be used during request batching in inference with the custom sigrid predictor engine.
3. Wrap some of the operations with fx wrappers to make them compatible with model split boundaries for DLRM serving, where embeddings are sharded and split across different PyTorch modules.
4. Expose a set_device() API on some of the modules, for the case where some shards should be placed on CPU and others on CUDA.
5. Move _get_unbucketize_tensor_via_length_alignment to the common utils file (torchrec/modules/utils.py).

Differential Revision: D79603009
diff --git a/torchrec/distributed/keyed_jagged_tensor_pool.py b/torchrec/distributed/keyed_jagged_tensor_pool.py
@@ -630,7 +630,7 @@ def _lookup_values_dist(
 
     # pyre-ignore
     def forward(self, ids: torch.Tensor) -> KeyedJaggedTensor:
-        dist_input, unbucketize_permute = self._lookup_ids_dist(ids)
+        dist_input, unbucketize_permute, _, _ = self._lookup_ids_dist(ids)
         lookup = self._lookup_local(dist_input)
         # Here we are playing a trick to workaround a fx tracing issue,
         # as proxy is not iteratable.
diff --git a/torchrec/distributed/quant_embedding.py b/torchrec/distributed/quant_embedding.py
@@ -84,6 +84,7 @@
 from torchrec.modules.utils import (
     _fx_trec_get_feature_length,
     _get_batching_hinted_output,
+    _get_unbucketize_tensor_via_length_alignment,
 )
 from torchrec.quant.embedding_modules import (
     EmbeddingCollection as QuantEmbeddingCollection,
@@ -96,6 +97,7 @@
 torch.fx.wrap("len")
 torch.fx.wrap("_get_batching_hinted_output")
 torch.fx.wrap("_fx_trec_get_feature_length")
+torch.fx.wrap("_get_unbucketize_tensor_via_length_alignment")
 
 try:
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
@@ -278,16 +280,6 @@ def _fx_trec_wrap_length_tolist(length: torch.Tensor) -> List[int]:
     return length.long().tolist()
 
 
-@torch.fx.wrap
-def _get_unbucketize_tensor_via_length_alignment(
-    lengths: torch.Tensor,
-    bucketize_length: torch.Tensor,
-    bucketize_permute_tensor: torch.Tensor,
-    bucket_mapping_tensor: torch.Tensor,
-) -> torch.Tensor:
-    return bucketize_permute_tensor
-
-
 @torch.fx.wrap
 def _fx_split_embeddings_per_feature_length(
     embeddings: torch.Tensor,
diff --git a/torchrec/distributed/sharding/rw_pool_sharding.py b/torchrec/distributed/sharding/rw_pool_sharding.py
@@ -166,6 +166,9 @@ class InferRwObjectPoolInputDist(torch.nn.Module):
         block_size (torch.Tensor): tensor containing block sizes for each rank.
             e.g. if block_size=torch.tensor(100), then IDs 0-99 will be assigned to rank
             0, 100-199 to rank 1, and so on.
+        block_bucketize_row_pos (Optional[List[torch.Tensor]]): tensors containing shard/row
+            offsets for each rank in case of uneven sharding of the tensor pool across ranks.
+            If not provided, then block_size will be used to permute the IDs across ranks.
 
     Example:
         device = torch.device("cpu")
@@ -179,22 +182,27 @@ class InferRwObjectPoolInputDist(torch.nn.Module):
     _world_size: int
     _device: torch.device
     _block_size: torch.Tensor
+    _block_bucketize_row_pos: list[torch.Tensor]
 
     def __init__(
         self,
         env: ShardingEnv,
         device: torch.device,
         block_size: torch.Tensor,
+        block_bucketize_row_pos: Optional[list[torch.Tensor]] = None,
     ) -> None:
         super().__init__()
         self._world_size = env.world_size
         self._device = device
         self._block_size = block_size
+        self._block_bucketize_row_pos: list[torch.Tensor] = (
+            [] if block_bucketize_row_pos is None else block_bucketize_row_pos
+        )
 
     def forward(
         self,
         ids: torch.Tensor,
-    ) -> Tuple[List[torch.Tensor], torch.Tensor]:
+    ) -> Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Bucketizes ids tensor into a list of tensors each containing ids
         for the corresponding rank. Places each tensor on the appropriate device.
@@ -203,24 +211,34 @@ def forward(
             ids (torch.Tensor): Tensor with ids
 
         Returns:
-           Tuple[List[torch.Tensor], torch.Tensor]: Tuple containing list of ids tensors
-            for each rank given the bucket sizes, and the tensor containing indices
-            to permute the ids to get the original order before bucketization.
+           Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]:
+           Tuple containing
+           1. list of ids tensors for each rank given the bucket sizes
+           2. the tensor containing indices to permute the ids to get the original order before bucketization.
+           3. the tensor containing the bucket mapping for each id
+           4. the tensor containing the bucketized lengths
         """
         (
             bucketized_lengths,
             bucketized_indices,
-            _bucketized_weights,
-            _bucketize_permute,
+            _,  # bucketized_weights
+            _,  # _bucketize_permute
             unbucketize_permute,
-        ) = torch.ops.fbgemm.block_bucketize_sparse_features(
-            _get_bucketize_shape(ids, ids.device),
-            ids.long(),
+            bucket_mapping,
+        ) = torch.ops.fbgemm.block_bucketize_sparse_features_inference(
+            lengths=_get_bucketize_shape(ids, ids.device),
+            indices=ids.long(),
             bucketize_pos=False,
             sequence=True,
             block_sizes=self._block_size.long(),
             my_size=self._world_size,
             weights=None,
+            block_bucketize_pos=(
+                self._block_bucketize_row_pos
+                if len(self._block_bucketize_row_pos) > 0
+                else None
+            ),
+            return_bucket_mapping=True,
         )
 
         id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(bucketized_lengths)
@@ -236,7 +254,13 @@ def forward(
             )
 
         assert unbucketize_permute is not None, "unbucketize permute must not be None"
-        return dist_ids, unbucketize_permute
+        assert bucket_mapping is not None, "bucket mapping must not be None"
+        return (
+            dist_ids,
+            unbucketize_permute,
+            bucket_mapping,
+            bucketized_lengths,
+        )
 
     def update(
         self,
@@ -270,6 +294,11 @@ def update(
             block_sizes=self._block_size.long(),
             my_size=self._world_size,
             weights=None,
+            block_bucketize_pos=(
+                self._block_bucketize_row_pos
+                if len(self._block_bucketize_row_pos) > 0
+                else None
+            ),
         )
 
         id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(bucketized_lengths)
diff --git a/torchrec/distributed/sharding/rw_tensor_pool_sharding.py b/torchrec/distributed/sharding/rw_tensor_pool_sharding.py
@@ -219,11 +219,16 @@ def __init__(
         device: torch.device,
     ) -> None:
         super().__init__()
-        self._device: Optional[torch.device] = device
+        self._device: torch.device = device
         self._world_size: int = env.world_size
         self._cat_dim = 0
         self._placeholder: torch.Tensor = torch.ones(1, device=device)
 
+    @torch.jit.export
+    def set_device(self, device_str: str) -> None:
+        self._device = torch.device(device_str)
+        self._placeholder = torch.ones(1, device=self._device)
+
     def forward(
         self,
         lookups: List[torch.Tensor],
@@ -256,12 +261,16 @@ def __init__(
         pool_size: int,
         env: ShardingEnv,
         device: torch.device,
+        memory_capacity_per_rank: Optional[list[int]] = None,
     ) -> None:
-        super().__init__(pool_size, env, device)
+        super().__init__(pool_size, env, device, memory_capacity_per_rank)
 
     def create_lookup_ids_dist(self) -> InferRwObjectPoolInputDist:
         return InferRwObjectPoolInputDist(
-            self._env, device=self._device, block_size=self._block_size_t
+            self._env,
+            device=self._device,
+            block_size=self._block_size_t,
+            block_bucketize_row_pos=self._block_bucketize_row_pos,
         )
 
     def create_lookup_values_dist(
diff --git a/torchrec/distributed/tensor_pool.py b/torchrec/distributed/tensor_pool.py
@@ -32,7 +32,14 @@
 )
 from torchrec.modules.object_pool_lookups import TensorLookup, TensorPoolLookup
 from torchrec.modules.tensor_pool import TensorPool
-from torchrec.modules.utils import deterministic_dedup
+from torchrec.modules.utils import (
+    _get_batching_hinted_output,
+    _get_unbucketize_tensor_via_length_alignment,
+    deterministic_dedup,
+)
+
+torch.fx.wrap("_get_unbucketize_tensor_via_length_alignment")
+torch.fx.wrap("_get_batching_hinted_output")
 
 
 @torch.fx.wrap
@@ -44,6 +51,17 @@ def index_select_view(
     return output[unbucketize_permute].view(-1, dim)
 
 
+@torch.fx.wrap
+def _fx_item_unwrap_optional_tensor(optional: Optional[torch.Tensor]) -> torch.Tensor:
+    assert optional is not None, "Expected optional to be non-None Tensor"
+    return optional
+
+
+@torch.fx.wrap
+def _get_id_length_sharded_tensor_pool(ids: torch.Tensor) -> torch.Tensor:
+    return torch.tensor([ids.size(dim=0)], device=ids.device, dtype=torch.long)
+
+
 class TensorPoolAwaitable(LazyAwaitable[torch.Tensor]):
     def __init__(
         self,
@@ -271,6 +289,8 @@ class LocalShardPool(torch.nn.Module):
         # out is tensor([1,2,3]) i.e. first row of the shard
     """
 
+    current_device: torch.device
+
     def __init__(
         self,
         shard: torch.Tensor,
@@ -280,6 +300,12 @@ def __init__(
             shard,
             requires_grad=False,
         )
+        self.current_device = self._shard.device
+
+    @torch.jit.export
+    def set_device(self, device_str: str) -> None:
+        self.current_device = torch.device(device_str)
+        self._shard.to(self.current_device)
 
     def forward(self, rank_ids: torch.Tensor) -> torch.Tensor:
         """
@@ -291,7 +317,7 @@ def forward(self, rank_ids: torch.Tensor) -> torch.Tensor:
         Returns:
             torch.Tensor: Tensor of values corresponding to the given rank ids.
         """
-        return self._shard[rank_ids]
+        return self._shard[rank_ids.to(self.current_device)]
 
     def update(self, rank_ids: torch.Tensor, values: torch.Tensor) -> None:
         _ = update(self._shard, rank_ids, values)
@@ -337,6 +363,11 @@ def __init__(
                 env=self._sharding_env,
                 device=self._device,
                 pool_size=self._pool_size,
+                memory_capacity_per_rank=(
+                    self._sharding_plan.memory_capacity_per_rank
+                    if self._sharding_plan.memory_capacity_per_rank is not None
+                    else None
+                ),
             )
         else:
             raise NotImplementedError(
@@ -356,6 +387,7 @@ def __init__(
                 if device == torch.device("cpu")
                 else torch.device("cuda", rank)
             )
+
             self._local_shard_pools.append(
                 LocalShardPool(
                     torch.empty(
@@ -409,7 +441,7 @@ def create_context(self) -> ObjectPoolShardingContext:
     def _lookup_ids_dist(
         self,
         ids: torch.Tensor,
-    ) -> Tuple[List[torch.Tensor], torch.Tensor]:
+    ) -> Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]:
         return self._lookup_ids_dist_impl(ids)
 
     # pyre-ignore
@@ -439,18 +471,54 @@ def _lookup_values_dist(
 
     # pyre-ignore
     def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        dist_input, unbucketize_permute = self._lookup_ids_dist(ids)
+        dist_input, unbucketize_permute, bucket_mapping, bucketized_lengths = (
+            self._lookup_ids_dist(ids)
+        )
+        unbucketize_permute_non_opt = _fx_item_unwrap_optional_tensor(
+            unbucketize_permute
+        )
+
         lookup = self._lookup_local(dist_input)
 
         # Here we are playing a trick to workaround a fx tracing issue,
         # as proxy is not iteratable.
         lookup_list = []
-        for i in range(self._world_size):
-            lookup_list.append(lookup[i])
+        # In case of non-heterogeneous even sharding, keep the behavior
+        # consistent with existing logic to ensure that additional fx wrappers
+        # do not impact the model split logic during inference in any way
+        if self._sharding_plan.memory_capacity_per_rank is None:
+            for i in range(self._world_size):
+                lookup_list.append(lookup[i])
+        else:
+            # Add fx wrappers in case of uneven heterogeneous sharding to
+            # make it compatible with model split boundaries during inference
+            for i in range(self._world_size):
+                lookup_list.append(
+                    _get_batching_hinted_output(
+                        _get_id_length_sharded_tensor_pool(dist_input[i]), lookup[i]
+                    )
+                )
+
+            features_before_input_dist_length = _get_id_length_sharded_tensor_pool(ids)
+            bucketized_lengths_col_view = bucketized_lengths.view(self._world_size, -1)
+            unbucketize_permute_non_opt = _fx_item_unwrap_optional_tensor(
+                unbucketize_permute
+            )
+            bucket_mapping_non_opt = _fx_item_unwrap_optional_tensor(bucket_mapping)
+            unbucketize_permute_non_opt = _get_unbucketize_tensor_via_length_alignment(
+                features_before_input_dist_length,
+                bucketized_lengths_col_view,
+                unbucketize_permute_non_opt,
+                bucket_mapping_non_opt,
+            )
 
         output = self._lookup_values_dist(lookup_list)
 
-        return index_select_view(output, unbucketize_permute, self._dim)
+        return index_select_view(
+            output,
+            unbucketize_permute_non_opt.to(device=output.device),
+            self._dim,
+        )
 
     # pyre-ignore
     def _update_values_dist(self, ctx: ObjectPoolShardingContext, values: torch.Tensor):
diff --git a/torchrec/distributed/tensor_sharding.py b/torchrec/distributed/tensor_sharding.py
@@ -102,6 +102,7 @@ def __init__(
         pool_size: int,
         env: ShardingEnv,
         device: torch.device,
+        memory_capacity_per_rank: Optional[list[int]] = None,
     ) -> None:
         self._pool_size = pool_size
         self._env = env
@@ -117,13 +118,40 @@ def __init__(
         self._last_block_size: int = self._pool_size - self._block_size * (
             self._world_size - 1
         )
-        self.local_pool_size_per_rank: List[int] = [self._block_size] * (
-            self._world_size - 1
-        ) + [self._last_block_size]
-
+        # only used for uneven sharding case when memory_capacity_per_rank is provided
+        row_offset_per_rank = []
+
+        if memory_capacity_per_rank is None:
+            self.local_pool_size_per_rank: List[int] = [self._block_size] * (
+                self._world_size - 1
+            ) + [self._last_block_size]
+        else:
+            row_offset_per_rank = [0]
+            self.local_pool_size_per_rank: List[int] = []
+            row_offset = 0
+            assert (
+                len(memory_capacity_per_rank) == self._world_size
+            ), "If memory_capacity_per_rank is provided for sharded tensor pool, it must have the same length as world_size"
+            total_mem_cap = sum(memory_capacity_per_rank)
+            for cap in memory_capacity_per_rank[:-1]:
+                rows_per_shard = int(cap / total_mem_cap * self._pool_size)
+                self.local_pool_size_per_rank.append(rows_per_shard)
+                row_offset += rows_per_shard
+                row_offset_per_rank.append(row_offset)
+            self.local_pool_size_per_rank.append(
+                self._pool_size - sum(self.local_pool_size_per_rank)
+            )
+            row_offset_per_rank.append(self._pool_size)
         self._block_size_t: torch.Tensor = torch.tensor(
             [self._block_size], device=self._device, dtype=torch.long
         )
+        # for uneven sharding case, we get the row offsets for each rank to
+        # enable input_dist and lookup of ids to correct rank
+        self._block_bucketize_row_pos: Optional[List[torch.Tensor]] = (
+            None
+            if memory_capacity_per_rank is None
+            else [torch.tensor(row_offset_per_rank, device=self._device)]
+        )
 
     @abstractmethod
     def create_lookup_ids_dist(self) -> torch.nn.Module:
diff --git a/torchrec/distributed/tests/test_tensor_pool_rw_sharding.py b/torchrec/distributed/tests/test_tensor_pool_rw_sharding.py
diff --git a/torchrec/distributed/types.py b/torchrec/distributed/types.py
diff --git a/torchrec/modules/utils.py b/torchrec/modules/utils.py