pytorch
diff --git a/helion/_compat.py b/helion/_compat.py
Lines changed: 27 additions & 0 deletions
diff --git a/helion/_compiler/compile_environment.py b/helion/_compiler/compile_environment.py
Lines changed: 65 additions & 0 deletions
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
Lines changed: 129 additions & 35 deletions
@@ -3,6 +3,7 @@
 import contextlib
 import functools
 import re
+from typing import TYPE_CHECKING
 from typing import Any
 from typing import Callable
 from typing import cast
@@ -16,6 +17,9 @@
 import triton.language as tl
 import triton.runtime.jit as triton_jit
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
 NativeSpecializeImpl = Callable[
     [type[BaseBackend], object, bool, bool, bool], tuple[object, ...]
 ]
@@ -306,3 +310,26 @@ def supports_amd_cdna_tunables() -> bool:
         return match is not None and int(match.group(1), 16) >= 0x908
     except Exception:
         return False
+
+
+@contextlib.contextmanager
+def patch_fake_tensor_ctor() -> Generator[None, None, None]:
+    """Patch ``FakeTensor.__new__`` to tag new instances with a block-id slot.
+
+    While the context is active, every object returned by
+    ``FakeTensor.__new__`` gets ``_tile_index_block_id = None``.  That
+    attribute is used to track which block a ``tile.index`` tensor
+    originated from.  The previous ``__new__`` is put back when the context
+    exits, whether or not the body raised.
+    """
+    from torch._subclasses.fake_tensor import FakeTensor
+
+    unpatched_new = FakeTensor.__new__
+
+    def new_with_block_id(*args: Any, **kwargs: Any) -> FakeTensor:  # noqa: ANN401
+        fake = unpatched_new(*args, **kwargs)
+        # Default provenance: not derived from any tile.index tensor.
+        fake._tile_index_block_id = None  # type: ignore[attr-defined]
+        return fake
+
+    FakeTensor.__new__ = staticmethod(new_with_block_id)  # type: ignore[method-assign]
+    try:
+        yield
+    finally:
+        FakeTensor.__new__ = unpatched_new  # type: ignore[method-assign]
@@ -20,6 +20,7 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from .. import exc
+from .._compat import patch_fake_tensor_ctor
 from ..language.constexpr import ConstExpr
 from .loop_dependency_checker import LoopDependencyChecker
 from .source_location import SourceLocation
@@ -272,6 +273,67 @@ def cached_create_unbacked_symint(
             self._symint_cache[key] = result
         return result
 
+    def get_tile_index_tensor_block_id(self, tensor: torch.Tensor) -> int | None:
+        """Return the originating ``tile.index`` block id if present.
+
+        Args:
+            tensor: Tensor to inspect.
+
+        Returns:
+            The value of the tensor's ``_tile_index_block_id`` attribute, or
+            ``None`` when the attribute is unset or missing.  Note that ``0``
+            is a possible return value, so callers must compare against
+            ``None`` rather than rely on truthiness.
+        """
+        # A tensor that was not created through the patched FakeTensor
+        # constructor has no such attribute; report "no provenance" for it
+        # instead of raising AttributeError.
+        return getattr(tensor, "_tile_index_block_id", None)
+
+    def should_broadcast_tensor_indexers(
+        self, tensors: typing.Sequence[torch.Tensor]
+    ) -> bool:
+        """Check whether tensor indexers need broadcasting.
+
+        Args:
+            tensors: The tensor-valued entries of a subscript, in order.
+
+        Returns:
+            ``False`` when ``tensors`` is empty, when every tensor carries a
+            ``tile.index`` block id, or when there is exactly one 1-D tensor;
+            ``True`` otherwise.
+        """
+        if not tensors:
+            return False
+        # tile.index tensors don't need broadcasting.  Compare against None
+        # explicitly: 0 is a block id, and a truthiness test would misread it
+        # as "no block id".
+        if all(self.get_tile_index_tensor_block_id(t) is not None for t in tensors):
+            return False
+        # Single 1D tensor doesn't need broadcast handling
+        return not (len(tensors) == 1 and tensors[0].ndim == 1)
+
+    def tensor_indexer_broadcast_shape(
+        self, tensors: typing.Sequence[torch.Tensor]
+    ) -> list[int | torch.SymInt]:
+        """Return the combined output shape produced by several tensor indexers.
+
+        Two or more indexers that are all 1-D are combined Cartesian-style:
+        the result has one dim per indexer.  Otherwise shapes are left-padded
+        with 1s to a common rank and, for each position, the first dim whose
+        size hint is not 1 is taken (1 if there is none).
+        """
+        sizes = [list(t.size()) for t in tensors]
+        if len(sizes) > 1 and all(len(dims) == 1 for dims in sizes):  # Cartesian
+            return [dims[0] for dims in sizes]
+
+        rank = max(map(len, sizes))
+        aligned = ([1] * (rank - len(dims)) + dims for dims in sizes)
+        result: list[int | torch.SymInt] = []
+        for column in zip(*aligned, strict=True):
+            chosen: int | torch.SymInt = 1
+            for extent in column:
+                if self.size_hint(extent) != 1:
+                    chosen = extent
+                    break
+            result.append(chosen)
+        return result
+
+    def tensor_indexer_dims(
+        self, indexer_tensor: torch.Tensor
+    ) -> list[int | torch.SymInt]:
+        """Return dims contributed by a tensor indexer (non-broadcast case).
+
+        Args:
+            indexer_tensor: The tensor used as an index.
+
+        Returns:
+            ``[block_size_var]`` when a block id can be determined, either
+            from the tensor's ``tile.index`` provenance or from its first dim
+            whose size hint is not 1.  Otherwise the list of dims whose size
+            hint is not 1, or ``[1]`` if there are none.
+        """
+        non_trivial = [d for d in indexer_tensor.size() if self.size_hint(d) != 1]
+        bid = self.get_tile_index_tensor_block_id(indexer_tensor)
+        # Fall back to the size-based lookup only when there is no recorded
+        # provenance.  ``is None`` (not truthiness) so block id 0 is honoured.
+        if bid is None and non_trivial:
+            bid = self.get_block_id(non_trivial[0])
+        if bid is not None:
+            return [self.block_sizes[bid].var]
+        return non_trivial or [1]  # type: ignore[return-value]
+
+    def new_index_result(
+        self, tensor: torch.Tensor, output_shape: typing.Sequence[int | torch.SymInt]
+    ) -> torch.Tensor:
+        """Create tensor for indexing ops, preserving tile index provenance.
+
+        Args:
+            tensor: Source tensor; the result is built with its ``new_empty``.
+            output_shape: Requested shape of the result.
+
+        Returns:
+            A new tensor of ``output_shape``.  When at most one dim has a size
+            hint other than 1 and a block id is known (from ``tensor`` or from
+            that dim), the dim is replaced by the block's size variable and
+            the block id is recorded on the result as ``_tile_index_block_id``.
+        """
+        shape = list(output_shape)
+        non_trivial = [i for i, s in enumerate(shape) if self.size_hint(s) != 1]
+        if len(non_trivial) > 1:
+            # Multi-dim results do not carry provenance.
+            return tensor.new_empty(shape)
+        bid = self.get_tile_index_tensor_block_id(tensor)
+        if non_trivial:
+            if bid is None:
+                bid = self.get_block_id(shape[non_trivial[0]])
+            # ``is not None`` rather than truthiness: 0 is a block id.
+            if bid is not None:
+                shape[non_trivial[0]] = self.block_sizes[bid].var
+        result = tensor.new_empty(shape)
+        if bid is not None:
+            result._tile_index_block_id = bid  # type: ignore[attr-defined]
+        return result
+
     def to_fake(self, obj: object, origin: Origin) -> object:
         if obj is None:
             return None
@@ -418,6 +480,8 @@ def sympy_debug(self, expr: sympy.Expr) -> str:
 
     def __enter__(self) -> Self:
         assert getattr(tls, "env", None) is None, "CompileEnvironment already active"
+        self.fake_tensor_ctor_patch_ctx = patch_fake_tensor_ctor()
+        self.fake_tensor_ctor_patch_ctx.__enter__()
         self.fake_mode.__enter__()
         tls.env = self
         self.loop_dependency_checker = LoopDependencyChecker()
@@ -431,6 +495,7 @@ def __exit__(
     ) -> None:
         tls.env = None
         self.fake_mode.__exit__(exc_type, exc_value, traceback)
+        self.fake_tensor_ctor_patch_ctx.__exit__(exc_type, exc_value, traceback)
 
     @staticmethod
     def current() -> CompileEnvironment:
 
@@ -575,6 +575,10 @@ def compute_shape(
         input_size = collections.deque(tensor.size())
         output_size = []
         env = CompileEnvironment.current()
+
+        tensor_indexers = [k for k in index if isinstance(k, torch.Tensor)]
+        should_broadcast = env.should_broadcast_tensor_indexers(tensor_indexers)
+
         k_index = 0
         for k in index:
             if k is None:
@@ -617,11 +621,14 @@ def compute_shape(
                 else:
                     output_size.append(1)
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and (
-                k.ndim == 1 or (len(index) == 1 and tensor.ndim == 1)
-            ):
-                input_size.popleft()
-                output_size.extend(k.size())
+            elif isinstance(k, torch.Tensor):
+                base_dim = input_size.popleft()
+                if not should_broadcast:
+                    output_size.extend(env.tensor_indexer_dims(k))
+                elif k is tensor_indexers[0]:
+                    output_size.extend(
+                        env.tensor_indexer_broadcast_shape(tensor_indexers)
+                    )
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(k)
@@ -667,13 +674,99 @@ def create(
         output_size = SubscriptIndexing.compute_shape(fake_value, index, state)
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
+        tensor_indexers = [k for k in index if isinstance(k, torch.Tensor)]
+        should_broadcast = env.should_broadcast_tensor_indexers(tensor_indexers)
+        broadcast_dims = 0
+        if should_broadcast:
+            broadcast_dims = len(env.tensor_indexer_broadcast_shape(tensor_indexers))
+            is_cartesian = (
+                broadcast_dims >= 2
+                and len(tensor_indexers) == broadcast_dims
+                and all(
+                    t.ndim == 1
+                    or sum(1 for d in t.size() if env.size_hint(d) != 1) <= 1
+                    for t in tensor_indexers
+                )
+            )
         if dtype == "tl.int32" and SubscriptIndexing._needs_int64(fake_value):
             raise exc.IndexOffsetOutOfRangeForInt32(env.index_dtype)
 
         def _is_size_one(size: int | torch.SymInt) -> bool:
             return env.known_equal(size, 1)
 
         k_index = 0
+
+        def tensor_index_source_and_mask(
+            index_elem: torch.Tensor, index_var: str, pos: int
+        ) -> tuple[str, int | None]:
+            """Pick the index expression and mask block id for a tensor indexer.
+
+            Args:
+                index_elem: The tensor used as an index.
+                index_var: Name of the lifted variable holding that index.
+                pos: Position in ``output_size`` this indexer maps to.
+
+            Returns:
+                ``(src, mask_id)``.  ``src`` is the codegen index variable of
+                the originating tile when ``index_elem`` has ``tile.index``
+                provenance, else ``index_var``.  ``mask_id`` is that tile's
+                block id, else the block id of ``output_size[pos]`` (``None``
+                if ``pos`` is out of range).
+            """
+            tile_id = env.get_tile_index_tensor_block_id(index_elem)
+            # Compare against None explicitly; 0 is a block id and must not
+            # be mistaken for "no provenance".
+            if tile_id is not None:
+                return state.codegen.index_var(tile_id), tile_id
+            mask_id = (
+                env.get_block_id(output_size[pos]) if pos < len(output_size) else None
+            )
+            return index_var, mask_id
+
+        def handle_broadcast_tensor(
+            position: int,
+            index_elem: torch.Tensor,
+            index_var: str,
+            cur_output_idx: int,
+        ) -> tuple[str, dict[str, None]]:
+            """Handle tensor index with broadcast shape (cartesian or general).
+
+            Args:
+                position: Index of this entry within the subscript; used to
+                    look up the padded-iota original length.
+                index_elem: The tensor used as an index.
+                index_var: Name of the lifted variable holding that index.
+                cur_output_idx: Current output-dim cursor of the caller.
+
+            Returns:
+                ``(idx_val, new_masks)``: the index expression string and an
+                insertion-ordered set (dict with ``None`` values) of mask
+                expression strings to merge into the caller's masks.
+            """
+            assert broadcast_dims > 0
+            tensor_idx = next(
+                i for i, t in enumerate(tensor_indexers) if t is index_elem
+            )
+            # The caller advances its cursor by ``broadcast_dims`` after the
+            # first tensor indexer, so later indexers step back by that much.
+            first_tensor_out_idx = (
+                cur_output_idx if tensor_idx == 0 else cur_output_idx - broadcast_dims
+            )
+            non_trivial_output_positions: list[int] = []
+            if is_cartesian:
+                pos = first_tensor_out_idx + tensor_idx
+                single_output_dim = True
+            else:
+                # Find position(s) where this tensor contributes non-trivial dims
+                offset = max(0, broadcast_dims - index_elem.ndim)
+                non_trivial_output_positions = [
+                    first_tensor_out_idx + offset + i
+                    for i in range(index_elem.ndim)
+                    if env.size_hint(index_elem.size(i)) != 1
+                ]
+                # An indexer whose dims are all size 1 has no non-trivial
+                # position; use its first right-aligned slot instead of
+                # raising IndexError on the empty list.
+                pos = (
+                    non_trivial_output_positions[0]
+                    if non_trivial_output_positions
+                    else first_tensor_out_idx + offset
+                )
+                single_output_dim = len(non_trivial_output_positions) <= 1
+
+            new_masks: dict[str, None] = {}
+            if single_output_dim:
+                src, _ = tensor_index_source_and_mask(index_elem, index_var, pos)
+                expand = (
+                    tile_strategy.expand_str(output_size, pos)
+                    if index_elem.ndim == 1
+                    else ""
+                )
+                idx_val = f"({src}){expand}"
+            else:
+                # Multi-dim tensor with multiple non-trivial dims
+                idx_val = f"({index_var})"
+                if tensor_idx == 0:
+                    for p in non_trivial_output_positions:
+                        # ``is not None`` rather than truthiness so the mask
+                        # for block id 0 is not dropped.
+                        if (
+                            p < len(output_size)
+                            and (bid := env.get_block_id(output_size[p])) is not None
+                            and (mv := state.codegen.mask_var(bid))
+                            and not _is_size_one(fake_value.size(len(index_values)))
+                        ):
+                            new_masks.setdefault(
+                                f"({mv}){tile_strategy.expand_str(output_size, p)}"
+                            )
+            # Padded iota mask
+            if (
+                orig_len := _get_padded_iota_original_length(state, position)
+            ) is not None:
+                new_masks.setdefault(
+                    f"(({index_var} < {orig_len}){tile_strategy.expand_str(output_size, first_tensor_out_idx + tensor_idx)})"
+                )
+            return idx_val, new_masks
+
         for n, k in enumerate(index):
             if k is None:
                 output_idx += 1
@@ -752,40 +845,41 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                         index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(output_size, output_idx)
+            elif isinstance(k, torch.Tensor):
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == len(index)
                 index_var = state.codegen.lift(ast_index[n], prefix="index").id
-                index_values.append(f"({index_var}){expand}")
-                if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
-                # Check if this index comes from a padded hl.arange and generate mask
-                if (
-                    original_length := _get_padded_iota_original_length(state, n)
-                ) is not None:
-                    mask_values.setdefault(f"({index_var} < {original_length}){expand}")
-                output_idx += 1
-                k_index += 1
-            elif (
-                isinstance(k, torch.Tensor) and len(index) == 1 and fake_value.ndim == 1
-            ):
-                # TODO(jansel): combine this case with the above
-                ast_index = state.ast_args[1]
-                assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == 1
-                index_var = state.codegen.lift(ast_index[0], prefix="index").id
-                index_values.append(index_var)
-                output_idx += k.ndim
-                for n, s in enumerate(output_size):
-                    if (block_idx := env.get_block_id(s)) is not None and (
-                        mask := state.codegen.mask_var(block_idx)
+
+                # Use broadcast handling for: multiple tensors, or single tensor with ndim > 1
+                if should_broadcast:
+                    idx_val, new_masks = handle_broadcast_tensor(
+                        n, k, index_var, output_idx
+                    )
+                    index_values.append(idx_val)
+                    mask_values.update(new_masks)
+                    if k is tensor_indexers[0]:
+                        output_idx += broadcast_dims
+                    k_index += 1
+                    continue
+
+                index_source, mask_block_id = tensor_index_source_and_mask(
+                    k, index_var, output_idx
+                )
+
+                expand = (
+                    tile_strategy.expand_str(output_size, output_idx)
+                    if k.ndim < len(output_size)
+                    else ""
+                )
+                index_values.append(f"({index_source}){expand}")
+                if mask_block_id is not None:
+                    mask_var = state.codegen.mask_var(mask_block_id)
+                    if mask_var and not _is_size_one(
+                        fake_value.size(len(index_values) - 1)
                     ):
-                        mask_values.setdefault(
-                            f"({mask}){tile_strategy.expand_str(output_size, n)}"
-                        )
+                        mask_values.setdefault(f"({mask_var}){expand}")
+
+                output_idx += k.ndim
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(type(k))