pytorch
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
Lines changed: 1 addition & 1 deletion
diff --git a/test/test_associative_scan.expected b/test/test_associative_scan.expected
Lines changed: 42 additions & 42 deletions
diff --git a/test/test_atomic_ops.expected b/test/test_atomic_ops.expected
Lines changed: 14 additions & 14 deletions
diff --git a/test/test_autotuner.expected b/test/test_autotuner.expected
Lines changed: 2 additions & 2 deletions
diff --git a/test/test_broadcasting.expected b/test/test_broadcasting.expected
Lines changed: 8 additions & 8 deletions
@@ -33,7 +33,7 @@
     from ..runtime.config import PidTypeLiteral
 
 DEFAULT_NUM_WARPS = 4
-DEFAULT_NUM_STAGES = 3
+DEFAULT_NUM_STAGES = 2
 VALID_KEYS: frozenset[str] = frozenset(
     [
         "block_sizes",
 
@@ -27,7 +27,7 @@ def atomic_add_2d_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default
     """Test atomic_add with 2D indexing."""
     _BLOCK_SIZE_0 = 8
     _BLOCK_SIZE_1 = 8
-    _launcher(_helion_atomic_add_2d_kernel, (triton.cdiv(y.size(0), _BLOCK_SIZE_0) * triton.cdiv(y.size(1), _BLOCK_SIZE_1),), y, x, y.size(0), y.size(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_2d_kernel, (triton.cdiv(y.size(0), _BLOCK_SIZE_0) * triton.cdiv(y.size(1), _BLOCK_SIZE_1),), y, x, y.size(0), y.size(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_add_1d_tensor)
@@ -59,7 +59,7 @@ def atomic_add_1d_tensor_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_
     z = torch.zeros([n], dtype=x.dtype, device=x.device)
     _BLOCK_SIZE_0 = 32
     _RDIM_SIZE_1 = 64
-    _launcher(_helion_atomic_add_1d_tensor_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), x, y, z, x.stride(0), x.stride(1), y.stride(0), y.stride(1), z.stride(0), m, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_1d_tensor_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), x, y, z, x.stride(0), x.stride(1), y.stride(0), y.stride(1), z.stride(0), m, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=2)
     return z
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_add_float)
@@ -82,7 +82,7 @@ def _helion_atomic_add_float_kernel(indices, x, indices_size_0, indices_stride_0
 def atomic_add_float_kernel(x: torch.Tensor, indices: torch.Tensor, *, _launcher=_default_launcher):
     """Test atomic_add with a float constant value and reading from lookup"""
     _BLOCK_SIZE_0 = 32
-    _launcher(_helion_atomic_add_float_kernel, (triton.cdiv(indices.size(0), _BLOCK_SIZE_0),), indices, x, indices.size(0), indices.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_float_kernel, (triton.cdiv(indices.size(0), _BLOCK_SIZE_0),), indices, x, indices.size(0), indices.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_add_returns_prev)
@@ -106,7 +106,7 @@ def _helion_k(x, y, prev, x_size_0, prev_stride_0, x_stride_0, y_stride_0, _BLOC
 def k(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     prev = torch.empty_like(x)
     _BLOCK_SIZE_0 = 8
-    _launcher(_helion_k, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, prev, x.size(0), prev.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_k, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, prev, x.size(0), prev.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return (x, prev)
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_add_w_tile_attr)
@@ -127,7 +127,7 @@ def atomic_add_w_tile_attr(x: torch.Tensor, *, _launcher=_default_launcher):
     """Test atomic_add where the index is a symbolic int"""
     y = torch.zeros_like(x, device=x.device, dtype=torch.int32)
     _BLOCK_SIZE_0 = 2
-    _launcher(_helion_atomic_add_w_tile_attr, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), y, y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_w_tile_attr, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), y, y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return y
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_and)
@@ -149,7 +149,7 @@ def _helion_atomic_and_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZ
 
 def atomic_and_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 8
-    _launcher(_helion_atomic_and_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_and_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_cas)
@@ -172,7 +172,7 @@ def _helion_atomic_cas_kernel(x, expect, y, x_size_0, expect_stride_0, x_stride_
 
 def atomic_cas_kernel(x: torch.Tensor, y: torch.Tensor, expect: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 4
-    _launcher(_helion_atomic_cas_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, expect, y, x.size(0), expect.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_cas_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, expect, y, x.size(0), expect.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_max)
@@ -194,7 +194,7 @@ def _helion_atomic_max_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZ
 
 def atomic_max_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 4
-    _launcher(_helion_atomic_max_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_max_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_min)
@@ -216,7 +216,7 @@ def _helion_atomic_min_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZ
 
 def atomic_min_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 4
-    _launcher(_helion_atomic_min_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_min_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_or)
@@ -238,7 +238,7 @@ def _helion_atomic_or_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZE
 
 def atomic_or_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 8
-    _launcher(_helion_atomic_or_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_or_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_xchg)
@@ -260,7 +260,7 @@ def _helion_atomic_xchg_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SI
 
 def atomic_xchg_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 8
-    _launcher(_helion_atomic_xchg_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_xchg_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_atomic_xor)
@@ -282,7 +282,7 @@ def _helion_atomic_xor_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZ
 
 def atomic_xor_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 8
-    _launcher(_helion_atomic_xor_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_xor_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_basic_atomic_add)
@@ -305,7 +305,7 @@ def _helion_atomic_add_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZ
 def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     """Test basic atomic_add functionality."""
     _BLOCK_SIZE_0 = 32
-    _launcher(_helion_atomic_add_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
 
 --- assertExpectedJournal(TestAtomicOperations.test_overlapping_atomic_add)
@@ -329,5 +329,5 @@ def _helion_atomic_add_overlap_kernel(indices, y, x, _BLOCK_SIZE_0: tl.constexpr
 def atomic_add_overlap_kernel(x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor, *, _launcher=_default_launcher):
     """Test atomic_add with overlapping indices."""
     _BLOCK_SIZE_0 = 32
-    _launcher(_helion_atomic_add_overlap_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_atomic_add_overlap_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return x
@@ -2,7 +2,7 @@ This file is automatically generated by assertExpectedJournal calls in test_auto
 Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
 
 --- assertExpectedJournal(TestAutotuner.test_config_fragment0)
-helion.Config(block_sizes=[16, 16, 16], indexing='pointer', l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=3, num_warps=4, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None])
+helion.Config(block_sizes=[16, 16, 16], indexing='pointer', l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=4, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None])
 helion.Config(block_sizes=[32, 128, 64], indexing='block_ptr', l2_groupings=[8], load_eviction_policies=['', ''], loop_orders=[[1, 0]], num_stages=8, num_warps=8, pid_type='persistent_blocked', range_flattens=[None, True], range_multi_buffers=[False, True], range_num_stages=[3, 0], range_unroll_factors=[1, 2], range_warp_specializes=[None, True])
 helion.Config(block_sizes=[16, 16, 16], indexing='tensor_descriptor', l2_groupings=[16], load_eviction_policies=['last', ''], loop_orders=[[0, 1]], num_stages=7, num_warps=4, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 3], range_warp_specializes=[None, False])
 helion.Config(block_sizes=[16, 32, 256], indexing='pointer', l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[1, 0]], num_stages=2, num_warps=16, pid_type='persistent_interleaved', range_flattens=[True, True], range_multi_buffers=[False, None], range_num_stages=[2, 4], range_unroll_factors=[2, 3], range_warp_specializes=[True, None])
@@ -14,7 +14,7 @@ helion.Config(block_sizes=[256, 16, 16], indexing='pointer', l2_groupings=[2], l
 helion.Config(block_sizes=[16, 64, 16], indexing='tensor_descriptor', l2_groupings=[8], load_eviction_policies=['last', ''], loop_orders=[[0, 1]], num_stages=3, num_warps=32, pid_type='persistent_interleaved', range_flattens=[True, False], range_multi_buffers=[False, None], range_num_stages=[3, 0], range_unroll_factors=[3, 4], range_warp_specializes=[False, True])
 
 --- assertExpectedJournal(TestAutotuner.test_config_fragment1)
-helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing='pointer', l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=3, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
+helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing='pointer', l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=2, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
 helion.Config(block_sizes=[1, 64, 64], flatten_loops=[False], indexing='tensor_descriptor', l2_groupings=[4], load_eviction_policies=['first', 'first'], loop_orders=[[1, 2, 0]], num_stages=4, num_warps=8, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[False], range_unroll_factors=[1], range_warp_specializes=[True])
 helion.Config(block_sizes=[2, 8, 512], flatten_loops=[True], indexing='tensor_descriptor', l2_groupings=[8], load_eviction_policies=['first', 'first'], loop_orders=[[2, 0, 1]], num_stages=2, num_warps=1, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
 helion.Config(block_sizes=[1, 512, 1], flatten_loops=[True], indexing='tensor_descriptor', l2_groupings=[1], load_eviction_policies=['', 'last'], loop_orders=[[0, 2, 1]], num_stages=5, num_warps=2, pid_type='persistent_blocked', range_flattens=[True], range_multi_buffers=[False], range_unroll_factors=[2], range_warp_specializes=[True])
 
@@ -34,7 +34,7 @@ def broadcast_fn(a, b, *, _launcher=_default_launcher):
     out1 = torch.empty_like(a)
     _BLOCK_SIZE_0 = 16
     _BLOCK_SIZE_1 = 8
-    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return (out0, out1)
 
 --- assertExpectedJournal(TestBroadcasting.test_broadcast2)
@@ -70,7 +70,7 @@ def broadcast_fn(a, b, *, _launcher=_default_launcher):
     out1 = torch.empty_like(a)
     _BLOCK_SIZE_1 = 8
     _BLOCK_SIZE_0 = 16
-    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(1), _BLOCK_SIZE_1) * triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(1), _BLOCK_SIZE_1) * triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return (out0, out1)
 
 --- assertExpectedJournal(TestBroadcasting.test_broadcast3)
@@ -104,7 +104,7 @@ def broadcast_fn(a, b, *, _launcher=_default_launcher):
     out0 = torch.empty_like(a)
     out1 = torch.empty_like(a)
     _BLOCK_SIZE_0 = 64
-    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * a.size(1),), a, b, out0, out1, a.size(0), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * a.size(1),), a, b, out0, out1, a.size(0), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return (out0, out1)
 
 --- assertExpectedJournal(TestBroadcasting.test_broadcast4)
@@ -138,7 +138,7 @@ def broadcast_fn(a, b, *, _launcher=_default_launcher):
     out0 = torch.empty_like(a)
     out1 = torch.empty_like(a)
     _BLOCK_SIZE_1 = 64
-    _launcher(_helion_broadcast_fn, (a.size(0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_broadcast_fn, (a.size(0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return (out0, out1)
 
 --- assertExpectedJournal(TestBroadcasting.test_broadcast5)
@@ -170,7 +170,7 @@ def broadcast_fn(a, b, *, _launcher=_default_launcher):
     out1 = torch.empty_like(a)
     _BLOCK_SIZE_0 = 32
     _BLOCK_SIZE_1 = 32
-    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), b.size(0), out0.size(0), out0.size(1), out1.size(0), out1.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_broadcast_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out0, out1, a.size(0), a.size(1), b.size(0), out0.size(0), out0.size(1), out1.size(0), out1.size(1), a.stride(0), a.stride(1), b.stride(0), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return (out0, out1)
 
 --- assertExpectedJournal(TestBroadcasting.test_constexpr_index)
@@ -212,7 +212,7 @@ def fn(a, idx1, *, _launcher=_default_launcher):
     out2 = torch.empty_like(a)
     _BLOCK_SIZE_0 = 16
     _BLOCK_SIZE_1 = 16
-    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, out0, out1, out2, a.size(0), a.size(1), a.stride(0), a.stride(1), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), out2.stride(0), out2.stride(1), idx1, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, out0, out1, out2, a.size(0), a.size(1), a.stride(0), a.stride(1), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), out2.stride(0), out2.stride(1), idx1, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return (out0, out1, out2)
 
 --- assertExpectedJournal(TestBroadcasting.test_implicit_broadcast)
@@ -244,7 +244,7 @@ def fn(a, b, *, _launcher=_default_launcher):
     out = torch.empty_like(a)
     _BLOCK_SIZE_0 = 16
     _BLOCK_SIZE_1 = 16
-    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out.stride(0), out.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out.stride(0), out.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return out
 
 --- assertExpectedJournal(TestBroadcasting.test_python_float_promotion)
@@ -265,5 +265,5 @@ def _helion_fn(a, a_size_0, a_stride_0, beta, _BLOCK_SIZE_0: tl.constexpr):
 
 def fn(a, beta, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 16
-    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, a.size(0), a.stride(0), beta, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, a.size(0), a.stride(0), beta, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return a
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`from ..runtime.config import PidTypeLiteral`
`34`	`34`
`35`	`35`	`DEFAULT_NUM_WARPS = 4`
`36`		`-DEFAULT_NUM_STAGES = 3`
	`36`	`+DEFAULT_NUM_STAGES = 2`
`37`	`37`	`VALID_KEYS: frozenset[str] = frozenset(`
`38`	`38`	`[`
`39`	`39`	`"block_sizes",`