
Commit b530394

knwngwdziurdz authored and committed
[Bench][AMD] Support Padding and Unswizzling Scale on CDNA4 (#8803)

This PR supports:

- the `CDNA4MXScaleLayout.unswizzle_data` method, used in the GPT-OSS model
- padding tensors with 0 when doing scale preshuffling

Signed-off-by: Witold Dziurdz <[email protected]>
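For a quick sense of what the change enables, here is a minimal round-trip sketch (my own illustration, not code from this commit; the unaligned shape mirrors the new test below):

```python
import torch
from triton_kernels.tensor_details.layout import CDNA4MXScaleLayout

# Deliberately unaligned trailing dims: K_SCALE=254 is not a multiple of 8,
# N=60 is not a multiple of 32, so swizzle_data has to pad before preshuffling.
scales = torch.randint(0, 256, (2, 254, 60), dtype=torch.uint8, device="cuda")
layout = CDNA4MXScaleLayout(scales.shape)

swizzled = layout.swizzle_data(scales)      # zero-pads to (2, 256, 64), then preshuffles
restored = layout.unswizzle_data(swizzled)  # inverts the shuffle and strips the padding
assert torch.equal(restored, scales)
```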
1 parent bc0c8de commit b530394

File tree

3 files changed: +44 −13 lines changed


python/triton_kernels/tests/test_matmul.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -369,8 +369,6 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, has_y_gamm
             pytest.skip("Scale preshuffling on AMD GPU has not been emulated on non-CDNA4 arch yet.")
         if "mx" not in weight_dtype_str:
             pytest.skip("Non-scale swizzling not supported on CDNA4 yet")
-        if n % 32 != 0 or k % (32 * 8) != 0:
-            pytest.skip(f"Shape {m}x{n}x{k} is not supported for scale swizzling on AMD GPU")
     if is_cuda():
         if torch.cuda.get_device_capability()[0] < 9:
             pytest.skip("NYI. Ampere swizzling.")
```
Lines changed: 24 additions & 0 deletions
```diff
@@ -0,0 +1,24 @@
+import pytest
+import torch
+from triton_kernels.tensor_details.layout import CDNA4MXScaleLayout
+
+# ------------------------------------------------------------
+# Torch tests
+# ------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (3, 4096, 1024),
+        (10, 254, 60),
+        (1, 320, 160),
+        (2, 16, 512),
+        (3, 2, 36),
+    ],
+)
+def test_mxfp4_scale_roundtrip(shape):
+    x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
+    layout = CDNA4MXScaleLayout(x.shape)
+    res = layout.unswizzle_data(layout.swizzle_data(x))
+    assert (res == x).all()
```
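Two of the parametrized shapes, `(10, 254, 60)` and `(3, 2, 36)`, are deliberately unaligned, so the round-trip only passes because of the new padding. The rounding they exercise (a quick arithmetic check; `pad_to_multiple` is a hypothetical helper, not part of the diff):

```python
import math

def pad_to_multiple(x: int, align: int) -> int:
    # the same rounding the new __init__ uses: math.ceil(x / align) * align
    return math.ceil(x / align) * align

# (K_SCALE, N) -> (K_SCALE_pad, N_pad) under ALIGN_K_SCALE=8, ALIGN_N=32
assert (pad_to_multiple(254, 8), pad_to_multiple(60, 32)) == (256, 64)  # shape (10, 254, 60)
assert (pad_to_multiple(2, 8), pad_to_multiple(36, 32)) == (8, 64)      # shape (3, 2, 36)
```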

python/triton_kernels/triton_kernels/tensor_details/layout_details/cdna4_scale.py

Lines changed: 20 additions & 11 deletions
```diff
@@ -1,3 +1,5 @@
+import math
+import torch
 from dataclasses import dataclass
 import triton
 import triton.language as tl
@@ -12,24 +14,31 @@ class CDNA4MXScaleLayout(Layout):
 
     def __init__(self, shape) -> None:
         super().__init__(shape)
+        (
+            *self.leading_shape,
+            self.K_SCALE,
+            self.N,
+        ) = shape
+        self.B = math.prod(self.leading_shape)
+        self.ALIGN_K_SCALE = 8
+        self.ALIGN_N = 32
+        self.K_SCALE_pad = math.ceil(self.K_SCALE / self.ALIGN_K_SCALE) * self.ALIGN_K_SCALE
+        self.N_pad = math.ceil(self.N / self.ALIGN_N) * self.ALIGN_N
 
     def swizzle_data(self, data):
-        block_shape = data.shape
-        SCALE_K = block_shape[-2]
-        N = block_shape[-1]
+        data = torch.nn.functional.pad(data, (0, self.N_pad - self.N, 0, self.K_SCALE_pad - self.K_SCALE))
         data = data.transpose(-1, -2)
-        data = data.view(-1, N // NON_K_PRESHUFFLE_BLOCK_SIZE, 2, 16, SCALE_K // 8, 2, 4, 1)
+        data = data.view(-1, self.N_pad // NON_K_PRESHUFFLE_BLOCK_SIZE, 2, 16, self.K_SCALE_pad // 8, 2, 4, 1)
         data = data.permute(0, 1, 4, 6, 3, 5, 2, 7).contiguous()
-        if len(block_shape) == 3:
-            E = block_shape[0]
-            data = data.reshape(E, N // 32, SCALE_K * 32)
-        else:
-            assert len(block_shape) == 2
-            data = data.reshape(N // 32, SCALE_K * 32)
+        data = data.reshape(self.B, self.N_pad // 32, self.K_SCALE_pad * 32)
         return data.transpose(-1, -2)
 
     def unswizzle_data(self, data):
-        raise NotImplementedError()
+        data = data.transpose(-1, -2)
+        data = data.view(-1, self.N_pad // NON_K_PRESHUFFLE_BLOCK_SIZE, self.K_SCALE_pad // 8, 4, 16, 2, 2, 1)
+        data = data.permute(0, 1, 6, 4, 2, 5, 3, 7)
+        data = data.reshape(*self.leading_shape, self.N_pad, self.K_SCALE_pad)
+        return data.transpose(-1, -2)[..., :self.K_SCALE, :self.N]
 
     def swizzle_block_shape(self, block_shape):
         SCALE_K = block_shape[-2]
```
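A useful way to read the new `unswizzle_data`: its `permute` indices are the inverse of the `swizzle_data` permutation, and its 8-D `view` lists the swizzle view's dimensions in permuted order. A small self-contained check of that bookkeeping (plain Python, not part of the diff):

```python
swizzle_perm = (0, 1, 4, 6, 3, 5, 2, 7)    # permute() in swizzle_data
unswizzle_perm = (0, 1, 6, 4, 2, 5, 3, 7)  # permute() in unswizzle_data

# unswizzle_perm is the inverse permutation of swizzle_perm:
assert tuple(swizzle_perm.index(i) for i in range(8)) == unswizzle_perm

# The unswizzle view is the swizzle view read in swizzle_perm order
# (B = batch, N32 = N_pad // NON_K_PRESHUFFLE_BLOCK_SIZE, K8 = K_SCALE_pad // 8):
swizzle_view = ("B", "N32", 2, 16, "K8", 2, 4, 1)
assert tuple(swizzle_view[p] for p in swizzle_perm) == ("B", "N32", "K8", 4, 16, 2, 2, 1)
```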
