pytorch · danielvegamyhre · Aug 8, 2025 · Aug 11, 2025 · Aug 12, 2025 · Aug 13, 2025
diff --git a/benchmarks/float8/bench_grouped_mm.py b/benchmarks/float8/bench_grouped_mm.py
@@ -64,7 +64,7 @@ def run(
 
         # Run bf16 torch._grouped_mm baseline.
         A = torch.randn(M, K, device=device, dtype=dtype)
-        B = torch.randn(E, K, N, device=device, dtype=dtype)
+        B = torch.randn(E, N, K, device=device, dtype=dtype)
         offs = generate_jagged_offs(E, M)
         print(f"offs: {offs}")
         ref_time_sec, ref_tops_sec, ref_pct_top_peak = do_benchmarks(
@@ -73,7 +73,7 @@ def run(
             use_gpu_kernel_time,
             torch._grouped_mm,
             A,
-            B,
+            B.transpose(-2, -1),
             offs,
         )
         print(
@@ -84,12 +84,7 @@ def run(
 
         # Run scaled_grouped_mm.
         A_hp = torch.randn(M, K, device=device)
-        B_hp_t = (
-            torch.randn(E, K, N, device=device)
-            .transpose(-2, -1)
-            .contiguous()
-            .transpose(-2, -1)
-        )
+        B_hp_t = torch.randn(E, N, K, device=device).transpose(-2, -1)
 
         if recipe == "rowwise":
             # TODO: add e5m2

diff --git a/benchmarks/float8/utils.py b/benchmarks/float8/utils.py
@@ -219,7 +219,7 @@ def get_name_to_moe_shapes_iter(
     N: Optional[int] = None,
     E: Optional[int] = None,
 ):
-    M = 8192 if M is None else M
+    M = 16640 if M is None else M
     if shape_gen_name == "llama4_17bx16e":
         # num_experts=16, dim=5120
         names_to_shapes = {
@@ -232,8 +232,8 @@ def get_name_to_moe_shapes_iter(
         # num_experts=128, dim=5120
         names_to_shapes = {
             # M, K, N, E
-            "moe.experts.w1": (M, 5120, 8192, 128),
-            "moe.experts.w2": (M, 8192, 5120, 128),
+            "moe.experts.w1": (M, 5120, 4 * 5120, 128),
+            "moe.experts.w2": (M, 4 * 5120, 5120, 128),
         }
         return names_to_shapes.items()
     elif shape_gen_name == "custom":

diff --git a/benchmarks/prototype/moe_training/benchmark_kernels.py b/benchmarks/prototype/moe_training/benchmark_kernels.py