integrate torch._scaled_mm into Float8BlockwiseLinear and add bench script

danielvegamyhre · danielvegamyhre · commit 07a35b692210 · 2025-08-17T09:16:44.000-07:00
stack-info: PR: #2785, branch: danielvegamyhre/stack/44
diff --git a/benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x128_gemms.py b/benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x128_gemms.py
@@ -58,7 +58,7 @@ def get_configs() -> List[ExperimentConfig]:
         (16640, 5120, 8192),
         (16640, 8192, 5120),
     ]
-    out_dtypes = [torch.float32, torch.bfloat16]
+    out_dtypes = [torch.bfloat16]
     configs = []
     for mnk, out_dtype in itertools.product(mnk_list, out_dtypes):
         m, n, k = mnk
diff --git a/benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x1_gemms.py b/benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x1_gemms.py
@@ -58,7 +58,7 @@ def get_configs() -> List[ExperimentConfig]:
         (16640, 5120, 8192),
         (16640, 8192, 5120),
     ]
-    out_dtypes = [torch.float32, torch.bfloat16]
+    out_dtypes = [torch.bfloat16]
     configs = []
     for mnk, out_dtype in itertools.product(mnk_list, out_dtypes):
         m, n, k = mnk
diff --git a/benchmarks/prototype/blockwise_fp8_training/bench_linear_fwd_bwd.py b/benchmarks/prototype/blockwise_fp8_training/bench_linear_fwd_bwd.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
+
+import itertools
+from dataclasses import dataclass
+from typing import List
+
+import torch
+from tabulate import tabulate
+from torch.nn import functional as F
+from tqdm import tqdm
+from triton.testing import do_bench
+
+from torchao.prototype.blockwise_fp8_training.linear import Float8BlockwiseLinear
+
+device = torch.device("cuda")
+
+# This benchmark requires CUDA 12.9+ (compared as a tuple so 13.0 and later pass)
+assert torch.version.cuda is not None, "CUDA is not available"
+cuda_major, cuda_minor = map(int, torch.version.cuda.split(".")[:2])
+assert (cuda_major, cuda_minor) >= (12, 9), "CUDA 12.9+ is required"
+
+# Needed since changing args to function causes recompiles
+torch._dynamo.config.cache_size_limit = 1000
+
+
+@dataclass(frozen=True)
+class ExperimentConfig:  # one benchmark case: problem shape plus dtype
+    out_dtype: torch.dtype  # dtype used for the inputs and for every linear layer
+    m: int  # number of input rows; inputs are created as (m, k)
+    n: int  # output width; layers are built as (k, n) and labels as (m, n)
+    k: int  # input width; second dim of inputs, first size arg of each layer
+
+
+@dataclass(frozen=True)
+class ExperimentResult:  # timings for one config, as returned by run_experiment
+    bf16_linear_us: float  # torch.nn.Linear fwd+bwd
+    fp8_triton_linear_us: float  # Float8BlockwiseLinear(use_triton=True) fwd+bwd
+    fp8_scaled_mm_linear_us: float  # Float8BlockwiseLinear(use_triton=False) fwd+bwd
+
+
+@dataclass(frozen=True)
+class Experiment:  # pairs a config with the result measured for it
+    config: ExperimentConfig
+    result: ExperimentResult
+
+
+def get_configs() -> List[ExperimentConfig]:
+    """Return one ExperimentConfig per (shape, out_dtype) pairing."""
+    # Llama4 shapes, each given as (M, N, K)
+    shapes = [
+        (16640, 5120, 8192),
+        (16640, 8192, 5120),
+    ]
+    dtypes = [torch.bfloat16]
+
+    return [
+        ExperimentConfig(
+            out_dtype=dtype,
+            m=m,
+            n=n,
+            k=k,
+        )
+        for m, n, k in shapes
+        for dtype in dtypes
+    ]
+
+
+def run_experiment(config: ExperimentConfig) -> ExperimentResult:
+    """Time forward+backward of bf16 and blockwise-fp8 linear layers.
+
+    Builds a torch.nn.Linear baseline and two Float8BlockwiseLinear modules
+    (use_triton=True / use_triton=False) for the shape and dtype in ``config``.
+
+    Returns:
+        ExperimentResult with the median fwd+bwd time of each module, in
+        microseconds.
+    """
+    M, N, K = config.m, config.n, config.k
+    # requires_grad=True so that backward also computes the input gradient for
+    # every module. Autograd skips that GEMM for nn.Linear when the input does
+    # not require grad, so the baseline would otherwise time less work.
+    inputs = torch.randn(
+        M, K, dtype=config.out_dtype, device="cuda", requires_grad=True
+    )
+    bf16_linear = torch.nn.Linear(K, N, dtype=config.out_dtype, device="cuda")
+    fp8_triton_linear = Float8BlockwiseLinear(
+        K, N, dtype=config.out_dtype, device="cuda", use_triton=True
+    )
+    fp8_scaled_mm_linear = Float8BlockwiseLinear(
+        K, N, dtype=config.out_dtype, device="cuda", use_triton=False
+    )
+
+    def warmup(func, *args, **kwargs):
+        # Untimed iterations run before each measurement.
+        for _ in range(10):
+            func(*args, **kwargs)
+
+    def fwd_bwd(func, inputs, labels, *args, **kwargs):
+        # One step: forward, MSE loss against labels, backward, then sync.
+        out = func(inputs, *args, **kwargs)
+        loss = F.mse_loss(out, labels)
+        loss.backward()
+        torch.cuda.synchronize()
+
+    # Constant targets with the output shape (M, N)
+    labels = inputs.new_empty(M, N).fill_(1.0)
+
+    # Warm up then run the bf16 nn.Linear baseline
+    warmup(fwd_bwd, bf16_linear, inputs, labels)
+    bf16_linear_us = benchmark_cuda_function_in_microseconds(
+        fwd_bwd, bf16_linear, inputs, labels
+    )
+
+    # Warm up then run the use_triton=True fp8 linear
+    warmup(fwd_bwd, fp8_triton_linear, inputs, labels)
+    fp8_triton_linear_us = benchmark_cuda_function_in_microseconds(
+        fwd_bwd, fp8_triton_linear, inputs, labels
+    )
+
+    # Warm up then run the use_triton=False fp8 linear
+    warmup(fwd_bwd, fp8_scaled_mm_linear, inputs, labels)
+    fp8_scaled_mm_linear_us = benchmark_cuda_function_in_microseconds(
+        fwd_bwd, fp8_scaled_mm_linear, inputs, labels
+    )
+
+    return ExperimentResult(
+        bf16_linear_us=bf16_linear_us,
+        fp8_triton_linear_us=fp8_triton_linear_us,
+        fp8_scaled_mm_linear_us=fp8_scaled_mm_linear_us,
+    )
+
+
+def print_results(experiments: List[Experiment]):
+    """Print one table row per experiment: shape, dtype and the three timings."""
+    headers = [
+        "M",
+        "N",
+        "K",
+        "out_dtype",
+        "bf16_mm_linear_us",
+        "fp8_triton_linear_us",
+        "fp8_scaled_mm_linear_us",
+    ]
+    # Row cells follow the order of `headers`.
+    rows = [
+        [
+            item.config.m,
+            item.config.n,
+            item.config.k,
+            item.config.out_dtype,
+            item.result.bf16_linear_us,
+            item.result.fp8_triton_linear_us,
+            item.result.fp8_scaled_mm_linear_us,
+        ]
+        for item in experiments
+    ]
+    print(tabulate(rows, headers=headers))
+
+
+def benchmark_cuda_function_in_microseconds(f, *args, **kwargs):
+    return do_bench(lambda: f(*args, **kwargs), return_mode="median") * 1e3  # presumably ms -> us
+
+
+def main():
+    """Seed the RNG, benchmark every config, then print the results table."""
+    torch.random.manual_seed(123)
+    experiments = [
+        Experiment(config=cfg, result=run_experiment(cfg))
+        for cfg in tqdm(get_configs())
+    ]
+
+    # Render all measurements as a table
+    print_results(experiments)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/torchao/prototype/blockwise_fp8_training/kernels.py b/torchao/prototype/blockwise_fp8_training/kernels.py
@@ -26,13 +26,6 @@
     for num_stages in [2, 4]
 ]
 
-# For fast compile times during development.
-dev_fp8_gemm_configs = [
-    triton.Config(
-        {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128}, num_warps=4, num_stages=3
-    ),
-]
-
 EPS = 1e-12
 
 
@@ -115,9 +108,9 @@ def blockwise_fp8_gemm_1x128_128x128(
         "a must be row-major, b must be column-major"
     )
 
-    # a_scales must be row-major, b_scales must be column-major
-    assert _is_row_major(a_s) and _is_column_major(b_s), (
-        "a_s must be row-major, b_s must be column-major"
+    # a_scales must be col-major, b_scales must be column-major
+    assert _is_column_major(a_s) and _is_column_major(b_s), (
+        "a_s must be col-major, b_s must be column-major"
     )
 
     M = a.size(0)
@@ -229,7 +222,9 @@ def blockwise_fp8_gemm_1x128_128x1(
 ):
     # 'a' must be in row-major layout, 'b' must be in column-major layout
     assert a.is_contiguous() and not b.is_contiguous()
-    assert a_s.is_contiguous() and b_s.is_contiguous()
+
+    # a_scales must be col-major (non-contiguous), b_scales must be contiguous
+    assert not a_s.is_contiguous() and b_s.is_contiguous()
     M = a.size(0)
     K = a.size(1)
     N = b.size(1)
@@ -260,6 +255,19 @@ def blockwise_fp8_gemm_1x128_128x1(
     return c
 
 
+# Autotuner configs for the quantization kernels: sweep num_warps x num_stages
+quant_kernel_configs = [
+    triton.Config(
+        {},
+        num_stages=num_stages,
+        num_warps=num_warps,
+    )
+    for num_warps in (4, 8)
+    for num_stages in (2, 4, 6)
+]
+
+
+@triton.autotune(configs=quant_kernel_configs, key=["K"])
 @triton.jit
 def fp8_blockwise_act_quant_lhs_kernel(
     x_ptr,
@@ -320,7 +328,11 @@ def fp8_blockwise_act_quant_lhs(
     ], "dtype must be torch.float8_e4m3fn"
     M, K = x.size()
     y = torch.empty_like(x, dtype=dtype)
-    s = x.new_empty(M, K // block_size, dtype=torch.float32)
+    # Write scales to column-major format to align with torch._scaled_mm requirements.
+    s = x.new_empty(M, K // block_size, dtype=torch.float32).as_strided(
+        (M, K // block_size),
+        (1, M),
+    )
     grid = lambda meta: (M, triton.cdiv(K, meta["BLOCK_SIZE"]))
     fp8_blockwise_act_quant_lhs_kernel[grid](
         x,
@@ -340,6 +352,7 @@ def fp8_blockwise_act_quant_lhs(
     return y, s
 
 
+@triton.autotune(configs=quant_kernel_configs, key=["K"])
 @triton.jit
 def fp8_blockwise_act_quant_rhs_kernel(
     x_ptr,
@@ -424,6 +437,7 @@ def fp8_blockwise_act_quant_rhs(
     return y, s
 
 
+@triton.autotune(configs=quant_kernel_configs, key=["K"])
 @triton.jit
 def fp8_blockwise_act_quant_transposed_lhs_kernel(
     x_ptr,
@@ -497,7 +511,13 @@ def fp8_blockwise_act_quant_transposed_lhs(
     # Output should have transposed dims and be in row major format
     M, K = x.shape
     y = torch.empty(K, M, dtype=dtype, device=x.device)
-    s = x.new_empty(K, triton.cdiv(M, block_size), dtype=torch.float32)
+    M_blocks = triton.cdiv(M, block_size)
+
+    # Column major scales required for torch._scaled_mm
+    s = x.new_empty(K, M_blocks, dtype=torch.float32).as_strided(
+        (K, M_blocks),  # shape
+        (1, K),  # stride
+    )
     grid = lambda meta: (
         triton.cdiv(M, meta["SCALE_BLOCK_SIZE"]),
         triton.cdiv(K, meta["BLOCK_SIZE_K"]),
@@ -522,6 +542,7 @@ def fp8_blockwise_act_quant_transposed_lhs(
     return y, s
 
 
+@triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
 @triton.jit
 def fp8_blockwise_weight_quant_rhs_kernel(
     x_ptr,
@@ -582,8 +603,10 @@ def fp8_blockwise_weight_quant_rhs(
     M, N = x.size()
     y = torch.empty_like(x, dtype=dtype)
     y = y.as_strided(y.size(), (1, y.size(0)))  # Column major
-    s = x.new_empty(
-        triton.cdiv(M, block_size), triton.cdiv(N, block_size), dtype=torch.float32
+    M_blocks, N_blocks = triton.cdiv(M, block_size), triton.cdiv(N, block_size)
+    s = x.new_empty(M_blocks, N_blocks, dtype=torch.float32).as_strided(
+        (M_blocks, N_blocks),  # shape
+        (1, M_blocks),  # stride
     )
     grid = lambda meta: (
         triton.cdiv(M, meta["BLOCK_SIZE"]),
@@ -607,6 +630,7 @@ def fp8_blockwise_weight_quant_rhs(
     return y, s
 
 
+@triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
 @triton.jit
 def fp8_blockwise_weight_quant_transposed_rhs_kernel(
     x_ptr,
diff --git a/torchao/prototype/blockwise_fp8_training/linear.py b/torchao/prototype/blockwise_fp8_training/linear.py

Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@ def get_configs() -> List[ExperimentConfig]:`
`58`	`58`	`(16640, 5120, 8192),`
`59`	`59`	`(16640, 8192, 5120),`
`60`	`60`	`]`
`61`		`- out_dtypes = [torch.float32, torch.bfloat16]`
	`61`	`+ out_dtypes = [torch.bfloat16]`
`62`	`62`	`configs = []`
`63`	`63`	`for mnk, out_dtype in itertools.product(mnk_list, out_dtypes):`
`64`	`64`	`m, n, k = mnk`