[AMD] Use LLVM op for s8->bf16 conversion (triton-lang#6445)

yiqian1 · web-flow · commit 11b288c0afbf · 2025-04-09T21:54:31.000-07:00
- Use LLVM::SIToFPOp to replace the s8->bf16 conversion
  inline assembly.
- Enable bf16 conversions in test_core.py.
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -1905,8 +1905,12 @@ def test_cast(dtype_x, dtype_z, bitcast, size, num_ctas, device):
         check_type_supported(dtype_x, device)
         check_type_supported(dtype_z, device)
 
-    if is_hip() and (dtype_z in ("bfloat16", "float8_e4m3fn") or dtype_x == "float8_e4m3fn"):
-        pytest.skip(f'test_cast{(dtype_x, dtype_z)} cast to bfloat16 not supported on HIP.')
+    if is_hip():
+        if not is_hip_cdna3() and not is_hip_cdna4() and (dtype_x == 'float8_e4m3fn' or dtype_z == 'float8_e4m3fn'):
+            pytest.skip(f'test_cast{(dtype_x, dtype_z)} only supported on HIP CDNA3/CDNA4.')
+        if (not is_hip_cdna4()) and ((dtype_x == 'bfloat16' and dtype_z == "float8_e4m3fn") or
+                                     (dtype_x == "float8_e4m3fn" and dtype_z == 'bfloat16')):
+            pytest.skip(f'test_cast{(dtype_x, dtype_z)} only supported on HIP CDNA4.')
 
     torch.manual_seed(0)
     # This is tricky because numpy doesn't have bfloat, and torch doesn't have uints.
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -1494,24 +1494,12 @@ static SmallVector<Value> S8_to_Bf16(Location loc,
   SmallVector<Value> inValues = {v[0], v[1], v[2], v[3]};
   SmallVector<Value> outValues = {};
   for (Value inVal : inValues) {
-    Value i32Val = b.sext(i32_ty, inVal);
-
-    GCNBuilder builder;
-    auto &cvt = *builder.create("v_cvt_f32_i32");
-    auto res = builder.newOperand("=v");
-    auto operand = builder.newOperand(i32Val, "v");
-    cvt(res, operand);
-    auto f32Val = builder.launch(rewriter, loc, f32_ty, false);
-
-    f32Val = b.bitcast(f32Val, i32_ty);
-    auto shifted = b.lshr(i32_ty, f32Val, b.i32_val(16));
-    auto truncated = b.trunc(i16_ty, shifted);
-    outValues.push_back(b.bitcast(truncated, bf16_ty));
+    Value bf16Val = rewriter.create<LLVM::SIToFPOp>(loc, bf16_ty, inVal);
+    outValues.push_back(bf16Val);
   }
   return outValues;
 }
 
-// Uses inline ptx to convert s8/u8 to bf16, since the
 struct SIToFPOpConversion
     : ElementwiseOpConversionBase<arith::SIToFPOp, SIToFPOpConversion> {
   using ElementwiseOpConversionBase::ElementwiseOpConversionBase;