-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Jun Wang (jwanggit86) ChangesAllows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10. Full diff: https://github.com/llvm/llvm-project/pull/144782.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..c459c4df11e9e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -2504,6 +2505,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
+ multiclass VOP2_Real_dpp_gfx9<bits<6> op> {
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>;
+ }
+
} // End DecoderNamespace = "GFX9"
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>;
} // End AssemblerPredicate = isGFX9Only
defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index 3dcf288bbbaa5..bbd36a9140b96 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,15 @@ v_pk_fmac_f16 v5, -4.0, v2
v_pk_fmac_f16 v5, v1, v255
// GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16 v5, v1, v2
+// GFX10: encoding: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+
+v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX10: encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
new file mode 100644
index 0000000000000..f7dab2d0359dc
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+
+v_pk_fmac_f16 v5, v1, v2
+// CHECK-MI: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
index 1774efa4a65c7..4a7471e6c6f98 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
@@ -2476,3 +2476,10 @@
# W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
# W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
index 40b8f24e4d72f..233f93a5b8e7d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
@@ -222,3 +222,6 @@
# W32: v_cndmask_b32_dpp v0, v1, v2, vcc_lo dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
# W64: v_cndmask_b32_dpp v0, v1, v2, vcc dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
new file mode 100644
index 0000000000000..ac1ef4baa9aa4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s
+
+# CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2
+0x01,0x05,0x0a,0x78
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
|
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here, VOP2_Real_e32e64_gfx9 shall define all forms.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code updated as suggested.
let IsSingle = 1 in { | ||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | ||
} | ||
defm V_PK_FMAC_F16 : VOP2_Real_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Went back to previous commit since SDWA is not supported for this instruction in GFX10.
@@ -2560,6 +2558,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line and align the column.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf | ||
0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff | ||
|
||
# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sdwa disasm tests needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -2172,6 +2175,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You still need to have a singe macro.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pls see the new commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Not sure about that IsSingle though, if it is still needed. Please let Joe check it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should have [MC] in the commit title or message, since there is no codegen support.
IsSingle is fine, it suppresses the _e32 suffix on instructions with no VOP3 form, which is this one.
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.
This reverts commit f2d51a2.
f447b3a
to
39fcf97
Compare
Allows dpp in v_pk_fmac_f16 for GFX9, and both dpp and dpp8 for GFX10.
Allows dpp in v_pk_fmac_f16 for GFX9, and both dpp and dpp8 for GFX10.
# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | ||
0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x05 | ||
|
||
# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the test case that is causing me problems (see my other comment). I would expect to see only a neg
modifier (or -v2
) on this instruction. But the corresponding assembly test in llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s confirms that the instruction is printed by AMDGPUInstPrinter::printOperandAndIntInputMods
instead of MDGPUInstPrinter::printOperandAndFPInputMods
as I would expect.
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.