Skip to content

Commit 3a23ac4

Browse files
AMDGPU/GlobalISel: Add regbanklegalize rules for uniform global loads
1 parent 06a4394 commit 3a23ac4

File tree

2 files changed

+98
-0
lines changed

2 files changed

+98
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -671,6 +671,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
671671
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
672672
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
673673
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
674+
.Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
675+
.Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
676+
.Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
674677
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
675678
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
676679

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,95 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
3+
4+
; Two volatile i32 loads from global (addrspace(1)) pointers passed inreg; the
; results are added and the sum is stored through %ptr2. The CHECK lines expect
; each load to be a global_load_dword into a VGPR with the SGPR pair as base
; address, followed by v_readfirstlane_b32 to move the value into an SGPR so
; the add can be a scalar s_add_i32.
; NOTE(review): presumably `volatile` is what keeps these loads off the scalar
; load path (the `!isAlign4 || !isUL` rules) - confirm against the isUL predicate.
define amdgpu_ps void @uniform_load_32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1, ptr addrspace(1) inreg %ptr2) {
5+
; CHECK-LABEL: uniform_load_32:
6+
; CHECK: ; %bb.0:
7+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
8+
; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
9+
; CHECK-NEXT: s_waitcnt vmcnt(0)
10+
; CHECK-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
11+
; CHECK-NEXT: s_waitcnt vmcnt(0)
12+
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
13+
; CHECK-NEXT: v_readfirstlane_b32 s1, v2
14+
; CHECK-NEXT: s_add_i32 s0, s0, s1
15+
; CHECK-NEXT: v_mov_b32_e32 v1, s0
16+
; CHECK-NEXT: global_store_dword v0, v1, s[4:5]
17+
; CHECK-NEXT: s_endpgm
18+
%load0 = load volatile i32, ptr addrspace(1) %ptr0
19+
%load1 = load volatile i32, ptr addrspace(1) %ptr1
20+
%sum = add i32 %load0, %load1
21+
store i32 %sum, ptr addrspace(1) %ptr2
22+
ret void
23+
}
24+
25+
; One volatile <2 x i32> (64-bit) load from a global pointer passed inreg; both
; elements are extracted, added, and the sum is stored through %ptr1. The CHECK
; lines expect a single global_load_dwordx2 into v[0:1], then one
; v_readfirstlane_b32 per element to bring the values into SGPRs for s_add_i32.
; NOTE(review): presumably exercises the newly added UniB64 / UniInVgprB64
; rule - confirm against the rule table.
define amdgpu_ps void @uniform_load_64(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
26+
; CHECK-LABEL: uniform_load_64:
27+
; CHECK: ; %bb.0:
28+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
29+
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
30+
; CHECK-NEXT: s_waitcnt vmcnt(0)
31+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
32+
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
33+
; CHECK-NEXT: s_add_i32 s0, s0, s1
34+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
35+
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
36+
; CHECK-NEXT: s_endpgm
37+
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
38+
%elt0 = extractelement <2 x i32> %load, i32 0
39+
%elt1 = extractelement <2 x i32> %load, i32 1
40+
%sum = add i32 %elt0, %elt1
41+
store i32 %sum, ptr addrspace(1) %ptr1
42+
ret void
43+
}
44+
45+
; One volatile <3 x i32> (96-bit) load from a global pointer passed inreg; the
; three elements are extracted, summed, and the sum is stored through %ptr1.
; The CHECK lines expect a single global_load_dwordx3 into v[0:2], then one
; v_readfirstlane_b32 per element so the adds are scalar s_add_i32.
; NOTE(review): presumably exercises the newly added UniB96 / UniInVgprB96
; rule - confirm against the rule table.
define amdgpu_ps void @uniform_load_96(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
46+
; CHECK-LABEL: uniform_load_96:
47+
; CHECK: ; %bb.0:
48+
; CHECK-NEXT: v_mov_b32_e32 v3, 0
49+
; CHECK-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] glc dlc
50+
; CHECK-NEXT: s_waitcnt vmcnt(0)
51+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
52+
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
53+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
54+
; CHECK-NEXT: s_add_i32 s0, s0, s1
55+
; CHECK-NEXT: s_add_i32 s0, s0, s4
56+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
57+
; CHECK-NEXT: global_store_dword v3, v0, s[2:3]
58+
; CHECK-NEXT: s_endpgm
59+
%load = load volatile <3 x i32>, ptr addrspace(1) %ptr0
60+
%elt0 = extractelement <3 x i32> %load, i32 0
61+
%elt1 = extractelement <3 x i32> %load, i32 1
62+
%elt2 = extractelement <3 x i32> %load, i32 2
63+
%sum0 = add i32 %elt0, %elt1
64+
%sum = add i32 %sum0, %elt2
65+
store i32 %sum, ptr addrspace(1) %ptr1
66+
ret void
67+
}
68+
69+
; One volatile <4 x i32> (128-bit) load from a global pointer passed inreg; the
; four elements are extracted, summed, and the sum is stored through %ptr1.
; The CHECK lines expect a single global_load_dwordx4 into v[0:3], then one
; v_readfirstlane_b32 per element so the adds are scalar s_add_i32.
; NOTE(review): presumably exercises the newly added UniB128 / UniInVgprB128
; rule - confirm against the rule table.
define amdgpu_ps void @uniform_load_128(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
70+
; CHECK-LABEL: uniform_load_128:
71+
; CHECK: ; %bb.0:
72+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
73+
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] glc dlc
74+
; CHECK-NEXT: s_waitcnt vmcnt(0)
75+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
76+
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
77+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
78+
; CHECK-NEXT: v_readfirstlane_b32 s5, v3
79+
; CHECK-NEXT: s_add_i32 s0, s0, s1
80+
; CHECK-NEXT: s_add_i32 s0, s0, s4
81+
; CHECK-NEXT: s_add_i32 s0, s0, s5
82+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
83+
; CHECK-NEXT: global_store_dword v4, v0, s[2:3]
84+
; CHECK-NEXT: s_endpgm
85+
%load = load volatile <4 x i32>, ptr addrspace(1) %ptr0
86+
%elt0 = extractelement <4 x i32> %load, i32 0
87+
%elt1 = extractelement <4 x i32> %load, i32 1
88+
%elt2 = extractelement <4 x i32> %load, i32 2
89+
%elt3 = extractelement <4 x i32> %load, i32 3
90+
%sum0 = add i32 %elt0, %elt1
91+
%sum1 = add i32 %sum0, %elt2
92+
%sum = add i32 %sum1, %elt3
93+
store i32 %sum, ptr addrspace(1) %ptr1
94+
ret void
95+
}

0 commit comments

Comments
 (0)