|
23 | 23 | #include "GCNSubtarget.h"
|
24 | 24 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
|
25 | 25 | #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
|
| 26 | +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
| 27 | +#include "llvm/CodeGen/GlobalISel/Utils.h" |
26 | 28 | #include "llvm/CodeGen/MachineFunctionPass.h"
|
27 | 29 | #include "llvm/CodeGen/MachineUniformityAnalysis.h"
|
28 | 30 | #include "llvm/CodeGen/TargetPassConfig.h"
|
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
|
115 | 117 | VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
|
116 | 118 | VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
|
117 | 119 |
|
118 |
| - bool isLaneMask(Register Reg) { |
119 |
| - const RegisterBank *RB = MRI.getRegBankOrNull(Reg); |
120 |
| - if (RB && RB->getID() == AMDGPU::VCCRegBankID) |
121 |
| - return true; |
| 120 | + bool isLaneMask(Register Reg); |
| 121 | + std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); |
| 122 | + std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); |
| 123 | + Register getReadAnyLaneSrc(Register Src); |
| 124 | + void replaceRegWithOrBuildCopy(Register Dst, Register Src); |
122 | 125 |
|
123 |
| - const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); |
124 |
| - return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); |
125 |
| - } |
| 126 | + bool tryEliminateReadAnyLane(MachineInstr &Copy); |
| 127 | + void tryCombineCopy(MachineInstr &MI); |
| 128 | + void tryCombineS1AnyExt(MachineInstr &MI); |
| 129 | +}; |
126 | 130 |
|
127 |
| - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { |
128 |
| - MI.eraseFromParent(); |
129 |
| - if (Optional0 && isTriviallyDead(*Optional0, MRI)) |
130 |
| - Optional0->eraseFromParent(); |
131 |
| - } |
| 131 | +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { |
| 132 | + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); |
| 133 | + if (RB && RB->getID() == AMDGPU::VCCRegBankID) |
| 134 | + return true; |
132 | 135 |
|
133 |
| - std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { |
134 |
| - MachineInstr *MatchMI = MRI.getVRegDef(Src); |
135 |
| - if (MatchMI->getOpcode() != Opcode) |
136 |
| - return {nullptr, Register()}; |
137 |
| - return {MatchMI, MatchMI->getOperand(1).getReg()}; |
138 |
| - } |
| 136 | + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); |
| 137 | + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); |
| 138 | +} |
139 | 139 |
|
140 |
| - void tryCombineCopy(MachineInstr &MI) { |
141 |
| - Register Dst = MI.getOperand(0).getReg(); |
142 |
| - Register Src = MI.getOperand(1).getReg(); |
143 |
| - // Skip copies of physical registers. |
144 |
| - if (!Dst.isVirtual() || !Src.isVirtual()) |
145 |
| - return; |
146 |
| - |
147 |
| - // This is a cross bank copy, sgpr S1 to lane mask. |
148 |
| - // |
149 |
| - // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) |
150 |
| - // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) |
151 |
| - // -> |
152 |
| - // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) |
153 |
| - if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { |
154 |
| - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); |
155 |
| - assert(Trunc && MRI.getType(TruncS32Src) == S32 && |
156 |
| - "sgpr S1 must be result of G_TRUNC of sgpr S32"); |
157 |
| - |
158 |
| - B.setInstr(MI); |
159 |
| - // Ensure that truncated bits in BoolSrc are 0. |
160 |
| - auto One = B.buildConstant({SgprRB, S32}, 1); |
161 |
| - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); |
162 |
| - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); |
163 |
| - cleanUpAfterCombine(MI, Trunc); |
164 |
| - return; |
165 |
| - } |
| 140 | +std::pair<MachineInstr *, Register> |
| 141 | +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { |
| 142 | + MachineInstr *MatchMI = MRI.getVRegDef(Src); |
| 143 | + if (MatchMI->getOpcode() != Opcode) |
| 144 | + return {nullptr, Register()}; |
| 145 | + return {MatchMI, MatchMI->getOperand(1).getReg()}; |
| 146 | +} |
| 147 | + |
| 148 | +std::pair<GUnmerge *, int> |
| 149 | +AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { |
| 150 | + MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); |
| 151 | + if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) |
| 152 | + return {nullptr, -1}; |
| 153 | + |
| 154 | + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); |
| 155 | + if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) |
| 156 | + return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; |
166 | 157 |
|
167 |
| - // Src = G_AMDGPU_READANYLANE RALSrc |
168 |
| - // Dst = COPY Src |
169 |
| - // -> |
170 |
| - // Dst = RALSrc |
171 |
| - if (MRI.getRegBankOrNull(Dst) == VgprRB && |
172 |
| - MRI.getRegBankOrNull(Src) == SgprRB) { |
173 |
| - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); |
174 |
| - if (!RAL) |
175 |
| - return; |
176 |
| - |
177 |
| - assert(MRI.getRegBank(RALSrc) == VgprRB); |
178 |
| - MRI.replaceRegWith(Dst, RALSrc); |
179 |
| - cleanUpAfterCombine(MI, RAL); |
180 |
| - return; |
| 158 | + return {nullptr, -1}; |
| 159 | +} |
| 160 | + |
// Walk through the patterns RegBankLegalize creates around
// G_AMDGPU_READANYLANE and return the original vgpr source that Src was read
// from, or a null Register if Src is not (a piece of) a readanylane result.
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
  // Src = G_AMDGPU_READANYLANE RALSrc
  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
  if (RAL)
    return RALSrc;

  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
  // Src G_MERGE_VALUES LoSgpr, HiSgpr
  auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
  if (Merge) {
    unsigned NumElts = Merge->getNumSources();
    // First element must come from def #0 of an unmerge whose def count
    // matches the merge's source count.
    auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
    if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
      return {};

    // Check if all elements are from same unmerge and there is no shuffling.
    for (unsigned i = 1; i < NumElts; ++i) {
      auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
      if (UnmergeI != Unmerge || (unsigned)IdxI != i)
        return {};
    }
    // Whole-value round trip: the merge reassembles exactly the unmerge's
    // input, so that input is the readanylane source.
    return Unmerge->getSourceReg();
  }

  // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
  // SgprI = G_AMDGPU_READANYLANE VgprI
  // SgprLarge G_MERGE_VALUES ..., SgprI, ...
  // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
  auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
  if (!UnMerge)
    return {};

  int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
  Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
  if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
    return {};

  // The piece at the same index must have the same type, otherwise the
  // element boundaries of merge and unmerge do not line up.
  Register SrcRegIdx = Merge->getSourceReg(Idx);
  if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
    return {};

  auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
  if (RALEl)
    return RALElSrc;

  return {};
}
| 210 | + |
| 211 | +void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst, |
| 212 | + Register Src) { |
| 213 | + if (Dst.isVirtual()) |
| 214 | + MRI.replaceRegWith(Dst, Src); |
| 215 | + else |
| 216 | + B.buildCopy(Dst, Src); |
| 217 | +} |
| 218 | + |
// Try to eliminate a sgpr->vgpr COPY whose source traces back (possibly
// through a G_BITCAST and/or a matching merge/unmerge chain) to a
// G_AMDGPU_READANYLANE, by forwarding the original vgpr value instead.
// Returns true if the copy was combined away.
bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
    MachineInstr &Copy) {
  Register Dst = Copy.getOperand(0).getReg();
  Register Src = Copy.getOperand(1).getReg();

  // Skip non-vgpr Dst. Physical registers carry no register bank, so query
  // TRI for those instead of MRI.
  if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
                      : !TRI.isVGPR(MRI, Dst))
    return false;

  // Skip physical source registers and source registers with register class
  if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
    return false;

  // Look through an optional bitcast between the readanylane pattern and the
  // copy.
  Register RALDst = Src;
  MachineInstr &SrcMI = *MRI.getVRegDef(Src);
  if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
    RALDst = SrcMI.getOperand(1).getReg();

  Register RALSrc = getReadAnyLaneSrc(RALDst);
  if (!RALSrc)
    return false;

  B.setInstr(Copy);
  if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
    // Src = READANYLANE RALSrc     Src = READANYLANE RALSrc
    // Dst = Copy Src               $Dst = Copy Src
    // ->                           ->
    // Dst = RALSrc                 $Dst = Copy RALSrc
    replaceRegWithOrBuildCopy(Dst, RALSrc);
  } else {
    // RALDst = READANYLANE RALSrc  RALDst = READANYLANE RALSrc
    // Src = G_BITCAST RALDst       Src = G_BITCAST RALDst
    // Dst = Copy Src               Dst = Copy Src
    // ->                           ->
    // NewVgpr = G_BITCAST RALDst   NewVgpr = G_BITCAST RALDst
    // Dst = NewVgpr                $Dst = Copy NewVgpr
    auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
    replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
  }

  // eraseInstr also cleans up now-dead instructions feeding the copy.
  eraseInstr(Copy, MRI);
  return true;
}
| 263 | + |
// Combine COPY instructions: first attempt the readanylane elimination, then
// the sgpr-s1-to-lane-mask rewrite below.
void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
  if (tryEliminateReadAnyLane(MI))
    return;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  // Skip copies of physical registers.
  if (!Dst.isVirtual() || !Src.isVirtual())
    return;

  // This is a cross bank copy, sgpr S1 to lane mask.
  //
  // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
  // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
  // ->
  // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
  if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
    auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
    assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
           "sgpr S1 must be result of G_TRUNC of sgpr S32");

    B.setInstr(MI);
    // Ensure that truncated bits in BoolSrc are 0.
    auto One = B.buildConstant({SgprRB, S32}, 1);
    auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
    B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
    eraseInstr(MI, MRI);
  }
}
208 | 293 |
|
209 |
| - if (DstTy == S32 && TruncSrcTy == S64) { |
210 |
| - auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); |
211 |
| - MRI.replaceRegWith(Dst, Unmerge.getReg(0)); |
212 |
| - cleanUpAfterCombine(MI, Trunc); |
213 |
| - return; |
214 |
| - } |
// Fold a G_ANYEXT of an sgpr s1 that was produced by G_TRUNC, rewriting the
// pair into a direct use of the pre-trunc value (adjusted to the destination
// width). The legalizer only creates the type combinations handled below.
void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
  // %Src:sgpr(S1) = G_TRUNC %TruncSrc
  // %Dst = G_ANYEXT %Src:sgpr(S1)
  // ->
  // %Dst = G_... %TruncSrc
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  if (MRI.getType(Src) != S1)
    return;

  auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
  if (!Trunc)
    return;

  LLT DstTy = MRI.getType(Dst);
  LLT TruncSrcTy = MRI.getType(TruncSrc);

  // Same width: trunc + anyext is a no-op, forward TruncSrc directly.
  if (DstTy == TruncSrcTy) {
    MRI.replaceRegWith(Dst, TruncSrc);
    eraseInstr(MI, MRI);
    return;
  }

  B.setInstr(MI);

  // S64 source, S32 dest: take the low half of TruncSrc.
  if (DstTy == S32 && TruncSrcTy == S64) {
    auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
    MRI.replaceRegWith(Dst, Unmerge.getReg(0));
    eraseInstr(MI, MRI);
    return;
  }

  // S32 source, S64 dest: anyext leaves the high bits undefined, so the high
  // half can be undef.
  if (DstTy == S64 && TruncSrcTy == S32) {
    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {TruncSrc, B.buildUndef({SgprRB, S32})});
    eraseInstr(MI, MRI);
    return;
  }

  if (DstTy == S32 && TruncSrcTy == S16) {
    B.buildAnyExt(Dst, TruncSrc);
    eraseInstr(MI, MRI);
    return;
  }

  if (DstTy == S16 && TruncSrcTy == S32) {
    B.buildTrunc(Dst, TruncSrc);
    eraseInstr(MI, MRI);
    return;
  }

  llvm_unreachable("missing anyext + trunc combine");
}
238 | 347 |
|
239 | 348 | // Search through MRI for virtual registers with sgpr register bank and S1 LLT.
|
240 | 349 | [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
|
|
0 commit comments