AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize #145911

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 1 commit into base: users/petar-avramovic/rbl-ral-tests
317 changes: 213 additions & 104 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};

bool isLaneMask(Register Reg) {
const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
if (RB && RB->getID() == AMDGPU::VCCRegBankID)
return true;
bool isLaneMask(Register Reg);
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
Register getReadAnyLaneSrc(Register Src);
void replaceRegWithOrBuildCopy(Register Dst, Register Src);

const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
}
bool tryEliminateReadAnyLane(MachineInstr &Copy);
void tryCombineCopy(MachineInstr &MI);
void tryCombineS1AnyExt(MachineInstr &MI);
};

void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
MI.eraseFromParent();
if (Optional0 && isTriviallyDead(*Optional0, MRI))
Optional0->eraseFromParent();
}
bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
if (RB && RB->getID() == AMDGPU::VCCRegBankID)
return true;

std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
MachineInstr *MatchMI = MRI.getVRegDef(Src);
if (MatchMI->getOpcode() != Opcode)
return {nullptr, Register()};
return {MatchMI, MatchMI->getOperand(1).getReg()};
}
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
}

void tryCombineCopy(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
// Skip copies of physical registers.
if (!Dst.isVirtual() || !Src.isVirtual())
return;

// This is a cross bank copy, sgpr S1 to lane mask.
//
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
// ->
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
"sgpr S1 must be result of G_TRUNC of sgpr S32");

B.setInstr(MI);
// Ensure that truncated bits in BoolSrc are 0.
auto One = B.buildConstant({SgprRB, S32}, 1);
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
cleanUpAfterCombine(MI, Trunc);
return;
}
std::pair<MachineInstr *, Register>
AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
MachineInstr *MatchMI = MRI.getVRegDef(Src);
if (MatchMI->getOpcode() != Opcode)
return {nullptr, Register()};
return {MatchMI, MatchMI->getOperand(1).getReg()};
}
Comment on lines +140 to +146

Collaborator:
Is there a better name for this than tryMatch? Come to think of it, can the generic matching infrastructure be used for this?

Collaborator Author:
We could use mi_match; this version is shorter because we use auto instead of declaring what we want to capture, and to me at least it has nicer formatting.
How about matchInstAndGetSrc?

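For reference, a hypothetical sketch (not part of this patch) of what the mi_match form could look like for the G_TRUNC case, assuming the surrounding combiner context (Src and MRI as in tryCombineCopy below); generic matchers only cover generic opcodes, so a target opcode like G_AMDGPU_READANYLANE would still need a custom matcher:

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
using namespace llvm::MIPatternMatch;

// Sketch only: match %Src = G_TRUNC %TruncS32Src and bind the source.
Register TruncS32Src;
if (mi_match(Src, MRI, m_GTrunc(m_Reg(TruncS32Src)))) {
  // TruncS32Src is now bound to the source register of the G_TRUNC.
}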

std::pair<GUnmerge *, int>
AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
return {nullptr, -1};

Register RALSrc = ReadAnyLane->getOperand(1).getReg();
if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};

// Src = G_AMDGPU_READANYLANE RALSrc
// Dst = COPY Src
// ->
// Dst = RALSrc
if (MRI.getRegBankOrNull(Dst) == VgprRB &&
MRI.getRegBankOrNull(Src) == SgprRB) {
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
if (!RAL)
return;

assert(MRI.getRegBank(RALSrc) == VgprRB);
MRI.replaceRegWith(Dst, RALSrc);
cleanUpAfterCombine(MI, RAL);
return;
return {nullptr, -1};
}

Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
// Src = G_AMDGPU_READANYLANE RALSrc
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
if (RAL)
return RALSrc;

// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr
// Src = G_MERGE_VALUES LoSgpr, HiSgpr
auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
if (Merge) {
unsigned NumElts = Merge->getNumSources();
auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
return {};

// Check if all elements are from same unmerge and there is no shuffling.
for (unsigned i = 1; i < NumElts; ++i) {
auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
if (UnmergeI != Unmerge || (unsigned)IdxI != i)
return {};
}
return Unmerge->getSourceReg();
}

void tryCombineS1AnyExt(MachineInstr &MI) {
// %Src:sgpr(S1) = G_TRUNC %TruncSrc
// %Dst = G_ANYEXT %Src:sgpr(S1)
// ->
// %Dst = G_... %TruncSrc
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
if (MRI.getType(Src) != S1)
return;

auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
if (!Trunc)
return;

LLT DstTy = MRI.getType(Dst);
LLT TruncSrcTy = MRI.getType(TruncSrc);

if (DstTy == TruncSrcTy) {
MRI.replaceRegWith(Dst, TruncSrc);
cleanUpAfterCombine(MI, Trunc);
return;
}
// SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
// SourceReg = G_MERGE_VALUES ..., SrcRegIdx, ...
// ..., Src, ... = G_UNMERGE_VALUES SourceReg
auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
if (!UnMerge)
return {};

int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
return {};

Register SrcRegIdx = Merge->getSourceReg(Idx);
if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
return {};

auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
if (RALEl)
return RALElSrc;

return {};
}

void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
Register Src) {
if (Dst.isVirtual())
MRI.replaceRegWith(Dst, Src);
else
B.buildCopy(Dst, Src);
}

bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
MachineInstr &Copy) {
Register Dst = Copy.getOperand(0).getReg();
Register Src = Copy.getOperand(1).getReg();

// Skip non-vgpr Dst
if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
: !TRI.isVGPR(MRI, Dst))
return false;

// Skip physical source registers and source registers with register class
Collaborator Author:
> This shouldn't happen?

It is quite common. Physical source registers come from lowering the formal arguments of a function, e.g.
  %0:_(s32) = COPY $vgpr0
Source registers with a register class come from amdgpu-regbankselect (the pass) isolating registers that have register classes from being used in g_instructions, e.g.
  %1:sgpr(s32) = COPY %0:sreg_32(s32)
The register class comes from selecting an instruction and constraining the register classes of all its operands.

if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
return false;

Register RALDst = Src;
MachineInstr &SrcMI = *MRI.getVRegDef(Src);
if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
RALDst = SrcMI.getOperand(1).getReg();

Register RALSrc = getReadAnyLaneSrc(RALDst);
if (!RALSrc)
return false;

B.setInstr(Copy);
if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
// Src = READANYLANE RALSrc    Src = READANYLANE RALSrc
// Dst = Copy Src              $Dst = Copy Src
// ->                          ->
// Dst = RALSrc                $Dst = Copy RALSrc
replaceRegWithOrBuildCopy(Dst, RALSrc);
} else {
// RALDst = READANYLANE RALSrc   RALDst = READANYLANE RALSrc
// Src = G_BITCAST RALDst        Src = G_BITCAST RALDst
// Dst = Copy Src                Dst = Copy Src
// ->                            ->
// NewVgpr = G_BITCAST RALDst    NewVgpr = G_BITCAST RALDst
// Dst = NewVgpr                 $Dst = Copy NewVgpr
auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
}

eraseInstr(Copy, MRI);
return true;
}

void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
if (tryEliminateReadAnyLane(MI))
return;

Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
// Skip copies of physical registers.
if (!Dst.isVirtual() || !Src.isVirtual())
return;

// This is a cross bank copy, sgpr S1 to lane mask.
//
// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
// ->
// %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1
// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)
if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
"sgpr S1 must be result of G_TRUNC of sgpr S32");
Comment on lines +281 to +283

Collaborator:
Is this really guaranteed? Couldn't the source program have an LLVM IR trunc from i64 to i1?

Collaborator Author:
You are correct, it is not guaranteed. I was trying to make one with a trunc from i64 to i1; here is a test that actually hits that assert:

define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = trunc i32 %counter to i1
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

The original idea behind that assert: the trunc is created by regbanklegalize itself. A legal i1 used by something that is lowered by divergence lowering; a uniform i1 is lowered as an sgpr S32 that is truncated to S1.
As this pass is still partially implemented, can we deal with this in a later patch?
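For reference, a hypothetical sketch (not part of this patch) of the bail-out alternative discussed above, reusing the names from tryCombineCopy below: instead of asserting, the combine could simply skip copies whose sgpr S1 source is not a G_TRUNC of an sgpr S32.

// Sketch only: bail out instead of asserting when the pattern does not
// match, e.g. for an IR trunc from i64 to i1.
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
if (!Trunc || MRI.getType(TruncS32Src) != S32)
  return;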


B.setInstr(MI);
// Ensure that truncated bits in BoolSrc are 0.
auto One = B.buildConstant({SgprRB, S32}, 1);
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
eraseInstr(MI, MRI);
}
}

if (DstTy == S32 && TruncSrcTy == S64) {
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
MRI.replaceRegWith(Dst, Unmerge.getReg(0));
cleanUpAfterCombine(MI, Trunc);
return;
}
void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
// %Src:sgpr(S1) = G_TRUNC %TruncSrc
// %Dst = G_ANYEXT %Src:sgpr(S1)
// ->
// %Dst = G_... %TruncSrc
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
if (MRI.getType(Src) != S1)
return;

auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
if (!Trunc)
return;

LLT DstTy = MRI.getType(Dst);
LLT TruncSrcTy = MRI.getType(TruncSrc);

if (DstTy == TruncSrcTy) {
MRI.replaceRegWith(Dst, TruncSrc);
eraseInstr(MI, MRI);
return;
}

if (DstTy == S64 && TruncSrcTy == S32) {
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
{TruncSrc, B.buildUndef({SgprRB, S32})});
cleanUpAfterCombine(MI, Trunc);
return;
}
B.setInstr(MI);

if (DstTy == S32 && TruncSrcTy == S16) {
B.buildAnyExt(Dst, TruncSrc);
cleanUpAfterCombine(MI, Trunc);
return;
}
if (DstTy == S32 && TruncSrcTy == S64) {
auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
MRI.replaceRegWith(Dst, Unmerge.getReg(0));
eraseInstr(MI, MRI);
return;
}

if (DstTy == S16 && TruncSrcTy == S32) {
B.buildTrunc(Dst, TruncSrc);
cleanUpAfterCombine(MI, Trunc);
return;
}
if (DstTy == S64 && TruncSrcTy == S32) {
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
{TruncSrc, B.buildUndef({SgprRB, S32})});
eraseInstr(MI, MRI);
return;
}

llvm_unreachable("missing anyext + trunc combine");
if (DstTy == S32 && TruncSrcTy == S16) {
B.buildAnyExt(Dst, TruncSrc);
eraseInstr(MI, MRI);
return;
}
};

if (DstTy == S16 && TruncSrcTy == S32) {
B.buildTrunc(Dst, TruncSrc);
eraseInstr(MI, MRI);
return;
}

llvm_unreachable("missing anyext + trunc combine");
}

// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {