[SDAG][AMDGPU] Allow opting in to OOB-generating PTRADD transforms #146074

Open: wants to merge 1 commit into base branch users/ritter-x2a/06-17-_amdgpu_sdag_handle_isd_ptradd_in_various_special_cases
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3510,6 +3510,13 @@ class LLVM_ABI TargetLoweringBase {
return false;
}

/// True if the target allows transformations of in-bounds pointer
/// arithmetic that cause out-of-bounds intermediate results.
virtual bool canTransformPtrArithOutOfBounds(const Function &F,
EVT PtrVT) const {
return false;
}

/// Does this target support complex deinterleaving
virtual bool isComplexDeinterleavingSupported() const { return false; }

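As a usage sketch (not part of this PR), a hypothetical target could override the new hook as below; the target name, the attribute string, and the i64 gating are illustrative assumptions, not anything this patch defines:

// Hypothetical override, illustrative only: opt in to OOB-generating
// PTRADD transforms for 64-bit pointers unless the function opts out via
// a made-up "strict-ptr-arith" attribute.
bool MyTargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
                                                       EVT PtrVT) const {
  if (F.hasFnAttribute("strict-ptr-arith")) // hypothetical attribute
    return false;
  return PtrVT == MVT::i64;
}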
125 changes: 74 additions & 51 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2689,59 +2689,82 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
if (PtrVT == IntVT && isNullConstant(N0))
return N1;

if (N0.getOpcode() != ISD::PTRADD ||
reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1))
return SDValue();

SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
SDValue Z = N1;
bool N0OneUse = N0.hasOneUse();
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
// * y is a constant and (ptradd x, y) has one use; or
// * y and z are both constants.
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
// If both additions in the original were NUW, the new ones are as well.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
AddToWorklist(Add.getNode());
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
if (N0.getOpcode() == ISD::PTRADD &&
!reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
SDValue Z = N1;
bool N0OneUse = N0.hasOneUse();
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
// * y is a constant and (ptradd x, y) has one use; or
// * y and z are both constants.
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
// If both additions in the original were NUW, the new ones are as well.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
AddToWorklist(Add.getNode());
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
}
}

// The following combines can turn in-bounds pointer arithmetic out of bounds.
// That is problematic for settings like AArch64's CPA, which checks that
// intermediate results of pointer arithmetic remain in bounds. The target
// therefore needs to opt in to enable them.
if (!TLI.canTransformPtrArithOutOfBounds(
DAG.getMachineFunction().getFunction(), PtrVT))
return SDValue();

if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
// global address GA and constant c, such that c can be folded into GA.
SDValue GAValue = N0.getOperand(0);
if (const GlobalAddressSDNode *GA =
dyn_cast<GlobalAddressSDNode>(GAValue)) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
// If both additions in the original were NUW, reassociation preserves
// that.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}

// TODO: There is another possible fold here that was proven useful.
// It would be this:
//
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
// * (ptradd x, y) has one use; and
// * y is a constant; and
// * z is not a constant.
//
// In some cases, specifically in AArch64's FEAT_CPA, it exposes the
// opportunity to select more complex instructions such as SUBPT and
// MSUBPT. However, a hypothetical corner case has been found that we could
// not avoid. Consider this (pseudo-POSIX C):
//
// char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
// char *p = mmap(LARGE_CONSTANT);
// char *q = foo(p, -LARGE_CONSTANT);
//
// Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
// further + z takes it back to the start of the mapping, so valid,
// regardless of the address mmap gave back. However, if mmap gives you an
// address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
// borrow from the high bits (with the subsequent + z carrying back into
// the high bits to give you a well-defined pointer) and thus trip
// FEAT_CPA's pointer corruption checks.
//
// We leave this fold as an opportunity for future work, addressing the
// corner case for FEAT_CPA, as well as reconciling the solution with the
// more general application of pointer arithmetic in other future targets.
// For now each architecture that wants this fold must implement it in the
// target-specific code (see e.g. SITargetLowering::performPtrAddCombine).
if (N1.getOpcode() == ISD::ADD && N1.hasOneUse()) {
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
// y is not, and (add y, z) is used only once.
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
// z is not, and (add y, z) is used only once.
// The goal is to move constant offsets to the outermost ptradd, to create
// more opportunities to fold offsets into memory instructions.
// Together with the other combine above, this also implements
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
SDValue X = N0;
SDValue Y = N1.getOperand(0);
SDValue Z = N1.getOperand(1);
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// If both additions in the original were NUW, reassociation preserves that.
SDNodeFlags ReassocFlags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;

if (ZIsConstant != YIsConstant) {
if (YIsConstant)
std::swap(Y, Z);
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
}
}

return SDValue();
}
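For intuition, a small sketch (illustrative, not from the patch) of the constant-merging reassociation above: two constant offsets applied to one pointer collapse into a single offset that a memory instruction can absorb as an immediate.

// Illustrative only: y = 16 and z = 32 are both constants, so
// (ptradd (ptradd base, 16), 32) becomes (ptradd base, 48).
char loadField(const char *base) {
  const char *p = base + 16; // (ptradd base, 16)
  return *(p + 32);          // (ptradd p, 32); combined into base + 48,
                             // one immediate offset on the load
}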
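To make the corner case in the TODO comment concrete, here is a self-contained sketch in the spirit of its pseudo-POSIX C; the mapping size, protection flags, LARGE_CONSTANT value, and omitted error handling are illustrative assumptions:

// Illustrative sketch of the FEAT_CPA corner case: every step below is in
// bounds as written, but reassociating (x + LARGE_CONSTANT) + z into
// (x + z) + LARGE_CONSTANT lets the intermediate x + z drop below the
// mapping and borrow from the high pointer bits, tripping FEAT_CPA's
// pointer checks.
#include <cstddef>
#include <sys/mman.h>

constexpr std::ptrdiff_t LARGE_CONSTANT = 1 << 20; // illustrative value

char *foo(char *x, std::ptrdiff_t z) {
  return (x + LARGE_CONSTANT) + z; // one-past-the-end, then back in bounds
}

int main() {
  // Error handling (MAP_FAILED) omitted for brevity.
  char *p = static_cast<char *>(mmap(nullptr, LARGE_CONSTANT,
                                     PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
  char *q = foo(p, -LARGE_CONSTANT); // q == p regardless of where p landed
  return q == p ? 0 : 1;
}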
60 changes: 10 additions & 50 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10602,6 +10602,11 @@ bool SITargetLowering::shouldPreservePtrArith(const Function &F,
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
}

bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
EVT PtrVT) const {
return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
@@ -15131,65 +15136,17 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
return Folded;
}

if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
// global address GA and constant c, such that c can be folded into GA.
SDValue GAValue = N0.getOperand(0);
if (const GlobalAddressSDNode *GA =
dyn_cast<GlobalAddressSDNode>(GAValue)) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
// If both additions in the original were NUW, reassociation preserves
// that.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
DCI.AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}

if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
return SDValue();

// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
// y is not, and (add y, z) is used only once.
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
// z is not, and (add y, z) is used only once.
// The goal is to move constant offsets to the outermost ptradd, to create
// more opportunities to fold offsets into memory instructions.
// Together with the generic combines in DAGCombiner.cpp, this also
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
//
// This transform is here instead of in the general DAGCombiner as it can
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
// AArch64's CPA.
SDValue X = N0;
SDValue Y = N1.getOperand(0);
SDValue Z = N1.getOperand(1);
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// If both additions in the original were NUW, reassociation preserves that.
SDNodeFlags ReassocFlags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;

if (ZIsConstant != YIsConstant) {
if (YIsConstant)
std::swap(Y, Z);
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
DCI.AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
}

// If one of Y and Z is constant, they have been handled above. If both were
// constant, the addition would have been folded in SelectionDAG::getNode
// already. This ensures that the generic DAG combines won't undo the
// following reassociation.
assert(!YIsConstant && !ZIsConstant);

if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
Y->isDivergent() != Z->isDivergent()) {
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
// y are uniform and z isn't.
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
@@ -15200,6 +15157,9 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
// reassociate.
if (Y->isDivergent())
std::swap(Y, Z);
// If both additions in the original were NUW, reassociation preserves that.
SDNodeFlags ReassocFlags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
DCI.AddToWorklist(UniformInner.getNode());
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
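As a rough illustration of why the remaining AMDGPU-specific reassociation keys on divergence, consider the sketch below; it is illustrative only, and laneId stands in for any divergent value:

// Illustrative only: with x and y uniform and z divergent, rewriting the
// address x + (y + z) as (x + y) + z lets x + y be computed once in scalar
// registers, leaving a single per-lane vector add.
extern long laneId(); // hypothetical: stands in for a divergent value

char load(const char *x /*uniform base*/, long y /*uniform offset*/) {
  long z = laneId(); // divergent offset
  return x[y + z];   // x + (y + z); reassociated so x + y stays scalar
}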
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -264,6 +264,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {

bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;

bool canTransformPtrArithOutOfBounds(const Function &F,
EVT PtrVT) const override;

private:
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]