Skip to content

release/21.x: [LV] Vectorize maxnum/minnum w/o fast-math flags. (#148239) #149736

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/include/llvm/Analysis/IVDescriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ enum class RecurKind {
FMul, ///< Product of floats.
FMin, ///< FP min implemented in terms of select(cmp()).
FMax, ///< FP max implemented in terms of select(cmp()).
FMinNum, ///< FP min with llvm.minnum semantics including NaNs.
FMaxNum, ///< FP max with llvm.maxnum semantics including NaNs.
FMinimum, ///< FP min with llvm.minimum semantics
FMaximum, ///< FP max with llvm.maximum semantics
FMinimumNum, ///< FP min with llvm.minimumnum semantics
Expand Down Expand Up @@ -250,6 +252,7 @@ class RecurrenceDescriptor {
/// Returns true if \p Kind is one of the floating-point min/max recurrence
/// kinds: FMin/FMax, FMinNum/FMaxNum, FMinimum/FMaximum or
/// FMinimumNum/FMaximumNum.
static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
  switch (Kind) {
  case RecurKind::FMin:
  case RecurKind::FMax:
  case RecurKind::FMinNum:
  case RecurKind::FMaxNum:
  case RecurKind::FMinimum:
  case RecurKind::FMaximum:
  case RecurKind::FMinimumNum:
  case RecurKind::FMaximumNum:
    return true;
  default:
    // Every other recurrence kind is not an FP min/max.
    return false;
  }
}
Expand Down
26 changes: 23 additions & 3 deletions llvm/lib/Analysis/IVDescriptors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
};
if (isIntMinMaxRecurrenceKind(Kind) ||
(HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
if (isIntMinMaxRecurrenceKind(Kind))
return isMinMaxPattern(I, Kind, Prev);
else if (isFMulAddIntrinsic(I))
if (isFPMinMaxRecurrenceKind(Kind)) {
InstDesc Res = isMinMaxPattern(I, Kind, Prev);
if (!Res.isRecurrence())
return InstDesc(false, I);
if (HasRequiredFMF())
return Res;
// We may be able to vectorize FMax/FMin reductions using maxnum/minnum
// intrinsics with extra checks ensuring the vector loop handles only
// non-NaN inputs.
if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
assert(Kind == RecurKind::FMax &&
"unexpected recurrence kind for maxnum");
return InstDesc(I, RecurKind::FMaxNum);
}
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
assert(Kind == RecurKind::FMin &&
"unexpected recurrence kind for minnum");
return InstDesc(I, RecurKind::FMinNum);
}
return InstDesc(false, I);
}
if (isFMulAddIntrinsic(I))
return InstDesc(Kind == RecurKind::FMulAdd, I,
I->hasAllowReassoc() ? nullptr : I);
return InstDesc(false, I);
Expand Down
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
case RecurKind::UMin:
return Intrinsic::vector_reduce_umin;
case RecurKind::FMax:
case RecurKind::FMaxNum:
return Intrinsic::vector_reduce_fmax;
case RecurKind::FMin:
case RecurKind::FMinNum:
return Intrinsic::vector_reduce_fmin;
case RecurKind::FMaximum:
return Intrinsic::vector_reduce_fmaximum;
Expand Down Expand Up @@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
case RecurKind::SMax:
return Intrinsic::smax;
case RecurKind::FMin:
case RecurKind::FMinNum:
return Intrinsic::minnum;
case RecurKind::FMax:
case RecurKind::FMaxNum:
return Intrinsic::maxnum;
case RecurKind::FMinimum:
return Intrinsic::minimum;
Expand Down Expand Up @@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
Value *Right) {
Type *Ty = Left->getType();
if (Ty->isIntOrIntVectorTy() ||
(RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
(RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum ||
RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
// TODO: Add float minnum/maxnum support when FMF nnan is set.
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
"rdx.minmax");
Expand Down Expand Up @@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMinNum:
case RecurKind::FMaxNum:
case RecurKind::FMinimum:
case RecurKind::FMaximum:
case RecurKind::FMinimumNum:
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,6 @@ class VPBuilder {

/// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
/// and \p B.
/// TODO: add createFCmp when needed.
VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
Expand All @@ -240,6 +239,17 @@ class VPBuilder {
new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
}

/// Build an FCmp VPInstruction that compares \p A with \p B using the
/// floating-point predicate \p Pred, and hand it to tryInsertInstruction.
/// \p DL and \p Name are forwarded to the new VPInstruction unchanged.
VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
                          DebugLoc DL = DebugLoc::getUnknown(),
                          const Twine &Name = "") {
  // Only predicates inside the FCmp range are meaningful here.
  assert(!(Pred < CmpInst::FIRST_FCMP_PREDICATE ||
           Pred > CmpInst::LAST_FCMP_PREDICATE) &&
         "invalid predicate");
  auto *FCmp = new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name);
  return tryInsertInstruction(FCmp);
}

VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4345,10 +4345,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
if (any_of(OrigLoop->getHeader()->phis(),
[&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
// Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
// reductions need special handling and are currently unsupported.
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
if (!Legal->isReductionVariable(&Phi))
return Legal->isFixedOrderRecurrence(&Phi);
RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
}))
return false;

// Phis with uses outside of the loop require special handling and are
Expand Down Expand Up @@ -8817,6 +8821,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);

// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
*Plan))
return nullptr;

// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23196,6 +23196,8 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down Expand Up @@ -23333,6 +23335,8 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down Expand Up @@ -23435,6 +23439,8 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return ResTy;
}
case Instruction::ICmp:
case Instruction::FCmp:
case VPInstruction::ActiveLaneMask:
assert(inferScalarType(R->getOperand(0)) ==
inferScalarType(R->getOperand(1)) &&
Expand Down
160 changes: 160 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -628,3 +628,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
}

/// Prepare \p Plan for vectorizing a single FMaxNum/FMinNum reduction without
/// fast-math flags. The vector loop additionally exits as soon as the value
/// fed into the reduction contains a NaN; the reduction result and the resume
/// values in the scalar preheader are then taken from the start of that last
/// vector iteration, and the middle block's branch condition is ANDed with
/// "no NaN seen".
///
/// \returns true if \p Plan contains no FMaxNum/FMinNum reduction (nothing to
/// do) or if the transform was applied. Returns false if the plan cannot be
/// handled: more than one reduction phi, any other unsupported header phi, no
/// scalar tail, an unexpected min/max recipe, a recipe in the loop that may
/// write to memory, or a resume phi that cannot be updated. On a false return
/// \p Plan may already be partially modified and should be abandoned.
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
  // Returns the operand of the min/max intrinsic feeding RedPhiR's backedge
  // that is not the reduction phi itself, or nullptr if the backedge value is
  // not produced by a widened or replicated intrinsic call.
  auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
    auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
        RedPhiR->getBackedgeValue()->getDefiningRecipe());
    if (!MinMaxR)
      return nullptr;

    auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
    if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
        !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
      return nullptr;

#ifndef NDEBUG
    Intrinsic::ID RdxIntrinsicId =
        RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
                                                           : Intrinsic::minnum;
    // The whole disjunction is parenthesized so the message applies to both
    // alternatives (and to avoid a -Wparentheses warning).
    assert(((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
             cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
                 RdxIntrinsicId) ||
            (RepR && cast<IntrinsicInst>(RepR->getUnderlyingInstr())
                             ->getIntrinsicID() == RdxIntrinsicId)) &&
           "Intrinsic did not match recurrence kind");
#endif

    if (MinMaxR->getOperand(0) == RedPhiR)
      return MinMaxR->getOperand(1);

    assert(MinMaxR->getOperand(1) == RedPhiR &&
           "Reduction phi operand expected");
    return MinMaxR->getOperand(0);
  };

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPReductionPHIRecipe *RedPhiR = nullptr;
  bool HasUnsupportedPhi = false;
  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
      continue;
    auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!Cur) {
      // TODO: Also support fixed-order recurrence phis.
      HasUnsupportedPhi = true;
      continue;
    }
    // For now, only a single reduction is supported.
    // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
    if (RedPhiR)
      return false;
    if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
        Cur->getRecurrenceKind() != RecurKind::FMinNum) {
      HasUnsupportedPhi = true;
      continue;
    }
    RedPhiR = Cur;
  }

  // No FMaxNum/FMinNum reduction: nothing to transform.
  if (!RedPhiR)
    return true;

  // We won't be able to resume execution in the scalar tail, if there are
  // unsupported header phis or there is no scalar tail at all, due to
  // tail-folding.
  if (HasUnsupportedPhi || !Plan.hasScalarTail())
    return false;

  VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
  if (!MinMaxOp)
    return false;

  RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
  assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
         "unsupported reduction");

  // Check if the vector loop of Plan can early exit and restart execution of
  // the last vector iteration in the scalar loop. This requires all recipes up
  // to the early exit point to be side-effect free as they are re-executed.
  // Currently we check that the loop is free of any recipe that may write to
  // memory. Expected to operate on an early VPlan w/o nested regions.
  for (VPBlockBase *VPB :
       vp_depth_first_shallow(LoopRegion->getEntryBasicBlock())) {
    auto *VPBB = cast<VPBasicBlock>(VPB);
    for (auto &R : *VPBB) {
      if (R.mayWriteToMemory() &&
          !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
        return false;
    }
  }

  // Replace the latch's BranchOnCount with a BranchOnCond that is taken when
  // either the original exit condition holds or any lane of the reduction
  // input is NaN.
  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
  VPBuilder Builder(LatchVPBB->getTerminator());
  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
                         LatchExitingBranch->getOperand(1));

  // fcmp uno X, X is true exactly for the lanes of X that are NaN.
  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
  auto *AnyExitTaken =
      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
  LatchExitingBranch->eraseFromParent();

  // If we exit early due to NaNs, compute the final reduction result based on
  // the reduction phi at the beginning of the last vector iteration.
  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
          return VPI;
        return nullptr;
      });
  // find_singleton yields nullptr unless there is exactly one match; the code
  // below dereferences the result unconditionally.
  assert(RdxResult &&
         "expected a unique ComputeReductionResult user of the reduction phi");

  auto *MiddleVPBB = Plan.getMiddleBlock();
  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
  auto *RdxSel =
      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
  RdxResult->setOperand(1, RdxSel);

  auto *ScalarPH = Plan.getScalarPreheader();
  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
  // true, then resume from the start of the last vector iteration via the
  // canonical IV, otherwise from the original value.
  for (auto &R : ScalarPH->phis()) {
    auto *ResumeR = cast<VPPhi>(&R);
    VPValue *VecV = ResumeR->getOperand(0);
    // The reduction's own resume value was already redirected via RdxSel.
    if (VecV == RdxResult)
      continue;
    if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
      if (DerivedIV->getNumUsers() == 1 &&
          DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
        auto *DerivedSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
                                                &Plan.getVectorTripCount());
        DerivedIV->moveAfter(&*Builder.getInsertPoint());
        DerivedIV->setOperand(1, DerivedSel);
        continue;
      }
    }
    // Bail out and abandon the current, partially modified, VPlan if we
    // encounter resume phi that cannot be updated yet.
    if (VecV != &Plan.getVectorTripCount()) {
      LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
                           "FMaxNum/FMinNum reduction.\n");
      return false;
    }
    auto *ResumeSel =
        Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
    ResumeR->setOperand(0, ResumeSel);
  }

  // Fold "no NaN was seen" into the middle block's branch condition.
  // NOTE(review): presumably this forces the scalar loop to run whenever the
  // vector loop exited early because of a NaN - confirm against the meaning
  // of the middle terminator's first operand.
  auto *MiddleTerm = MiddleVPBB->getTerminator();
  Builder.setInsertPoint(MiddleTerm);
  VPValue *MiddleCond = MiddleTerm->getOperand(0);
  VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
  MiddleTerm->setOperand(0, NewCond);
  return true;
}
Loading
Loading