[AArch64] Enable RT and partial unrolling with reductions for Apple CPUs. #149699

Open · wants to merge 3 commits into main
3 changes: 3 additions & 0 deletions llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -163,6 +163,9 @@ LLVM_ABI bool computeUnrollCount(
TargetTransformInfo::UnrollingPreferences &UP,
TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound);

LLVM_ABI std::optional<RecurrenceDescriptor>
canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
ScalarEvolution *SE);
} // end namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
22 changes: 22 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
@@ -4787,6 +4788,22 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->getExitBlock())
return;

// Check if the loop contains any reductions that could be parallelized when
// unrolling. If so, enable partial unrolling if the trip count is known to
// be a multiple of 2.
bool HasParallelizableReductions =
L->getNumBlocks() == 1 &&
any_of(L->getHeader()->phis(),
[&SE, L](PHINode &Phi) {
return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
}) &&
isLoopSizeWithinBudget(L, TTI, 12, nullptr);
Review comment (Collaborator):
Can we somehow get the unroller itself to compute whether it will actually generate a parallelized reduction, and pass that down to getUnrollingPreferences, instead of trying to recompute it in target-specific code?

if (HasParallelizableReductions &&
SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
UP.Partial = true;
UP.MaxCount = 4;
}

const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
(SE.getSmallConstantMaxTripCount(L) > 0 &&
@@ -4802,6 +4819,11 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Limit to loops with trip counts that are cheap to expand.
UP.SCEVExpansionBudget = 1;

if (HasParallelizableReductions) {
UP.Runtime = true;
UP.DefaultUnrollRuntimeCount = 4;
}

// Try to unroll small, single block loops, if they have load/store
// dependencies, to expose more parallel memory access streams.
BasicBlock *Header = L->getHeader();
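For context, a minimal source-level sketch (not part of the patch) of the kind of loop these preferences target, assuming a small single-block integer add reduction:

// Hypothetical example: if N is known to be a multiple of 2, the
// preferences above permit partial unrolling with MaxCount = 4; otherwise
// runtime unrolling with a default count of 4 applies.
int sum(const int *A, int N) {
  int S = 0;
  for (int I = 0; I != N; ++I)
    S += A[I];
  return S;
}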
130 changes: 130 additions & 0 deletions llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -660,6 +661,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
OrigPHINode.push_back(cast<PHINode>(I));
}

// Collect phi nodes for reductions for which we can introduce multiple
// parallel reduction phis and compute the final reduction result after the
// loop. This requires a single exit block after unrolling, which is ensured
// by restricting to single-block loops whose unrolled iterations are known
// not to exit.
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
bool CanAddAdditionalAccumulators =
!CompletelyUnroll && L->getNumBlocks() == 1 &&
(ULO.Runtime ||
(ExitInfos.contains(Header) && ExitInfos[Header].TripCount != 0 &&
ExitInfos[Header].BreakoutTrip == 0));

// Limit parallelizing reductions to unroll counts of 4 or less for now.
// TODO: The number of parallel reductions should depend on the number of
// execution units. We also don't have to add a parallel reduction phi per
// unrolled iteration, but could, for example, add one parallel phi for
// every 2 unrolled iterations.
if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
for (PHINode &Phi : Header->phis()) {
auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
if (!RdxDesc)
continue;

// Only duplicate phis for a single reduction for now.
// TODO: Handle any number of reductions.
if (!Reductions.empty())
continue;

Reductions[&Phi] = *RdxDesc;
}
}

std::vector<BasicBlock *> Headers;
std::vector<BasicBlock *> Latches;
Headers.push_back(Header);
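At the source level, the transformation the collected reduction phis enable looks roughly like the following sketch (illustrative only; the patch itself works on IR phis, and the names here are invented):

// Unroll by 4 with one accumulator per unrolled iteration. Each extra
// accumulator starts at the reduction's identity value (0 for integer add)
// and the partial results are combined after the loop, breaking the single
// serial dependence chain on S.
int sum_unrolled(const int *A, int N) { // assumes N % 4 == 0 for brevity
  int S0 = 0, S1 = 0, S2 = 0, S3 = 0;
  for (int I = 0; I != N; I += 4) {
    S0 += A[I];
    S1 += A[I + 1];
    S2 += A[I + 2];
    S3 += A[I + 3];
  }
  return ((S1 + S0) + S2) + S3; // mirrors the exit-block combine below
}

The four adds are now independent and can issue in parallel, which is the instruction-level parallelism this change is after.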
@@ -710,6 +743,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// latch. This is a reasonable default placement if we don't have block
// frequencies, and if we do, well the layout will be adjusted later.
auto BlockInsertPt = std::next(LatchBlock->getIterator());
SmallVector<Instruction *> PartialReductions;
for (unsigned It = 1; It != ULO.Count; ++It) {
SmallVector<BasicBlock *, 8> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +767,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (PHINode *OrigPHI : OrigPHINode) {
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);

// Use cloned phis as parallel phis for partial reductions, which will be
// combined into the final reduction result after the loop.
if (Reductions.contains(OrigPHI)) {
// Collect partial reduction results.
if (PartialReductions.empty())
PartialReductions.push_back(cast<Instruction>(InVal));
PartialReductions.push_back(cast<Instruction>(VMap[InVal]));

// Update the start value for the cloned phis to use the identity
// value for the reduction.
const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
NewPHI->setIncomingValueForBlock(
L->getLoopPreheader(),
getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
OrigPHI->getType(),
RdxDesc.getFastMathFlags()));

// Update NewPHI to use the cloned value for the iteration and move
// to header.
NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
NewPHI->moveBefore(OrigPHI->getIterator());
continue;
}

if (Instruction *InValI = dyn_cast<Instruction>(InVal))
if (It > 1 && L->contains(InValI))
InVal = LastValueMap[InValI];
@@ -832,7 +891,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
PN->eraseFromParent();
} else if (ULO.Count > 1) {
if (Reductions.contains(PN))
continue;

Value *InVal = PN->removeIncomingValue(LatchBlock, false);

// If this value was defined in the loop, take the value defined by the
// last iteration of the loop.
if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
@@ -1010,6 +1073,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
}

// If there are partial reductions, create code in the exit block to compute
// the final result and update its users.
if (!PartialReductions.empty()) {
BasicBlock *ExitBlock = L->getExitBlock();
assert(ExitBlock &&
"Can only introduce parallel reduction phis with single exit block");
assert(Reductions.size() == 1 &&
"currently only a single reduction is supported");
Value *FinalRdxValue = PartialReductions.back();
Value *RdxResult = nullptr;
for (PHINode &Phi : ExitBlock->phis()) {
if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
continue;
if (!RdxResult) {
RdxResult = PartialReductions.front();
IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
for (Instruction *RdxPart : drop_begin(PartialReductions)) {
RdxResult = Builder.CreateBinOp(
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
RdxPart, RdxResult, "bin.rdx");
}
NeedToFixLCSSA = true;
for (Instruction *RdxPart : PartialReductions)
RdxPart->dropPoisonGeneratingFlags();
}

Phi.replaceAllUsesWith(RdxResult);
continue;
}
}

if (DTUToUse) {
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
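For Count = 4, the exit-block combine built above forms a chain like the following (a sketch with illustrative names, not the patch's output). Poison-generating flags such as nuw/nsw are dropped on the partial reductions because the combine reassociates what was originally a serial reduction:

// BinRdx accumulates the partial results in the order the loop visits them:
int BinRdx = RdxPart1 + RdxPart0; // "bin.rdx"
BinRdx = RdxPart2 + BinRdx;       // "bin.rdx" chain continues
BinRdx = RdxPart3 + BinRdx;       // final value replaces uses of the exit phi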
@@ -1111,3 +1206,38 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
}
return nullptr;
}

std::optional<RecurrenceDescriptor>
llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
ScalarEvolution *SE) {
RecurrenceDescriptor RedDes;
if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RedDes,
/*DemandedBits=*/nullptr,
/*AC=*/nullptr, /*DT=*/nullptr, SE))
return std::nullopt;
RecurKind RK = RedDes.getRecurrenceKind();
// Skip unsupported reductions.
// TODO: Handle additional reductions, including FP and min-max
// reductions.
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
return std::nullopt;

// Check the latch first; the reduction update must be an instruction in the
// latch that uses the phi directly.
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return std::nullopt;
auto *Update = cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
if (!is_contained(Update->operands(), &Phi))
return std::nullopt;

// Don't parallelize reductions with constant ops; those can be folded to a
// single induction update.
if (any_of(Update->operands(), IsaPred<Constant>))
return std::nullopt;

return RedDes;
}
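A hypothetical example of what the constant-operand check rejects: an update with a constant operand reduces to a closed form, so parallel accumulators would buy nothing:

// Rejected: S += 3 is effectively S = 3 * TripCount and is better folded
// than parallelized.
int countBy3(int N) {
  int S = 0;
  for (int I = 0; I != N; ++I)
    S += 3; // constant operand: canParallelizeReductionWhenUnrolling bails
  return S;
}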