Skip to content

Commit 18f749c

Browse files
committed
[VPlan] Add transform to fold early-exit branches into loops
Implement VPlanTransforms::foldEarlyExitBranchIntoLoop() to promote early-exit branches from middle.split blocks to the loop level, creating multiple-exiting loops. This transformation restructures control flow from a pattern where exit conditions are combined with logical OR and re-evaluated in middle.split, to a structure where early exits are handled directly within the loop. This enables optimization opportunities on CPUs with advanced branch prediction by creating simpler, more predictable branch patterns.
1 parent 433a5a7 commit 18f749c

File tree

4 files changed

+210
-0
lines changed

4 files changed

+210
-0
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,10 @@ static cl::opt<bool> EnableEarlyExitVectorization(
400400
cl::desc(
401401
"Enable vectorization of early exit loops with uncountable exits."));
402402

403+
// Hidden, off-by-default command-line switch. When set,
// LoopVectorizationPlanner::executePlan runs
// VPlanTransforms::foldEarlyExitBranchIntoLoop on the best VPlan right after
// the loop regions have been dissolved.
static cl::opt<bool> FoldEarlyExitBranchIntoLoop(
404+
"fold-early-exit-branch-into-loop", cl::init(false), cl::Hidden,
405+
cl::desc("Fold early exit branch into its loop."));
406+
403407
// Likelihood of bypassing the vectorized loop because there are zero trips left
404408
// after prolog. See `emitIterationCountCheck`.
405409
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -7242,6 +7246,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72427246
// Regions are dissolved after optimizing for VF and UF, which completely
72437247
// removes unneeded loop regions first.
72447248
VPlanTransforms::dissolveLoopRegions(BestVPlan);
7249+
7250+
if (FoldEarlyExitBranchIntoLoop)
7251+
VPlanTransforms::foldEarlyExitBranchIntoLoop(BestVPlan);
7252+
72457253
// Perform the actual loop transformation.
72467254
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
72477255
OrigLoop->getParentLoop(),

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2853,6 +2853,117 @@ void VPlanTransforms::handleUncountableEarlyExit(
28532853
LatchExitingBranch->eraseFromParent();
28542854
}
28552855

2856+
/// Promote the early-exit branch from middle.split to the loop level, turning
/// the vector loop into a loop with two exiting blocks.
///
/// The transform looks for a latch that branches on
/// `or(EarlyExitCond, MainExitCond)` into a block that immediately re-tests
/// `EarlyExitCond`, and rewrites it so that the early-exit test is performed
/// by a dedicated exiting block inside the loop while the latch only tests
/// `MainExitCond`.
///
/// \param Plan the VPlan to transform in place. It is walked with a shallow
///             depth-first traversal starting at Plan.getEntry().
void VPlanTransforms::foldEarlyExitBranchIntoLoop(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;

  // Built once, before any rewriting; it is only used to recognise backedges.
  VPDominatorTree VPDT(Plan);

  auto IsTargetLatchExiting = [&](VPBasicBlock *VPBB) {
    // Looking for the following pattern:
    //   IfFalse:
    //     ...
    //   VPBB:
    //     EMIT vp<%4> = ...
    //     EMIT vp<%6> = or vp<%4>, ...
    //     EMIT branch-on-cond vp<%6>
    //   Successor(s): IfTrue, IfFalse
    //
    //   IfTrue:
    //     EMIT branch-on-cond vp<%4>
    //   Successor(s): vector.early.exit, middle.block
    //
    // Checks that:
    //   1. The terminator of VPBB is a conditional branch on a logical OR
    //      result.
    //   2. The terminator of IfTrue block is also a conditional branch
    //      using the same (first) operand from the logical OR.
    //   3. The edge to IfFalse is a backedge.
    if (isa<VPIRBasicBlock>(VPBB))
      return false;

    // dyn_cast (not cast): a terminator that is not a VPInstruction must make
    // the pattern fail instead of tripping the cast assertion.
    auto *LatchBranch =
        dyn_cast_if_present<VPInstruction>(VPBB->getTerminator());
    VPValue *EarlyExitCond = nullptr;
    VPValue *MainExitCond = nullptr;
    if (!LatchBranch ||
        !match(LatchBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
                                                      m_VPValue(MainExitCond)))))
      return false;

    // Successor 0 is the taken edge of the combined exit condition.
    VPBasicBlock *MiddleSplit = VPBB->getSuccessors()[0]->getEntryBasicBlock();
    auto *MiddleSplitBranch =
        dyn_cast_if_present<VPInstruction>(MiddleSplit->getTerminator());
    if (!MiddleSplitBranch ||
        !match(MiddleSplitBranch, m_BranchOnCond(m_Specific(EarlyExitCond))))
      return false;

    // Successor 1 must be a backedge, i.e. its target dominates VPBB.
    VPBasicBlock *HeaderBB = VPBB->getSuccessors()[1]->getEntryBasicBlock();
    return VPDT.dominates(HeaderBB, VPBB);
  };

  /// Promotes early-exit branch from middle.split to the loop level.
  ///
  /// Transforms the control flow from:
  ///   LatchExiting:
  ///     branch-on-cond (AltExit | MainExit) -> {MiddleSplit, LoopHeader}
  ///   MiddleSplit:
  ///     branch-on-cond (AltExit) -> {EarlyExit, Middle}
  ///
  /// To:
  ///   EarlyExiting:
  ///     branch-on-cond (AltExit) -> {EarlyExit, LatchExiting}
  ///   LatchExiting:
  ///     branch-on-cond (MainExit) -> {MiddleSplit, LoopHeader}
  ///   MiddleSplit:
  ///     direct-jump -> {Middle}
  auto PromoteEarlyExit = [](VPBasicBlock *LatchExiting) {
    auto *LatchBranch = cast<VPInstruction>(LatchExiting->getTerminator());
    VPBasicBlock *MiddleSplit =
        LatchExiting->getSuccessors()[0]->getEntryBasicBlock();
    VPBasicBlock *EarlyExit =
        MiddleSplit->getSuccessors()[0]->getEntryBasicBlock();

    // Update the exit condition of LatchExiting.
    VPValue *EarlyExitCond = nullptr;
    VPValue *MainExitCond = nullptr;
    VPValue *CombinedExitCond = LatchBranch->getOperand(0);
    [[maybe_unused]] bool Matched =
        match(LatchBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
                                                     m_VPValue(MainExitCond))));
    assert(Matched && "caller must have verified the latch-exiting pattern");
    LatchBranch->setOperand(0, MainExitCond);

    // Remove the successor and branch-on-cond in middle.split.
    auto *MiddleSplitBranch =
        cast<VPInstruction>(MiddleSplit->getTerminator());
    DebugLoc DL = MiddleSplitBranch->getDebugLoc();
    MiddleSplitBranch->eraseFromParent();
    VPBlockUtils::disconnectBlocks(MiddleSplit, EarlyExit);
    // TODO: Merge middle block into middle.split.

    // Create an early-exiting block and branch-on-cond.
    // NOTE(review): this splits the block that defines the OR, which is
    // presumably LatchExiting itself - confirm it cannot live elsewhere.
    VPRecipeBase *CombinedDef = CombinedExitCond->getDefiningRecipe();
    VPBasicBlock *EarlyExiting = CombinedDef->getParent();

    // Split one recipe before the OR so that recipe (the latch compare in the
    // expected pattern) ends up in the new latch block. Fall back to splitting
    // at the OR itself when there is no preceding recipe, or when the preceding
    // recipe defines the early-exit condition: that definition has to stay
    // ahead of the branch appended to EarlyExiting below.
    auto SplitPt = CombinedDef->getIterator();
    if (SplitPt != EarlyExiting->begin()) {
      auto PrevIt = std::prev(SplitPt);
      if (&*PrevIt != EarlyExitCond->getDefiningRecipe())
        SplitPt = PrevIt;
    }
    EarlyExiting->splitAt(SplitPt);

    auto *BOC =
        new VPInstruction(VPInstruction::BranchOnCond, {EarlyExitCond}, DL);
    EarlyExiting->appendRecipe(BOC);
    VPBlockUtils::connectBlocks(EarlyExiting, EarlyExit);
    // connectBlocks appended EarlyExit as the second successor; the taken
    // edge of the new branch must come first.
    EarlyExiting->swapSuccessors();

    // The OR is dead once the latch no longer branches on it.
    if (CombinedExitCond->getNumUsers() == 0)
      CombinedDef->eraseFromParent();
  };

  // Snapshot the blocks first: PromoteEarlyExit splits blocks and rewires
  // successor lists, which must not happen underneath a live CFG traversal.
  SmallVector<VPBasicBlock *> Blocks;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getEntry())))
    Blocks.push_back(VPBB);

  for (VPBasicBlock *VPBB : Blocks) {
    if (IsTargetLatchExiting(VPBB))
      PromoteEarlyExit(VPBB);
  }
}
2966+
28562967
/// This function tries to convert extended in-loop reductions to
28572968
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
28582969
/// valid. The created recipe must be decomposed to its constituent

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ struct VPlanTransforms {
196196
VPBasicBlock *LatchVPBB,
197197
VFRange &Range);
198198

199+
/// Promote the early-exit branch in the middle.split to the loop level,
200+
/// making the loop multiple exiting.
201+
static void foldEarlyExitBranchIntoLoop(VPlan &Plan);
202+
199203
/// Replace loop regions with explicit CFG.
200204
static void dissolveLoopRegions(VPlan &Plan);
201205

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 -fold-early-exit-branch-into-loop | FileCheck %s
3+
4+
declare void @init_mem(ptr, i64);
5+
6+
define i64 @same_exit_block_phi_of_consts() {
7+
; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() {
8+
; CHECK-NEXT: entry:
9+
; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
10+
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
11+
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
12+
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
13+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
14+
; CHECK: vector.ph:
15+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
16+
; CHECK: vector.body:
17+
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY_SPLIT:%.*]] ]
18+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
19+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
20+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
21+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
22+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
23+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
24+
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
25+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
26+
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
27+
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
28+
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[VECTOR_BODY_SPLIT]]
29+
; CHECK: vector.body.split:
30+
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
31+
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
32+
; CHECK: middle.split:
33+
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
34+
; CHECK: middle.block:
35+
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
36+
; CHECK: vector.early.exit:
37+
; CHECK-NEXT: br label [[LOOP_END]]
38+
; CHECK: scalar.ph:
39+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
40+
; CHECK-NEXT: br label [[LOOP:%.*]]
41+
; CHECK: loop:
42+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
43+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
44+
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
45+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
46+
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
47+
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
48+
; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
49+
; CHECK: loop.inc:
50+
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
51+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
52+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
53+
; CHECK: loop.end:
54+
; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
55+
; CHECK-NEXT: ret i64 [[RETVAL]]
56+
;
57+
entry:
58+
%p1 = alloca [1024 x i8]
59+
%p2 = alloca [1024 x i8]
60+
call void @init_mem(ptr %p1, i64 1024)
61+
call void @init_mem(ptr %p2, i64 1024)
62+
br label %loop
63+
64+
loop:
65+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
66+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
67+
%ld1 = load i8, ptr %arrayidx, align 1
68+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
69+
%ld2 = load i8, ptr %arrayidx1, align 1
70+
%cmp3 = icmp eq i8 %ld1, %ld2
71+
br i1 %cmp3, label %loop.inc, label %loop.end
72+
73+
loop.inc:
74+
%index.next = add i64 %index, 1
75+
%exitcond = icmp ne i64 %index.next, 67
76+
br i1 %exitcond, label %loop, label %loop.end
77+
78+
loop.end:
79+
%retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ]
80+
ret i64 %retval
81+
}
82+
;.
83+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
84+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
85+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
86+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
87+
;.

0 commit comments

Comments
 (0)