llvm · arcbbb · Jul 6, 2025 · fhahn · Jul 14, 2025 · david-arm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -400,6 +400,10 @@ static cl::opt<bool> EnableEarlyExitVectorization(
     cl::desc(
         "Enable vectorization of early exit loops with uncountable exits."));
 
+static cl::opt<bool> FoldEarlyExitBranchIntoLoop(
+    "fold-early-exit-branch-into-loop", cl::init(false), cl::Hidden,
+    cl::desc("Fold early exit branch into its loop."));
+
 // Likelyhood of bypassing the vectorized loop because there are zero trips left
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -7242,6 +7246,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // Regions are dissolved after optimizing for VF and UF, which completely
   // removes unneeded loop regions first.
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
+
+  if (FoldEarlyExitBranchIntoLoop)
+    VPlanTransforms::foldEarlyExitBranchIntoLoop(BestVPlan);
+
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
                          OrigLoop->getParentLoop(),

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2853,6 +2853,130 @@ void VPlanTransforms::handleUncountableEarlyExit(
   LatchExitingBranch->eraseFromParent();
 }
 
+void VPlanTransforms::foldEarlyExitBranchIntoLoop(VPlan &Plan) {
+  using namespace llvm::VPlanPatternMatch;
+
+  // Dominance is only queried while collecting candidates, before the CFG is
+  // modified, so the tree is never stale when it is consulted.
+  VPDominatorTree VPDT(Plan);
+  auto IsTargetLatchExiting = [&](VPBasicBlock *VPBB) {
+    // Looking for the following pattern:
+    //   IfFalse:
+    //     ...
+    //   VPBB:
+    //     EMIT vp<%4> = ...
+    //     EMIT vp<%6> = or vp<%4>, ...
+    //     EMIT branch-on-cond vp<%6>
+    //   Successor(s): IfTrue, IfFalse
+    //
+    //   IfTrue:
+    //     EMIT branch-on-cond vp<%4>
+    //   Successor(s): vector.early.exit, middle.block
+    //
+    // Checks that:
+    //   1. The terminator of VPBB is a conditional branch on the result of
+    //      an OR.
+    //   2. The terminator of the IfTrue block is also a conditional branch,
+    //      on the first operand of that OR.
+    //   3. The edge to IfFalse is a backedge.
+    if (isa<VPIRBasicBlock>(VPBB))
+      return false;
+
+    // dyn_cast, not cast: reject (rather than assert on) terminators that are
+    // not VPInstructions.
+    auto *CondBranch =
+        dyn_cast_if_present<VPInstruction>(VPBB->getTerminator());
+    VPValue *EarlyExitCond;
+    VPValue *MainExitCond;
+    if (!CondBranch ||
+        !match(CondBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
+                                                     m_VPValue(MainExitCond)))))
+      return false;
+
+    VPBasicBlock *MiddleSplit = VPBB->getSuccessors()[0]->getEntryBasicBlock();
+    auto *CondBranch2 =
+        dyn_cast_if_present<VPInstruction>(MiddleSplit->getTerminator());
+    if (!CondBranch2 ||
+        !match(CondBranch2, m_BranchOnCond(m_Specific(EarlyExitCond))))
+      return false;
+
+    // The false edge is a backedge iff its target dominates VPBB.
+    VPBasicBlock *HeaderBB = VPBB->getSuccessors()[1]->getEntryBasicBlock();
+    return VPDT.dominates(HeaderBB, VPBB);
+  };
+
+  // Promotes the early-exit branch from middle.split to the loop level.
+  //
+  // Transforms the control flow from:
+  //   LatchExiting:
+  //       branch-on-cond (AltExit | MainExit) -> {MiddleSplit, LoopHeader}
+  //   MiddleSplit:
+  //       branch-on-cond (AltExit) -> {EarlyExit, Middle}
+  //
+  // To:
+  //   EarlyExiting:
+  //       branch-on-cond (AltExit) -> {EarlyExit, LatchExiting}
+  //   LatchExiting:
+  //       branch-on-cond (MainExit) -> {MiddleSplit, LoopHeader}
+  //   MiddleSplit:
+  //       direct-jump -> {Middle}
+  auto PromoteEarlyExit = [](VPBasicBlock *LatchExiting) {
+    auto *CondBranch = cast<VPInstruction>(LatchExiting->getTerminator());
+    VPBasicBlock *MiddleSplit =
+        LatchExiting->getSuccessors()[0]->getEntryBasicBlock();
+    VPBasicBlock *EarlyExit =
+        MiddleSplit->getSuccessors()[0]->getEntryBasicBlock();
+
+    // Make the latch branch on the main exit condition only.
+    VPValue *EarlyExitCond;
+    VPValue *MainExitCond;
+    VPValue *CombinedExitCond = CondBranch->getOperand(0);
+    [[maybe_unused]] bool Matched =
+        match(CondBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
+                                                    m_VPValue(MainExitCond))));
+    assert(Matched && "expected a block accepted by IsTargetLatchExiting");
+    CondBranch->setOperand(0, MainExitCond);
+
+    // Remove the successor and branch-on-cond in middle.split.
+    auto *CondBranch2 = cast<VPInstruction>(MiddleSplit->getTerminator());
+    DebugLoc DL = CondBranch2->getDebugLoc();
+    CondBranch2->eraseFromParent();
+    VPBlockUtils::disconnectBlocks(MiddleSplit, EarlyExit);
+    // TODO: Merge middle block into middle.split.
+
+    // Split the block defining the OR; the first half becomes the
+    // early-exiting block. The main exit compare is moved along with the OR
+    // only when it immediately precedes it. Stepping back unconditionally
+    // would be undefined if the OR is the first recipe, and could otherwise
+    // sink the early-exit condition below the branch that uses it.
+    VPRecipeBase *OrRecipe = CombinedExitCond->getDefiningRecipe();
+    VPBasicBlock *EarlyExiting = OrRecipe->getParent();
+    auto SplitPt = OrRecipe->getIterator();
+    if (SplitPt != EarlyExiting->begin() &&
+        &*std::prev(SplitPt) == MainExitCond->getDefiningRecipe())
+      --SplitPt;
+    EarlyExiting->splitAt(SplitPt);
+    auto *BOC =
+        new VPInstruction(VPInstruction::BranchOnCond, {EarlyExitCond}, DL);
+    EarlyExiting->appendRecipe(BOC);
+    VPBlockUtils::connectBlocks(EarlyExiting, EarlyExit);
+    EarlyExiting->swapSuccessors();
+    if (CombinedExitCond->getNumUsers() == 0)
+      OrRecipe->eraseFromParent();
+  };
+
+  // Collect every candidate before rewriting any: the rewrite adds blocks and
+  // edges, disturbing the traversal and leaving the dominator tree stale.
+  SmallVector<VPBasicBlock *> Candidates;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getEntry()))) {
+    if (IsTargetLatchExiting(VPBB))
+      Candidates.push_back(VPBB);
+  }
+  for (VPBasicBlock *VPBB : Candidates)
+    PromoteEarlyExit(VPBB);
+}
+
 /// This function tries convert extended in-loop reductions to
 /// VPExpressionRecipe and clamp the \p Range if it is beneficial and
 /// valid. The created recipe must be decomposed to its constituent

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -196,6 +196,10 @@ struct VPlanTransforms {
                                          VPBasicBlock *LatchVPBB,
                                          VFRange &Range);
 
+  /// Move the early-exit branch out of middle.split and into the loop, so the
+  /// loop itself has multiple exits. Plans without that branch are unchanged.
+  static void foldEarlyExitBranchIntoLoop(VPlan &Plan);
+
   /// Replace loop regions with explicit CFG.
   static void dissolveLoopRegions(VPlan &Plan);
 

diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 -fold-early-exit-branch-into-loop | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+define i64 @same_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY_SPLIT:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[VECTOR_BODY_SPLIT]]
+; CHECK:       vector.body.split:
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_END]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ]
+  ret i64 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.