-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[VPlan] Add transform to fold early-exit branches into loops #148404
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
arcbbb
wants to merge
1
commit into
llvm:main
Choose a base branch
from
arcbbb:dispatch-exit-in-loop
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+210
−0
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 | ||
; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 -fold-early-exit-branch-into-loop | FileCheck %s | ||
|
||
declare void @init_mem(ptr, i64); | ||
|
||
define i64 @same_exit_block_phi_of_consts() { | ||
; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 | ||
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 | ||
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) | ||
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) | ||
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] | ||
; CHECK: vector.ph: | ||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | ||
; CHECK: vector.body: | ||
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY_SPLIT:%.*]] ] | ||
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] | ||
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] | ||
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 | ||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 | ||
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] | ||
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 | ||
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 | ||
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] | ||
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 | ||
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) | ||
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[VECTOR_BODY_SPLIT]] | ||
; CHECK: vector.body.split: | ||
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 | ||
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] | ||
; CHECK: middle.split: | ||
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] | ||
; CHECK: middle.block: | ||
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] | ||
; CHECK: vector.early.exit: | ||
; CHECK-NEXT: br label [[LOOP_END]] | ||
; CHECK: scalar.ph: | ||
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] | ||
; CHECK-NEXT: br label [[LOOP:%.*]] | ||
; CHECK: loop: | ||
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] | ||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] | ||
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 | ||
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] | ||
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 | ||
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] | ||
; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] | ||
; CHECK: loop.inc: | ||
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 | ||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 | ||
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] | ||
; CHECK: loop.end: | ||
; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ] | ||
; CHECK-NEXT: ret i64 [[RETVAL]] | ||
; | ||
entry: | ||
%p1 = alloca [1024 x i8] | ||
%p2 = alloca [1024 x i8] | ||
call void @init_mem(ptr %p1, i64 1024) | ||
call void @init_mem(ptr %p2, i64 1024) | ||
br label %loop | ||
|
||
loop: | ||
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] | ||
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index | ||
%ld1 = load i8, ptr %arrayidx, align 1 | ||
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index | ||
%ld2 = load i8, ptr %arrayidx1, align 1 | ||
%cmp3 = icmp eq i8 %ld1, %ld2 | ||
br i1 %cmp3, label %loop.inc, label %loop.end | ||
|
||
loop.inc: | ||
%index.next = add i64 %index, 1 | ||
%exitcond = icmp ne i64 %index.next, 67 | ||
br i1 %exitcond, label %loop, label %loop.end | ||
|
||
loop.end: | ||
%retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] | ||
ret i64 %retval | ||
} | ||
;. | ||
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} | ||
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} | ||
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} | ||
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} | ||
;. |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have any performance data you could share? I think it would be preferable to not require an option for that, we should do it if profitable.
From the description
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I'd like to assess the performance impact of such a change as well before making this the default behaviour as we've previously optimised codegen for the approach favoured by @fhahn. However, in my initial implementation of the early exit work the CFG looked similar to how it does in this PR. Isn't the problem here also about choosing the best canonical form that allows following IR passes to optimise the best they can? The advantage of this new approach is presumably that you can add individual branch weights to match the scalar PGO data.
cc @huntergr-arm since he is intending to add support for having stores in early exit loops, which also requires a different approach.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arcbbb If you look at the actual code generated for AArch64 CPUs when targeting SVE you'll see that after lowering it does end up with this CFG. For example, if you take one of the existing tests llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll and run the command:
you'll see the assembly looks like this:
so it does end up in the form that you prefer. Have you tried achieving the same result when lowering?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, the question is whether this needs to be done in LV or may be better doing later (perhaps by the backend) where we have better target-specific information to decide what form is most profitable for the target.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(we can do it now that we dissolve the hierarchical CFG before execute where we aren't tied to single-entry-single exit regions, but the backend/other CFG simplification passes may still be a better place to decide what form is best)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would only work nicely for state-changing early exit loops (such as those with stores) if all such operations take place before the exits; we need to make sure stores from active lanes execute when the early exit is before the store in the original loop. This is much easier to do if there's a single unified exit. Same goes for any reductions after the exit I think.
I see you're splitting just before the combined condition (the or), and that's normally after all the other recipes, but I think you'd need a check for that before performing that transform.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't have performance data available at the moment. This approach is the default in downstream, and there isn't a switch for me to test the alternative implementation.
You make a good point about advanced branch predictors. I believe that keeping branches simple and isolated likely prevents interference between unrelated conditions (e.g. predictable condition and unpredictable condition).
The question then becomes whether this should be the canonical form for early exit loop in the vectorizer.
I believe it should be, as it separates distinct conditions (countable and uncountable condition).
And let later transformations with more target-specific information decide whether to merge them based on the microarchitecture.
However, it's fine without this form, since David demonstrated that later transformations are able to handle this pattern.
I didn't catch that before. Thanks for pointing it out. It's reassuring that SelectionDAG already handles this without this patch.
Thanks for the reminding! I'll move the splitting point to be positioned after the last instruction with side-effect. That should be safe.