[VPlan] Run narrowInterleaveGroups during general VPlan optimizations. #149706
base: main
Conversation
Move narrowInterleaveGroups to the general VPlan optimization stage. To do so, narrowInterleaveGroups now has to find a suitable VF where all interleave groups are consecutive and saturate the full vector width. If such a VF is found, the original VPlan is split into 2: a) a new clone which contains all VFs of the Plan except VFToOptimize, and b) the original Plan with VFToOptimize as its single VF. The original Plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans. Together with llvm#149702, this allows taking the narrowed interleave groups into account when interleaving.
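For reference, the caller-side integration boils down to: narrowInterleaveGroups may hand back a clone that carries the remaining VFs, and that clone becomes an additional candidate plan. A condensed sketch using only names that appear in the LoopVectorize.cpp hunk below (`Plan`, `SubRange`, `TTI`, `VPlans`):

```c++
// Condensed from the LoopVectorize.cpp hunk in this PR: if narrowing split the
// plan, the returned clone (holding the non-optimized VFs) is kept as an extra
// candidate; the narrowed original plan is added as before.
if (auto P = VPlanTransforms::narrowInterleaveGroups(
        *Plan,
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector),
        SubRange))
  VPlans.push_back(std::move(P));
VPlans.push_back(std::move(Plan));
```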
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes: Move narrowInterleaveGroups to the general VPlan optimization stage. To do so, narrowInterleaveGroups now has to find a suitable VF where all interleave groups are consecutive and saturate the full vector width. If such a VF is found, the original VPlan is split into 2: a) a new clone which contains all VFs of the Plan except VFToOptimize, and b) the original Plan with VFToOptimize as its single VF. The original Plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans. Together with #149702, this allows taking the narrowed interleave groups into account when interleaving. Patch is 30.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149706.diff — 6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6e420632d83e5..7bde63f8c8c06 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7253,9 +7253,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan,
@@ -8364,6 +8361,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
!VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
*Plan, CM.getMaxSafeElements()))
break;
+
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(
+ *Plan,
+ TTI.getRegisterBitWidth(
+ TargetTransformInfo::RGK_FixedWidthVector),
+ SubRange))
+ VPlans.push_back(std::move(P));
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 40a55656bfa7e..e7919495cb9a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -976,6 +976,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
} else {
VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
}
+
+ this->UF.setUnderlyingValue(ConstantInt::get(TCTy, UF));
}
VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
@@ -1252,6 +1254,7 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
+ Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 204268e586b43..60c5397738269 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3895,6 +3895,9 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;
+ /// Represents the symbolic unroll factor of the loop.
+ VPValue UF;
+
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;
@@ -4050,6 +4053,9 @@ class VPlan {
/// Returns the VF of the vector loop region.
VPValue &getVF() { return VF; };
+ /// Returns the symbolic UF of the vector loop region.
+ VPValue &getSymbolicUF() { return UF; };
+
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2a920832f272f..f5e270aaa4bc7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3146,19 +3146,20 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
+ VFRange &Range) {
using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
- if (VF.isScalable() || !VectorLoop)
- return;
+ if (Plan.hasScalableVF() || !VectorLoop)
+ return nullptr;
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
Type *CanonicalIVType = CanonicalIV->getScalarType();
VPTypeAnalysis TypeInfo(CanonicalIVType);
- unsigned FixedVF = VF.getFixedValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
+ std::optional<unsigned> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
@@ -3173,11 +3174,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return;
+ return nullptr;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return;
+ return nullptr;
// Do not narrow interleave groups if there are VectorPointer recipes and
// the plan was unrolled. The recipe implicitly uses VF from
@@ -3185,18 +3186,35 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// TODO: Remove restriction once the VF for the VectorPointer offset is
// modeled explicitly as operand.
if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
- return;
+ return nullptr;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
- VectorRegWidth))
- return;
-
+ // Try to find a single VF, where all interleave groups are consecutive and
+ // saturate the full vector width. If we already have a candidate VF, check
+ // if it is applicable for the current InterleaveR, otherwise look for a
+    // suitable VF across the Plan's VFs.
+ //
+ if (VFToOptimize) {
+ if (!isConsecutiveInterleaveGroup(InterleaveR, *VFToOptimize, TypeInfo,
+ VectorRegWidth))
+ return nullptr;
+ } else {
+ for (ElementCount VF : Plan.vectorFactors()) {
+ if (!VF.isFixed())
+ continue;
+ if (isConsecutiveInterleaveGroup(InterleaveR, VF.getFixedValue(),
+ TypeInfo, VectorRegWidth)) {
+ VFToOptimize = VF.getFixedValue();
+ break;
+ }
+ }
+ if (!VFToOptimize)
+ return nullptr;
+ }
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -3232,24 +3250,44 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
- return;
+ return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return;
+ return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return;
+ return nullptr;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return;
+ return nullptr;
+
+ // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+ // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+ // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
+ std::unique_ptr<VPlan> NewPlan;
+ if (size(Plan.vectorFactors()) != 1) {
+ NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+ Plan.setVF(ElementCount::getFixed(*VFToOptimize));
+ bool First = true;
+ for (ElementCount VF : NewPlan->vectorFactors()) {
+ if (VF.isFixed() && VF.getFixedValue() == *VFToOptimize)
+ continue;
+ if (First) {
+ NewPlan->setVF(VF);
+ First = false;
+ continue;
+ }
+ NewPlan->addVF(VF);
+ }
+ }
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
auto NarrowOp = [](VPValue *V) -> VPValue * {
@@ -3314,11 +3352,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
- Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
- CanIV->getScalarType(), 1 * Plan.getUF())));
+ Inc->setOperand(1, &Plan.getSymbolicUF());
Plan.getVF().replaceAllUsesWith(
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
removeDeadRecipes(Plan);
+ return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 04cb7a7a5c19b..2da2fb00ab433 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,14 +234,19 @@ struct VPlanTransforms {
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
static void materializeBroadcasts(VPlan &Plan);
- /// Try to convert a plan with interleave groups with VF elements to a plan
- /// with the interleave groups replaced by wide loads and stores processing VF
- /// elements, if all transformed interleave groups access the full vector
- /// width (checked via \o VectorRegWidth). This effectively is a very simple
- /// form of loop-aware SLP, where we use interleave groups to identify
- /// candidates.
- static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth);
+ /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with VF elements) can be replaced by wide loads and stores
+ /// processing VF elements, if all transformed interleave groups access the
+ /// full vector width (checked via \o VectorRegWidth). If the transformation
+  /// can be applied, the original \p Plan will be split in 2, if it has
+ /// multiple VFs: a) a new clone which contains all VFs of Plan, except
+ /// VFToOptimize, and b) the original Plan with VFToOptimize as single VF. In
+ /// that case, the new clone is returned.
+ ///
+ /// This effectively is a very simple form of loop-aware SLP, where we use
+ /// interleave groups to identify candidates.
+ static std::unique_ptr<VPlan>
+ narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth, VFRange &Range);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index cb7f0bfc64be1..ed23f5d5e6cbb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -8,15 +8,70 @@ target triple = "x86_64-unknown-linux"
define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
; CHECK-LABEL: define void @test_4xi64(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16
+; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]]
+; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8
+; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
@@ -27,15 +82,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
@@ -56,7 +111,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -118,7 +173,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_...
[truncated]
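The core of the new plan-splitting step can be pictured with plain containers. This is only an illustration of the VF redistribution (all names below are illustration-only; the real code duplicates the VPlan and uses Plan.setVF / NewPlan->addVF, as in the VPlanTransforms.cpp hunk above):

```c++
#include <cstdio>
#include <vector>

// Illustration: keep the chosen VF in the original plan and move every other
// VF into the cloned plan, mirroring the setVF/addVF loop in
// narrowInterleaveGroups.
int main() {
  std::vector<unsigned> PlanVFs = {2, 4, 8, 16}; // hypothetical fixed VFs
  unsigned VFToOptimize = 4;                     // VF found suitable for narrowing

  std::vector<unsigned> OriginalPlanVFs = {VFToOptimize}; // Plan.setVF(VFToOptimize)
  std::vector<unsigned> ClonedPlanVFs;                    // NewPlan gets the rest
  for (unsigned VF : PlanVFs)
    if (VF != VFToOptimize)
      ClonedPlanVFs.push_back(VF); // NewPlan->setVF / NewPlan->addVF

  std::printf("original plan VF: %u, cloned plan VFs:", OriginalPlanVFs[0]);
  for (unsigned VF : ClonedPlanVFs)
    std::printf(" %u", VF);
  std::printf("\n");
  return 0;
}
```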
@llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) Changes: (same summary and patch as the comment above)
Review comment on the new getSymbolicUF() accessor in llvm/lib/Transforms/Vectorize/VPlan.h:

@@ -4050,6 +4053,9 @@ class VPlan {
   /// Returns the VF of the vector loop region.
   VPValue &getVF() { return VF; };
+
+  /// Returns the symbolic UF of the vector loop region.
+  VPValue &getSymbolicUF() { return UF; };

const
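The one-word comment presumably asks for a const-qualified accessor. A minimal sketch of what that could look like inside class VPlan (an assumption, not part of the PR; the final signature is up to the author):

```c++
/// Returns the symbolic UF of the vector loop region.
VPValue &getSymbolicUF() { return UF; }
const VPValue &getSymbolicUF() const { return UF; }
```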
Move narrowInterleaveGroups to the general VPlan optimization stage.
To do so, narrowInterleaveGroups now has to find a suitable VF where all interleave groups are consecutive and saturate the full vector width.
If such a VF is found, the original VPlan is split into 2:
a) a new clone which contains all VFs of Plan, except VFToOptimize, and
b) the original Plan with VFToOptimize as single VF.
The original Plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans.
Together with #149702, this allows taking the narrowed interleave groups into account when interleaving.
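Condensed from the VPlan.cpp and VPlanTransforms.cpp hunks above, the symbolic-UF mechanics look roughly as follows (a sketch assembled from the diff, not a standalone patch):

```c++
// VPlan::prepareToExecute(): the new symbolic UF live-in is bound to the
// concrete unroll factor when the plan is executed.
this->UF.setUnderlyingValue(ConstantInt::get(TCTy, UF));

// narrowInterleaveGroups(): after narrowing, each vector iteration processes
// one original iteration per unrolled part, so the canonical IV now steps by
// the symbolic UF instead of VF * UF, and remaining uses of VF become 1.
auto *CanIV = Plan.getCanonicalIV();
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
Inc->setOperand(1, &Plan.getSymbolicUF());
Plan.getVF().replaceAllUsesWith(
    Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
```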