diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9c4f4246deb41..cfe429a775e68 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2465,6 +2465,33 @@ static void licm(VPlan &Plan) { R.moveBefore(*Preheader, Preheader->end()); } } + + // Sink any recipes which don't have any users in the region to the nearest + // common dominator of its users. + VPDominatorTree VPDT(Plan); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_post_order_shallow(LoopRegion->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + if (cannotHoistOrSinkRecipe(R)) + continue; + SmallSetVector UserVPBBs; + if (any_of(R.definedValues(), [&UserVPBBs](VPValue *V) { + return any_of(V->users(), [&UserVPBBs](VPUser *U) { + auto *UR = cast(U); + UserVPBBs.insert(UR->getParent()); + return UR->getParent()->getEnclosingLoopRegion(); + }); + })) + continue; + if (UserVPBBs.empty()) + continue; + VPBasicBlock *SinkVPBB = UserVPBBs.front(); + for (auto *UserVPBB : drop_begin(UserVPBBs)) + SinkVPBB = cast( + VPDT.findNearestCommonDominator(SinkVPBB, UserVPBB)); + R.moveBefore(*SinkVPBB, SinkVPBB->begin()); + } + } } void VPlanTransforms::truncateToMinimalBitwidths( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 40db6a53b49e4..46bb50dfd9799 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -94,10 +94,10 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]] @@ -106,15 +106,15 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64> -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -163,11 +163,11 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64> ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -271,7 +271,7 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) { ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 @@ -289,15 +289,15 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) { ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 4 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i32> [[TMP7]] to <4 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = shl <4 x i64> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = shl <4 x i64> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll index 9dfb987bd24a6..3951b88e41d55 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll @@ -42,11 +42,11 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] ; CHECK: [[PRED_UREM_CONTINUE6]]: ; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP11]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP14]], %[[PRED_UREM_IF5]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index b2be0e1d7a442..e80449432d29b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -493,12 +493,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVE1: for.exit: @@ -526,12 +526,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: @@ -553,12 +553,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-MAXBW: for.exit: @@ -1894,7 +1894,7 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -1911,12 +1911,12 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP8]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1946,12 +1946,12 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]]) -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT4]] to <8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT4]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP8]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 @@ -1963,7 +1963,7 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -1980,12 +1980,12 @@ define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32> ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP8]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2019,7 +2019,7 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2037,12 +2037,12 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2073,12 +2073,12 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[VEC_PHI1]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP11]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP6]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 @@ -2090,7 +2090,7 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2108,12 +2108,12 @@ define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2147,7 +2147,7 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2165,12 +2165,12 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2201,12 +2201,12 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP11]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 @@ -2218,7 +2218,7 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2236,12 +2236,12 @@ define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) ; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2275,7 +2275,7 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2293,12 +2293,12 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2329,12 +2329,12 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP11]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 @@ -2346,7 +2346,7 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 @@ -2364,12 +2364,12 @@ define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 ; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP10]] to <8 x i64> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) ; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 5be1fac8c6287..ec93f05089f24 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -778,12 +778,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVE1: for.exit: @@ -811,12 +811,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: @@ -838,12 +838,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP9]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-MAXBW: for.exit: @@ -878,28 +878,28 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add [[TMP16]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP17]], i32 [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP8]], i32 [[TMP25]] ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -910,7 +910,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], 2 ; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] @@ -919,22 +919,22 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP20]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add [[TMP30]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP20]], align 1 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add [[TMP12]], [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP11]], i32 [[TMP29]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: @@ -945,28 +945,28 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 1 -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add [[TMP20]], [[TMP19]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 1 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul [[TMP6]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add [[TMP7]], [[TMP6]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 16 ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP25]] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP8]], i32 [[TMP25]] ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll index e084307c0c2ae..70e771ff0f890 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -502,9 +502,9 @@ define float @fadd_predicated(ptr noalias nocapture %a, i64 %n) { ; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]] ; CHECK-UNORDERED: pred.load.continue2 ; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}} -; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]] ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd ; CHECK-UNORDERED: middle.block +; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]] ; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]]) ; CHECK-UNORDERED: for.end ; CHECK-UNORDERED: ret float %[[RDX]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index e4ba6fe9d757d..25dc1216b1817 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -636,47 +636,53 @@ for.end: ; Test for https://github.com/llvm/llvm-project/issues/159402. For invariant divisors, ; selects can be introduced outside the vector loop and their cost should not be ; considered for each loop iteration. -define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { +define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c, ptr %p) { ; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[X:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i16 [[Y:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv8i8() -; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i8 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i8 -12), [[TMP2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i16 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv4i8() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i8 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i8 -12), [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i8 [[TMP4]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.merge.nxv8i8( [[TMP0]], [[BROADCAST_SPLAT2]], splat (i8 1), i32 [[TMP3]]) -; CHECK-NEXT: [[TMP9:%.*]] = udiv [[VEC_IND]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = zext [[TMP9]] to -; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv8i16( [[TMP0]], [[BROADCAST_SPLAT4]], splat (i16 1), i32 [[TMP3]]) -; CHECK-NEXT: [[TMP12:%.*]] = sdiv [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = sext [[TMP12]] to -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i8 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[EVL_BASED_IV]] to i16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -12, [[DOTCAST]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vp.merge.nxv4i8( [[TMP0]], [[BROADCAST_SPLAT2]], splat (i8 1), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = udiv [[VEC_IND]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = zext [[TMP6]] to +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.merge.nxv4i16( [[TMP0]], [[BROADCAST_SPLAT4]], splat (i16 1), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = sext [[TMP9]] to +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], zeroinitializer, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], i16 [[OFFSET_IDX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[PREDPHI]], ptr align 4 [[TMP11]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP3]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1 ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0 -; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = extractelement [[PREDPHI]], i64 [[TMP17]] +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = extractelement [[PREDPHI]], i64 [[TMP17]] ; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 [[MERGE_LCSSA]] @@ -695,11 +701,15 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 +; FIXED-NEXT: [[OFFSET_IDX:%.*]] = add i16 -12, [[DOTCAST]] ; FIXED-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[VEC_IND]], [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16> ; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i16> [[TMP3]], [[TMP1]] ; FIXED-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> ; FIXED-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i16 [[OFFSET_IDX]] +; FIXED-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP8]], align 4 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; FIXED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 @@ -727,6 +737,8 @@ then: loop.latch: %merge = phi i32 [ 0, %loop.header ], [ %sd.ext, %then ] + %gep = getelementptr i32, ptr %p, i16 %iv + store i32 %merge, ptr %gep %iv.next = add nsw i16 %iv, 1 %ec = icmp eq i16 %iv.next, 0 %iv.next.trunc = trunc i16 %iv.next to i8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll index 164a5cd1ae3c0..1542e0b973cdb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll @@ -105,8 +105,6 @@ define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c ; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP19]], splat (i64 -7070675565921424023) ; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP20]], splat (i64 -4) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[TMP21]], align 1 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP11]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw i8, [[VECTOR_GEP]], i64 12 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[TMP16]], [[BROADCAST_SPLAT7]] ; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]] ; CHECK-NEXT: [[TMP27:%.*]] = mul i64 12, [[TMP26]] @@ -114,6 +112,8 @@ define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr nusw i8, [[VECTOR_GEP]], i64 12 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[TMP30]], [[BROADCAST_SPLAT7]] ; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP26]], 1 ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 651e2ad5e74da..2f90e0154e631 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -325,13 +325,13 @@ define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { ; CHECK-NEXT: [[TMP68:%.*]] = sext <8 x i32> [[TMP67]] to <8 x i64> ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], <8 x i64> [[TMP68]], i32 2 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP69]], <8 x i1> [[TMP1]], <8 x i64> poison) -; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[TMP70]], <8 x i64> zeroinitializer ; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP71]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP73:%.*]] = or <8 x i64> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[TMP73]], <8 x i64> zeroinitializer ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i64> [[PREDPHI]], i32 7 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP63]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -684,275 +684,67 @@ exit: } ; Test for https://github.com/llvm/llvm-project/issues/129236. -define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { +define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a, ptr %p) { ; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev( -; CHECK-NEXT: iter.check: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 ; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 ; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[CMP_I]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP60:%.*]] = xor <32 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[CMP_I]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE62:%.*]] ] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP60]], i32 0 -; CHECK-NEXT: br i1 [[TMP61]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[LOOP_HEADER1]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i8 100, [[DOTCAST]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UREM_IF:%.*]], label [[LOOP_LATCH:%.*]] ; CHECK: pred.urem.if: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]] +; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: pred.urem.continue: -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x i1> [[TMP60]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UREM_IF1:%.*]], label [[PRED_UREM_CONTINUE2:%.*]] ; CHECK: pred.urem.if1: ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE2]] ; CHECK: pred.urem.continue2: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i1> [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] ; CHECK: pred.urem.if3: ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE4]] ; CHECK: pred.urem.continue4: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <32 x i1> [[TMP60]], i32 3 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6]] ; CHECK: pred.urem.if5: ; CHECK-NEXT: br label [[PRED_UREM_CONTINUE6]] ; CHECK: pred.urem.continue6: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <32 x i1> [[TMP60]], i32 4 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]] -; CHECK: pred.urem.if7: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE8]] -; CHECK: pred.urem.continue8: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP60]], i32 5 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]] -; CHECK: pred.urem.if9: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE10]] -; CHECK: pred.urem.continue10: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i1> [[TMP60]], i32 6 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]] -; CHECK: pred.urem.if11: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE12]] -; CHECK: pred.urem.continue12: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP60]], i32 7 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]] -; CHECK: pred.urem.if13: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE14]] -; CHECK: pred.urem.continue14: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <32 x i1> [[TMP60]], i32 8 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16:%.*]] -; CHECK: pred.urem.if15: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE16]] -; CHECK: pred.urem.continue16: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i1> [[TMP60]], i32 9 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UREM_IF17:%.*]], label [[PRED_UREM_CONTINUE18:%.*]] -; CHECK: pred.urem.if17: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE18]] -; CHECK: pred.urem.continue18: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <32 x i1> [[TMP60]], i32 10 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF19:%.*]], label [[PRED_UREM_CONTINUE20:%.*]] -; CHECK: pred.urem.if19: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE20]] -; CHECK: pred.urem.continue20: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP60]], i32 11 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_UREM_IF21:%.*]], label [[PRED_UREM_CONTINUE22:%.*]] -; CHECK: pred.urem.if21: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE22]] -; CHECK: pred.urem.continue22: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i1> [[TMP60]], i32 12 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_UREM_IF23:%.*]], label [[PRED_UREM_CONTINUE24:%.*]] -; CHECK: pred.urem.if23: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE24]] -; CHECK: pred.urem.continue24: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <32 x i1> [[TMP60]], i32 13 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF25:%.*]], label [[PRED_UREM_CONTINUE26:%.*]] -; CHECK: pred.urem.if25: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE26]] -; CHECK: pred.urem.continue26: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <32 x i1> [[TMP60]], i32 14 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_UREM_IF27:%.*]], label [[PRED_UREM_CONTINUE28:%.*]] -; CHECK: pred.urem.if27: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE28]] -; CHECK: pred.urem.continue28: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i1> [[TMP60]], i32 15 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF29:%.*]], label [[PRED_UREM_CONTINUE30:%.*]] -; CHECK: pred.urem.if29: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE30]] -; CHECK: pred.urem.continue30: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP60]], i32 16 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_UREM_IF31:%.*]], label [[PRED_UREM_CONTINUE32:%.*]] -; CHECK: pred.urem.if31: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE32]] -; CHECK: pred.urem.continue32: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i1> [[TMP60]], i32 17 -; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UREM_IF33:%.*]], label [[PRED_UREM_CONTINUE34:%.*]] -; CHECK: pred.urem.if33: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE34]] -; CHECK: pred.urem.continue34: -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <32 x i1> [[TMP60]], i32 18 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF35:%.*]], label [[PRED_UREM_CONTINUE36:%.*]] -; CHECK: pred.urem.if35: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE36]] -; CHECK: pred.urem.continue36: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <32 x i1> [[TMP60]], i32 19 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_UREM_IF37:%.*]], label [[PRED_UREM_CONTINUE38:%.*]] -; CHECK: pred.urem.if37: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE38]] -; CHECK: pred.urem.continue38: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP60]], i32 20 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF39:%.*]], label [[PRED_UREM_CONTINUE40:%.*]] -; CHECK: pred.urem.if39: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE40]] -; CHECK: pred.urem.continue40: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i1> [[TMP60]], i32 21 -; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_UREM_IF41:%.*]], label [[PRED_UREM_CONTINUE42:%.*]] -; CHECK: pred.urem.if41: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE42]] -; CHECK: pred.urem.continue42: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP60]], i32 22 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_UREM_IF43:%.*]], label [[PRED_UREM_CONTINUE44:%.*]] -; CHECK: pred.urem.if43: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE44]] -; CHECK: pred.urem.continue44: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <32 x i1> [[TMP60]], i32 23 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF45:%.*]], label [[PRED_UREM_CONTINUE46:%.*]] -; CHECK: pred.urem.if45: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE46]] -; CHECK: pred.urem.continue46: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP60]], i32 24 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_UREM_IF47:%.*]], label [[PRED_UREM_CONTINUE48:%.*]] -; CHECK: pred.urem.if47: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE48]] -; CHECK: pred.urem.continue48: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <32 x i1> [[TMP60]], i32 25 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF49:%.*]], label [[PRED_UREM_CONTINUE50:%.*]] -; CHECK: pred.urem.if49: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE50]] -; CHECK: pred.urem.continue50: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP60]], i32 26 -; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_UREM_IF51:%.*]], label [[PRED_UREM_CONTINUE52:%.*]] -; CHECK: pred.urem.if51: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE52]] -; CHECK: pred.urem.continue52: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i1> [[TMP60]], i32 27 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_UREM_IF53:%.*]], label [[PRED_UREM_CONTINUE54:%.*]] -; CHECK: pred.urem.if53: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE54]] -; CHECK: pred.urem.continue54: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP60]], i32 28 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_UREM_IF55:%.*]], label [[PRED_UREM_CONTINUE56:%.*]] -; CHECK: pred.urem.if55: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE56]] -; CHECK: pred.urem.continue56: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <32 x i1> [[TMP60]], i32 29 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_UREM_IF57:%.*]], label [[PRED_UREM_CONTINUE58:%.*]] -; CHECK: pred.urem.if57: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE58]] -; CHECK: pred.urem.continue58: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP60]], i32 30 -; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_UREM_IF59:%.*]], label [[PRED_UREM_CONTINUE60:%.*]] -; CHECK: pred.urem.if59: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE60]] -; CHECK: pred.urem.continue60: -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP60]], i32 31 -; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_UREM_IF61:%.*]], label [[PRED_UREM_CONTINUE62]] -; CHECK: pred.urem.if61: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE62]] -; CHECK: pred.urem.continue62: -; CHECK-NEXT: [[TMP33:%.*]] = select <32 x i1> [[TMP60]], <32 x i1> poison, <32 x i1> zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = or <32 x i1> [[TMP33]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[CMP_I]], <32 x i32> zeroinitializer, <32 x i32> poison -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i32> [[PREDPHI]], i32 0 -; CHECK-NEXT: [[TMP36:%.*]] = ashr i32 [[CONV5_I]], [[TMP35]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <32 x i32> poison, i32 [[TMP36]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT63]], <32 x i32> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp eq <32 x i32> [[BROADCAST_SPLAT64]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = shl <32 x i32> [[PREDPHI]], splat (i32 24) -; CHECK-NEXT: [[TMP39:%.*]] = ashr exact <32 x i32> [[TMP38]], splat (i32 24) -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i1> [[TMP37]], i32 0 -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], <32 x i32> [[TMP39]], <32 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI65:%.*]] = select <32 x i1> [[TMP34]], <32 x i32> [[TMP41]], <32 x i32> zeroinitializer -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 -; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> poison, <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[CMP_I]], <4 x i32> zeroinitializer, <4 x i32> poison +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = ashr i32 [[CONV5_I]], [[TMP7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 24) +; CHECK-NEXT: [[TMP11:%.*]] = ashr exact <4 x i32> [[TMP10]], splat (i32 24) +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P:%.*]], i8 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i64 -3 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[PREDPHI9]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[PREDPHI65]], i32 31 -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT66:%.*]] = insertelement <4 x i1> poison, i1 [[CMP_I]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT67:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT66]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT67]], splat (i1 true) -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX68:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT81:%.*]], [[PRED_UREM_CONTINUE76:%.*]] ] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP44]], i32 0 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_UREM_IF69:%.*]], label [[PRED_UREM_CONTINUE70:%.*]] -; CHECK: pred.urem.if69: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE70]] -; CHECK: pred.urem.continue70: -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i1> [[TMP44]], i32 1 -; CHECK-NEXT: br i1 [[TMP46]], label [[PRED_UREM_IF71:%.*]], label [[PRED_UREM_CONTINUE72:%.*]] -; CHECK: pred.urem.if71: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE72]] -; CHECK: pred.urem.continue72: -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP44]], i32 2 -; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_UREM_IF73:%.*]], label [[PRED_UREM_CONTINUE74:%.*]] -; CHECK: pred.urem.if73: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE74]] -; CHECK: pred.urem.continue74: -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP44]], i32 3 -; CHECK-NEXT: br i1 [[TMP48]], label [[PRED_UREM_IF75:%.*]], label [[PRED_UREM_CONTINUE76]] -; CHECK: pred.urem.if75: -; CHECK-NEXT: br label [[PRED_UREM_CONTINUE76]] -; CHECK: pred.urem.continue76: -; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = or <4 x i1> [[TMP49]], [[BROADCAST_SPLAT67]] -; CHECK-NEXT: [[PREDPHI77:%.*]] = select i1 [[CMP_I]], <4 x i32> zeroinitializer, <4 x i32> poison -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[PREDPHI77]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[CONV5_I]], [[TMP51]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT78:%.*]] = insertelement <4 x i32> poison, i32 [[TMP52]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT79:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT78]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT79]], zeroinitializer -; CHECK-NEXT: [[TMP54:%.*]] = shl <4 x i32> [[PREDPHI77]], splat (i32 24) -; CHECK-NEXT: [[TMP55:%.*]] = ashr exact <4 x i32> [[TMP54]], splat (i32 24) -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP53]], i32 0 -; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], <4 x i32> [[TMP55]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI80:%.*]] = select <4 x i1> [[TMP50]], <4 x i32> [[TMP57]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDEX_NEXT81]] = add nuw i32 [[INDEX68]], 4 -; CHECK-NEXT: [[TMP58:%.*]] = icmp eq i32 [[INDEX_NEXT81]], 100 -; CHECK-NEXT: br i1 [[TMP58]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i32> [[PREDPHI80]], i32 3 -; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 4, [[VEC_EPILOG_ITER_CHECK]] ], [ 100, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] -; CHECK: then: -; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER1]] ] -; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] -; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 -; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 -; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: else: -; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]] -; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1 -; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]] -; CHECK: loop.latch: -; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[P_2_LCSSA:%.*]] = extractelement <4 x i32> [[PREDPHI9]], i32 3 +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ [[TMP59]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[P_2_LCSSA]] ; entry: @@ -981,6 +773,8 @@ else: loop.latch: %p.2 = phi i32 [ 0, %else ], [ %3, %then ] + %gep = getelementptr i32, ptr %p, i8 %iv + store i32 %p.2, ptr %gep %iv.next = add i8 %iv, -1 %ec = icmp eq i8 %iv.next, 0 br i1 %ec, label %exit, label %loop.header @@ -1078,7 +872,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { ; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP42]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -1098,7 +892,7 @@ define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { ; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 16 -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1432,17 +1226,17 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE62]] ; CHECK: pred.udiv.continue62: ; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] -; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) ; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP207:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <32 x i64> zeroinitializer, <32 x i64> [[TMP207]] ; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i32 31 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF14:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 @@ -1528,13 +1322,13 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE84]] ; CHECK: pred.udiv.continue84: ; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] -; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> -; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[C]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] ; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) ; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 -; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP210:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> +; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[C]], <8 x i64> zeroinitializer, <8 x i64> [[TMP210]] ; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7 ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -1552,7 +1346,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[MERGE_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index d19ae728cc913..1f33d63d2073c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -32,12 +32,12 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[REVERSE1]] ; CHECK-NEXT: [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]]) ; CHECK-NEXT: br label [[FOR_INC35:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 04bff3c393f62..1a8fe49d7a0b6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -402,15 +402,15 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <16 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <16 x i64> [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i64> [[TMP0]] to <16 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i16> [[VEC_IND1]], [[TMP1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <16 x i16> [[VEC_IND1]], splat (i16 16) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i16> [[VEC_IND1]], [[TMP6]] ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i16> [[TMP2]], i32 15 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 7e5964ac30cba..faf90a79c8354 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -37,12 +37,12 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[BROADCAST_SPLAT]], <16 x i1> [[TMP1]], <16 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP10]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -64,12 +64,12 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT12]], ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[BROADCAST_SPLAT9]], <8 x i1> [[TMP5]], <8 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 -; CHECK-NEXT: [[PREDPHI14:%.*]] = select i1 [[TMP10]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX10]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[PREDPHI14:%.*]] = select i1 [[TMP9]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI14]], i32 7 ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[CMP_N16]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll index 57cbe7f4c241b..1302895080958 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll @@ -6,104 +6,24 @@ target triple = "x86_64" define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK-LABEL: define i8 @pr141968( ; CHECK-SAME: i1 [[COND:%.*]], i8 [[V:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ZEXT_TRUE:%.*]] = zext i1 true to i16 ; CHECK-NEXT: [[SEXT:%.*]] = sext i8 [[V]] to i16 -; CHECK-NEXT: br label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[COND]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE30:.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] -; CHECK: [[PRED_SDIV_IF]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] -; CHECK: [[PRED_SDIV_CONTINUE]]: -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]] -; CHECK: [[PRED_SDIV_IF1]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]] -; CHECK: [[PRED_SDIV_CONTINUE2]]: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] -; CHECK: [[PRED_SDIV_IF3]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] -; CHECK: [[PRED_SDIV_CONTINUE4]]: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]] -; CHECK: [[PRED_SDIV_IF5]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] -; CHECK: [[PRED_SDIV_CONTINUE6]]: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 -; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_SDIV_IF7:.*]], label %[[PRED_SDIV_CONTINUE8:.*]] -; CHECK: [[PRED_SDIV_IF7]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE8]] -; CHECK: [[PRED_SDIV_CONTINUE8]]: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_SDIV_IF9:.*]], label %[[PRED_SDIV_CONTINUE10:.*]] -; CHECK: [[PRED_SDIV_IF9]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE10]] -; CHECK: [[PRED_SDIV_CONTINUE10]]: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_SDIV_IF11:.*]], label %[[PRED_SDIV_CONTINUE12:.*]] -; CHECK: [[PRED_SDIV_IF11]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE12]] -; CHECK: [[PRED_SDIV_CONTINUE12]]: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_SDIV_IF13:.*]], label %[[PRED_SDIV_CONTINUE14:.*]] -; CHECK: [[PRED_SDIV_IF13]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE14]] -; CHECK: [[PRED_SDIV_CONTINUE14]]: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF15:.*]], label %[[PRED_SDIV_CONTINUE16:.*]] -; CHECK: [[PRED_SDIV_IF15]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE16]] -; CHECK: [[PRED_SDIV_CONTINUE16]]: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_SDIV_IF17:.*]], label %[[PRED_SDIV_CONTINUE18:.*]] -; CHECK: [[PRED_SDIV_IF17]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE18]] -; CHECK: [[PRED_SDIV_CONTINUE18]]: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_SDIV_IF19:.*]], label %[[PRED_SDIV_CONTINUE20:.*]] -; CHECK: [[PRED_SDIV_IF19]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE20]] -; CHECK: [[PRED_SDIV_CONTINUE20]]: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_SDIV_IF21:.*]], label %[[PRED_SDIV_CONTINUE22:.*]] -; CHECK: [[PRED_SDIV_IF21]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE22]] -; CHECK: [[PRED_SDIV_CONTINUE22]]: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_SDIV_IF23:.*]], label %[[PRED_SDIV_CONTINUE24:.*]] -; CHECK: [[PRED_SDIV_IF23]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE24]] -; CHECK: [[PRED_SDIV_CONTINUE24]]: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF25:.*]], label %[[PRED_SDIV_CONTINUE26:.*]] -; CHECK: [[PRED_SDIV_IF25]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE26]] -; CHECK: [[PRED_SDIV_CONTINUE26]]: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_SDIV_IF27:.*]], label %[[PRED_SDIV_CONTINUE28:.*]] -; CHECK: [[PRED_SDIV_IF27]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE28]] -; CHECK: [[PRED_SDIV_CONTINUE28]]: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_SDIV_IF29:.*]], label %[[PRED_SDIV_CONTINUE30]] -; CHECK: [[PRED_SDIV_IF29]]: -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE30]] -; CHECK: [[PRED_SDIV_CONTINUE30]]: -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], i8 0, i8 [[V]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP_LATCH]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: [[SDIV:%.*]] = sdiv i16 [[SEXT]], [[ZEXT_TRUE]] +; CHECK-NEXT: [[SDIV_TRUNC:%.*]] = trunc i16 [[SDIV]] to i8 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[RET:%.*]] = phi i8 [ [[SDIV_TRUNC]], %[[COND_FALSE]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i8 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PREDPHI:%.*]] = phi i8 [ [[RET]], %[[LOOP_LATCH]] ] ; CHECK-NEXT: ret i8 [[PREDPHI]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index 368842634c374..2307213677ed9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -73,13 +73,13 @@ define void @test(ptr %p) { ; VEC-NEXT: store i64 0, ptr [[TMP24]], align 8 ; VEC-NEXT: store i64 0, ptr [[TMP25]], align 8 ; VEC-NEXT: store i64 0, ptr [[TMP26]], align 8 -; VEC-NEXT: [[TMP27:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) -; VEC-NEXT: [[TMP28:%.*]] = zext <4 x i16> [[TMP27]] to <4 x i64> ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; VEC-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC: middle.block: +; VEC-NEXT: [[TMP29:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) +; VEC-NEXT: [[TMP28:%.*]] = zext <4 x i16> [[TMP29]] to <4 x i64> ; VEC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP28]], i32 3 ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; VEC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll index 715d6db50488f..cb8ee2513284f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll @@ -207,7 +207,6 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP16]] ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP17]] ; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP10]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4 @@ -216,12 +215,13 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP26]], i32 2 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP32:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP32]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP10]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP36:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP32]], <4 x float> [[TMP36]]) ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index 3813560d9300a..d64965bfe93e5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -462,38 +462,30 @@ define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %pt ; I64-NEXT: [[ENTRY:.*]]: ; I64-NEXT: br label %[[OUTER_LOOP:.*]] ; I64: [[OUTER_LOOP_LOOPEXIT:.*]]: -; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ] +; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], [[INNER_LOOP:%.*]] ] ; I64-NEXT: br label %[[OUTER_LOOP]] ; I64: [[OUTER_LOOP]]: ; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ] ; I64-NEXT: [[COND:%.*]] = call i1 @cond() -; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], [[EXIT:label %.*]] ; I64: [[INNER_LOOP_PREHEADER]]: -; I64-NEXT: br label %[[INNER_LOOP]] -; I64: [[INNER_LOOP]]: -; I64-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ] -; I64-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[MUL1:%.*]], %[[INNER_LOOP]] ], [ [[ACCUM]], %[[INNER_LOOP_PREHEADER]] ] -; I64-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 1 -; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]] -; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]] -; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 -; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]] +; I64-NEXT: br label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ] ; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8 ; I64-NEXT: [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00 -; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 -; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8 -; I64-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00 -; I64-NEXT: [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00 -; I64-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00 -; I64-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00 -; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8 -; I64-NEXT: [[DIV:%.*]] = fdiv double [[TMP24]], [[ADD3]] -; I64-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]] -; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[ADD1]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], zeroinitializer +; I64-NEXT: [[IV]] = add nuw i64 [[INDEX]], 2 ; I64-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 100 -; I64-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP_LOOPEXIT]], label %[[INNER_LOOP]] -; I64: [[EXIT]]: -; I64-NEXT: ret double [[ACCUM]] +; I64-NEXT: br i1 [[EXITCOND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; I64-NEXT: br label %[[SCALAR_PH:.*]] +; I64: [[SCALAR_PH]]: ; ; I32-LABEL: define double @test_load_used_by_other_load_scev( ; I32-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { @@ -966,7 +958,7 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s ; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; I64-NEXT: br i1 [[TMP92]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; I64-NEXT: br i1 [[TMP92]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; I64: [[MIDDLE_BLOCK]]: ; I64-NEXT: br label %[[SCALAR_PH:.*]] ; I64: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index 4c3fd29a56021..a3d9039073779 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -139,11 +139,11 @@ define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, ; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP7]], <8 x i1> [[TMP4]], <8 x i32> poison) ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP10]] = add <8 x i32> [[TMP9]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll index cf2e7ccd1b2f0..15622f9eb961c 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll @@ -94,9 +94,6 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i16> [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i64 4 ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4 @@ -106,6 +103,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i16> [[TMP7]], [[TMP7]] +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[FOR_END]]: @@ -152,7 +152,6 @@ define void @sink_dead_inst(ptr %a) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]] ; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5) ; CHECK-NEXT: [[TMP4]] = add <4 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> @@ -168,6 +167,7 @@ define void @sink_dead_inst(ptr %a) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: br label %[[SCALAR_PH:.*]] @@ -221,13 +221,13 @@ define void @unused_recurrence(ptr %a) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) -; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP3]], splat (i16 5) ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; CHECK-NEXT: br label %[[SCALAR_PH:.*]] ; CHECK: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll index 40587c0c8b68c..cabd19f5c8ff3 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll @@ -24,11 +24,11 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) { ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP10]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index b72cbd333cb79..5e03600574545 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -207,13 +207,10 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%and.red> = phi vp<[[RDX_START]]>, ir<%and.red.next> -; CHECK-NEXT: EMIT vp<[[WIDEN_CAN:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> ; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add> -; CHECK-NEXT: EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]>, ir<%and.red.next>, ir<%and.red> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -221,6 +218,9 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[WIDEN_CAN:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]> +; CHECK-NEXT: EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]>, ir<%and.red.next>, ir<%and.red> ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll index 372876c5faac6..408573273b962 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll @@ -134,18 +134,18 @@ define i16 @for_phi_removed(ptr %src) { ; UNROLL-NO-IC: [[VECTOR_BODY]]: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104 ; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0 +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i16 1, i16 0 ; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]] ; UNROLL-NO-IC: [[SCALAR_PH]]: ; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]] ; UNROLL-NO-IC: [[LOOP]]: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP4]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 @@ -165,18 +165,18 @@ define i16 @for_phi_removed(ptr %src) { ; UNROLL-NO-VF: [[VECTOR_BODY]]: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 110 ; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NO-VF: [[MIDDLE_BLOCK]]: +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i16 1, i16 0 ; UNROLL-NO-VF-NEXT: br label %[[SCALAR_PH:.*]] ; UNROLL-NO-VF: [[SCALAR_PH]]: ; UNROLL-NO-VF-NEXT: br label %[[LOOP:.*]] ; UNROLL-NO-VF: [[LOOP]]: ; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ 110, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; UNROLL-NO-VF-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[P:%.*]] = phi i16 [ [[TMP4]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; UNROLL-NO-VF-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-VF-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; UNROLL-NO-VF-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 @@ -196,18 +196,18 @@ define i16 @for_phi_removed(ptr %src) { ; SINK-AFTER: [[VECTOR_BODY]]: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108 ; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SINK-AFTER: [[MIDDLE_BLOCK]]: +; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0 +; SINK-AFTER-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i16 1, i16 0 ; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]] ; SINK-AFTER: [[SCALAR_PH]]: ; SINK-AFTER-NEXT: br label %[[LOOP:.*]] ; SINK-AFTER: [[LOOP]]: ; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[TMP4]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; SINK-AFTER-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; SINK-AFTER-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 063f47ce2b32d..e0fced93fef2e 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -924,12 +924,11 @@ define i32 @PR27246() { ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[I_016]], [[INDEX]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -946,7 +945,7 @@ define i32 @PR27246() { ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 ; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1264,12 +1263,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x, i32 %n) { ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] @@ -1301,13 +1300,11 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x, i32 %n) { ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add i32 [[VECTOR_RECUR]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[TMP6]], [[X]] ; UNROLL-NO-VF-NEXT: [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2 ; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -1323,7 +1320,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x, i32 %n) { ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], [[N]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-VF: for.end: -; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[VAL_PHI_LCSSA]] ; ; SINK-AFTER-LABEL: @extract_second_last_iteration( @@ -1340,12 +1337,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x, i32 %n) { ; SINK-AFTER: vector.body: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 ; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] @@ -2441,7 +2438,6 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]] ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5) ; UNROLL-NO-IC-NEXT: [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5) ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> @@ -2457,6 +2453,7 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 ; UNROLL-NO-IC-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; UNROLL-NO-IC-NEXT: br label [[SCALAR_PH:%.*]] @@ -2490,7 +2487,6 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i16 [[TMP2]], 5 ; UNROLL-NO-VF-NEXT: [[TMP6]] = add i16 [[TMP3]], 5 ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10 @@ -2503,6 +2499,7 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 ; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; UNROLL-NO-VF-NEXT: br label [[SCALAR_PH:%.*]] ; UNROLL-NO-VF: scalar.ph: ; UNROLL-NO-VF-NEXT: br label [[FOR_COND:%.*]] @@ -2533,7 +2530,6 @@ define void @sink_dead_inst(ptr %a) { ; SINK-AFTER-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 ; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]] ; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) -; SINK-AFTER-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> ; SINK-AFTER-NEXT: [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5) ; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10) @@ -2544,6 +2540,7 @@ define void @sink_dead_inst(ptr %a) { ; SINK-AFTER-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 ; SINK-AFTER-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; SINK-AFTER-NEXT: br label [[SCALAR_PH:%.*]] @@ -2687,12 +2684,12 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]] ; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]] -; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] +; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] @@ -2736,12 +2733,12 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] ; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] ; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-VF: bb1: @@ -2808,11 +2805,11 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] ; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]] -; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: @@ -3003,13 +3000,13 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: store i32 [[TMP9]], ptr [[TMP71]], align 4 ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE29]] ; UNROLL-NO-IC: pred.store.continue29: -; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] +; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]] ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] @@ -3066,12 +3063,12 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: store i32 [[TMP3]], ptr [[TMP15]], align 4 ; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE6]] ; UNROLL-NO-VF: pred.store.continue6: -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = select i1 [[TMP4]], i32 [[TMP10]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = select i1 [[TMP4]], i32 [[TMP10]], i32 [[VEC_PHI]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-VF: bb1: @@ -3168,12 +3165,12 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: store i32 [[TMP5]], ptr [[TMP36]], align 4 ; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE12]] ; SINK-AFTER: pred.store.continue12: -; SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP28:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]]) ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: @@ -3214,9 +3211,6 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]] ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i64 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4 @@ -3226,6 +3220,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP5]], [[TMP5]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i32> ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: for.end: @@ -3239,18 +3236,18 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = zext i16 [[TMP5]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]] -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP3]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP8]], align 4 ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16 ; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i16 [[TMP4]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = zext i16 [[TMP5]] to i32 ; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret i32 [[TMP10]] @@ -3264,9 +3261,6 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) -; SINK-AFTER-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]] -; SINK-AFTER-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]] ; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -3274,6 +3268,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) +; SINK-AFTER-NEXT: [[TMP5:%.*]] = or <4 x i16> [[TMP2]], [[TMP2]] +; SINK-AFTER-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SINK-AFTER-NEXT: br label [[LOOP:%.*]] ; SINK-AFTER: for.end: @@ -3322,13 +3319,13 @@ define void @unused_recurrence(ptr %a, i16 %n) { ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5) ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP7]], splat (i16 5) ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3362,15 +3359,14 @@ define void @unused_recurrence(ptr %a, i16 %n) { ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i16 -; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST2]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add i16 [[TMP8]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP9]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i16 +; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST2]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add i16 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP9]], 5 ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3404,13 +3400,13 @@ define void @unused_recurrence(ptr %a, i16 %n) { ; SINK-AFTER: vector.body: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[TMP7:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) -; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP7]], splat (i16 5) ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[TMP7:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) +; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP7]], splat (i16 5) ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 6d56b72e370c0..01c9be50de708 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -12,11 +12,11 @@ define i32 @one_direct_branch(ptr %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: @@ -53,11 +53,11 @@ define i32 @two_direct_branch(ptr %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: @@ -101,14 +101,14 @@ define i32 @cond_branch(i32 %a, ptr %src) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> splat (i32 10) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> splat (i32 10) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 2200a7d0431d2..e66afb89b92a0 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -393,14 +393,14 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC4VF4-NEXT: [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[TMP105]], <4 x i16> [[VEC_PHI1]] ; IC4VF4-NEXT: [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[TMP106]], <4 x i16> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[TMP107]], <4 x i16> [[VEC_PHI3]] -; IC4VF4-NEXT: [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]] -; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]] -; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] -; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4) ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]] +; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]] +; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] +; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]]) ; IC4VF4-NEXT: [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]]) ; IC4VF4-NEXT: [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]]) @@ -719,14 +719,14 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC4VF4-NEXT: [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[TMP105]], <4 x i16> [[VEC_PHI1]] ; IC4VF4-NEXT: [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[TMP106]], <4 x i16> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[TMP107]], <4 x i16> [[VEC_PHI3]] -; IC4VF4-NEXT: [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]] -; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]] -; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] -; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4) ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]] +; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]] +; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] +; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]]) ; IC4VF4-NEXT: [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]]) ; IC4VF4-NEXT: [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]]) diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 8bf98f043580b..0203fc50858bd 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -212,25 +212,24 @@ define ptr @both(ptr %p, i32 %k) { ; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 4 -; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP8]] ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; INTERLEAVE: [[VECTOR_BODY]]: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4 -; INTERLEAVE-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP8]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; INTERLEAVE-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 -4 +; INTERLEAVE-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 -4 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; INTERLEAVE: [[SCALAR_PH]]: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] -; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[NEXT_GEP1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[NEXT_GEP1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[FOR_BODY:.*]] ; INTERLEAVE: [[FOR_BODY]]: ; INTERLEAVE-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] @@ -643,12 +642,12 @@ define i32 @postinc_not_iv_backedge_value(i32 %k) { ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VEC-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; VEC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; VEC-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] ; VEC-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -676,12 +675,11 @@ define i32 @postinc_not_iv_backedge_value(i32 %k) { ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; INTERLEAVE: [[VECTOR_BODY]]: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 1 -; INTERLEAVE-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; INTERLEAVE: [[SCALAR_PH]]: @@ -1138,13 +1136,13 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]] ; VEC-NEXT: store i16 0, ptr [[TMP2]], align 2 ; VEC-NEXT: store i16 0, ptr [[TMP3]], align 2 -; VEC-NEXT: [[TMP4:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 1) -; VEC-NEXT: [[TMP5:%.*]] = add <2 x i32> splat (i32 1), [[TMP4]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4) ; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 ; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 1) +; VEC-NEXT: [[TMP5:%.*]] = add <2 x i32> splat (i32 1), [[TMP8]] ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; VEC-NEXT: br label %[[E_EXIT:.*]] ; VEC: [[E_EXIT]]: @@ -1166,12 +1164,12 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]] ; INTERLEAVE-NEXT: store i16 0, ptr [[TMP2]], align 2 ; INTERLEAVE-NEXT: store i16 0, ptr [[TMP3]], align 2 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 1 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 ; INTERLEAVE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 1 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]] ; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]] ; INTERLEAVE: [[E_EXIT]]: ; INTERLEAVE-NEXT: ret i32 [[TMP5]] @@ -1293,10 +1291,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i64 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i64 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 -; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 -; VEC-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 +; VEC-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[EXIT:.*]] ; VEC: [[EXIT]]: ; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA1]] @@ -1312,10 +1310,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; INTERLEAVE-NEXT: store i16 1, ptr [[TMP0]], align 2 ; INTERLEAVE-NEXT: store i16 1, ptr [[TMP1]], align 2 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 1, -1 -; INTERLEAVE-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP2]], 1 ; INTERLEAVE-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 2, -1 +; INTERLEAVE-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP2]], 1 ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret i64 [[IV_1_NEXT_LCSSA1]] diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll index 5c6bccfb32d8a..890521c0791cc 100644 --- a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll +++ b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll @@ -89,14 +89,6 @@ define void @test3(ptr %p) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INC46:%.*]] = add i32 6, 1 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 7, 1 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 8, 1 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 9, 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[INC46]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 3 ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P:%.*]], i64 0, i64 6 @@ -123,6 +115,14 @@ define void @test3(ptr %p) { ; CHECK: pred.store.continue6: ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP4:%.*]] = add i32 6, 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 7, 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 8, 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 9, 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 3 ; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 false) ; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 ; CHECK-NEXT: [[INC46_LCSSA:%.*]] = extractelement <4 x i32> [[TMP11]], i64 [[LAST_ACTIVE_LANE]] diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll index 9c73d85bd58ff..5e2fb0c514d8d 100644 --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -23,7 +23,7 @@ define i32 @test1() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -35,21 +35,21 @@ define i32 @test1() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] +; CHECK: [[_LR_PH_I]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] +; CHECK: [[_LR_PH_I1:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -96,7 +96,7 @@ define i32 @test2() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -108,21 +108,21 @@ define i32 @test2() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> [[VEC_IND]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> splat (i32 1), <2 x i32> [[VEC_IND]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] +; CHECK: [[_LR_PH_I]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] +; CHECK: [[_LR_PH_I1:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -169,7 +169,7 @@ define i32 @test3(i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -183,24 +183,24 @@ define i32 @test3(i32 %N) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <2 x i32> [[VEC_IND]], splat (i32 10) ; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP5]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> zeroinitializer, <2 x i32> splat (i32 2) ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[PREDPHI]], <2 x i32> splat (i32 1) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[PREDPHI1]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] -; CHECK: [[_LR_PH_I]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] +; CHECK: [[_LR_PH_I1]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I1:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] +; CHECK: [[_LR_PH_I:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -257,7 +257,7 @@ define i32 @test4(i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -269,21 +269,21 @@ define i32 @test4(i32 %N) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] +; CHECK: [[_LR_PH_I]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[DOTLR_PH_I_PREHEADER]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] +; CHECK: [[_LR_PH_I1:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -519,7 +519,7 @@ define i8 @outside_user_non_phi() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -531,22 +531,22 @@ define i8 @outside_user_non_phi() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[PREDPHI]] to <2 x i8> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[PREDPHI]] to <2 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP4]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] +; CHECK: [[_LR_PH_I]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] +; CHECK: [[_LR_PH_I1:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -650,14 +650,14 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[C1]], [[B2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[C1]], [[A3]] ; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i32 [[TMP3]], 8 ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[_LR_PH_I1]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[_LR_PH_I]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -680,12 +680,12 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] +; CHECK: [[_LR_PH_I]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ], [ [[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] +; CHECK: [[_LR_PH_I1:.*:]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = sext i32 [[IV]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[BLOAD:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll index 18803e71f1041..d4d21c6b6e243 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -32,11 +32,11 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF-TWO-CHECK: middle.block: +; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -54,11 +54,11 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x i32>, ptr [[TMP9]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX6]] ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, ptr [[TMP11]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]] ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 ; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: +; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]] ; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 ; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 08903351aa2f0..9651e1cd2dd04 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -484,13 +484,13 @@ define i32 @pr45526() optsize { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 @@ -512,13 +512,13 @@ define i32 @pr45526() optsize { ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 ; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: +; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 false) ; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 ; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 @@ -540,13 +540,13 @@ define i32 @pr45526() optsize { ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; NPGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 ; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: +; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; NPGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 false) ; NPGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 ; NPGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 @@ -584,13 +584,13 @@ define i32 @pr45526_pgso() !prof !14 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 @@ -612,13 +612,13 @@ define i32 @pr45526_pgso() !prof !14 { ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 ; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: +; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) ; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 false) ; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 ; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 @@ -639,12 +639,12 @@ define i32 @pr45526_pgso() !prof !14 { ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508 ; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: +; NPGSO-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) ; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; NPGSO-NEXT: br label %[[SCALAR_PH:.*]] ; NPGSO: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index d96134e8adf1d..6560dfcfe33c8 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -642,6 +642,12 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]] +; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; STRIDED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; STRIDED: middle.block: ; STRIDED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[TMP1]] ; STRIDED-NEXT: [[TMP4:%.*]] = mul i64 0, [[TMP1]] ; STRIDED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], [[TMP4]] @@ -652,13 +658,7 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]] ; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]] ; STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]] -; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]] -; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 ; STRIDED-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[NEXT_GEP3]], i64 [[STRIDE]] -; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; STRIDED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; STRIDED: middle.block: ; STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 3 ; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; STRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll index 310c7729a6b63..23f1927185883 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll @@ -12,12 +12,12 @@ define i16 @duplicate_lcssa(i16 %val) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sub nsw <4 x i16> [[VEC_IND]], splat (i16 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4) ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536 ; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub nsw <4 x i16> [[VEC_IND]], splat (i16 1) ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll index 70a808f9b336e..1d1562e2d429d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll @@ -133,9 +133,9 @@ define i32 @test3(i64 %y) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[Y:%.*]], 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> , <4 x i32> splat (i32 55) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> , <4 x i32> splat (i32 55) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 false) ; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll index 615ea062afd53..ab2965abc2a0d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll @@ -55,11 +55,11 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize { ; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i16> [[TMP22]], [[TMP23]] ; CHECK-NEXT: [[TMP25]] = add <2 x i16> [[VEC_PHI]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]] ; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP26]]) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 8d14299a02878..07623a1b0d19d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -23,7 +23,6 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20) ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI5:%.*]] = select i1 [[C_2]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9) ; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] ; CHECK-NEXT: [[PREDPHI7]] = select i1 [[C_2]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 @@ -31,6 +30,7 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[PREDPHI5:%.*]] = select i1 [[C_2]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll index 1e093407620d5..be9e53c59aaf4 100644 --- a/llvm/test/Transforms/LoopVectorize/pr66616.ll +++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll @@ -13,11 +13,11 @@ define void @pr66616(ptr %ptr) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 ; CHECK-NEXT: br label [[LOOP_1:%.*]] ; CHECK: preheader: ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 -1, [[TMP0]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll b/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll index e4322cfcc00ac..c9cc8060ff498 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll @@ -94,11 +94,11 @@ define i32 @chained_smax(i32 %x, ptr %src) { ; CHECK: [[PRED_LOAD_CONTINUE6]]: ; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i32> [ [[TMP19]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP26]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP25]], <4 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP27]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll index b51db48c1c6ed..186c64c924233 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll @@ -16,18 +16,18 @@ define i32 @foo() !prof !1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_5:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]] +; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9) -; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]] -; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]] ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index caf1a934ed70a..e2ae1b82b16b1 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -21,8 +21,6 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] -; CHECK-NEXT: [[PREDPHI1:%.*]] = select i1 [[C]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255) ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> @@ -32,6 +30,8 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select i1 [[C]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 49700dd880106..dd80e391fc66c 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -632,12 +632,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF1: [[VECTOR_BODY]]: ; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF1-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; CHECK-VF4UF1-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4UF1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4UF1: [[MIDDLE_BLOCK]]: +; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-VF4UF1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4 ; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 @@ -679,12 +679,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[STEP_ADD:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[VEC_IND_NEXT:%.*]] = add [[STEP_ADD]], [[BROADCAST_SPLAT2]] -; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = add [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]] ; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] ; CHECK-VF4UF2-NEXT: [[VEC_IND_NEXT1]] = add [[VEC_IND_NEXT]], [[BROADCAST_SPLAT2]] ; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4UF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4UF2: [[MIDDLE_BLOCK]]: +; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = add [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]] ; CHECK-VF4UF2-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4 ; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll index 1f5646d2a3090..bdbfcf842d507 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -24,17 +24,17 @@ define i32 @test(i64 %N, i32 %x) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP2]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_PHI]], <4 x i32> splat (i32 10) -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll index 19ab96dd822b6..7aafd6dfc5937 100644 --- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll @@ -26,19 +26,19 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3]] = xor <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT4]] -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: store i32 [[TMP6]], ptr @a, align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll index 541ee96ac760b..fc0582b752341 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -162,15 +162,15 @@ define i64 @live_out_scalar_vf(i64 %n) { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD_3]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD_2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i64> [[STEP_ADD_3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD_2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i64> [[STEP_ADD_3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP6:%.*]] = add i64 12, [[FIRST_INACTIVE_LANE]] ; CHECK-NEXT: [[FIRST_INACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP3]], i1 false) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 0fde76477b9f9..bc074ea50d0d3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -1106,8 +1106,6 @@ define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) { ; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%res2> = phi vp<%3>, vp<%5> ; CHECK-NEXT: CLONE ir<%load> = load ir<%a> -; CHECK-NEXT: WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32 -; CHECK-NEXT: WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64 ; CHECK-NEXT: EXPRESSION vp<%5> = ir<%res2> + reduce.add (ir<%mul> zext to i64) ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%0> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1> @@ -1116,6 +1114,8 @@ define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64 ; CHECK-NEXT: EMIT vp<%7> = compute-reduction-result ir<%res2>, vp<%5> ; CHECK-NEXT: EMIT vp<[[EXT_PART:%.+]]> = extract-last-part ir<%load.ext.ext> ; CHECK-NEXT: EMIT vp<%vector.recur.extract> = extract-last-lane vp<[[EXT_PART]]> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0c58966469f5a..b9875026d69c1 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -526,7 +526,6 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -536,6 +535,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off> ; CHECK-NEXT: EMIT vp<[[EXIT_PART:%.+]]> = extract-last-part ir<%add> ; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-last-lane vp<[[EXIT_PART]]> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>