diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll index e460f558f4723..8dc88f7b96716 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes=slp-vectorizer,instcombine -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -passes=slp-vectorizer -pass-remarks-output=%t | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s -; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer' -pass-remarks-output=%t | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -16,10 +16,10 @@ target triple = "aarch64--linux-gnu" define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @gather_multiple_use( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], splat (i32 15) ; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], splat (i32 65537) ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], splat (i32 65535) @@ -57,22 +57,26 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { @data = global [6 x [258 x i8]] zeroinitializer, align 1 define void @gather_load(ptr noalias %ptr) { ; CHECK-LABEL: @gather_load( -; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 2 -; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 6 -; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 8 -; CHECK-NEXT: [[L0:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 258), align 1 +; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 2 +; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX149:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0 +; CHECK-NEXT: [[L0:%.*]] = load i8, ptr [[ARRAYIDX149]], align 1 ; CHECK-NEXT: [[CONV150:%.*]] = zext i8 [[L0]] to i16 -; CHECK-NEXT: [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10 -; CHECK-NEXT: [[L1:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 517), align 1 +; CHECK-NEXT: [[ADD152:%.*]] = add i16 10, [[CONV150]] +; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1 +; CHECK-NEXT: [[L1:%.*]] = load i8, ptr [[ARRAYIDX155]], align 1 ; CHECK-NEXT: [[CONV156:%.*]] = zext i8 [[L1]] to i16 -; CHECK-NEXT: [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20 -; CHECK-NEXT: [[L2:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 776), align 1 +; CHECK-NEXT: [[ADD158:%.*]] = add i16 20, [[CONV156]] +; CHECK-NEXT: [[ARRAYIDX161:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2 +; CHECK-NEXT: [[L2:%.*]] = load i8, ptr [[ARRAYIDX161]], align 1 ; CHECK-NEXT: [[CONV162:%.*]] = zext i8 [[L2]] to i16 -; CHECK-NEXT: [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30 -; CHECK-NEXT: [[L3:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 1035), align 1 +; CHECK-NEXT: [[ADD164:%.*]] = add i16 30, [[CONV162]] +; CHECK-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3 +; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[ARRAYIDX167]], align 1 ; CHECK-NEXT: [[CONV168:%.*]] = zext i8 [[L3]] to i16 -; CHECK-NEXT: [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40 +; CHECK-NEXT: [[ADD170:%.*]] = add i16 40, [[CONV168]] ; CHECK-NEXT: store i16 [[ADD152]], ptr [[ARRAYIDX182]], align 2 ; CHECK-NEXT: store i16 [[ADD158]], ptr [[ARRAYIDX183]], align 2 ; CHECK-NEXT: store i16 [[ADD164]], ptr [[ARRAYIDX184]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll index 900d5f293b5b8..f83628ed17b65 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=GENERIC -; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=KRYO +; RUN: opt -S -passes=slp-vectorizer < %s | FileCheck %s --check-prefix=GENERIC +; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer < %s | FileCheck %s --check-prefix=KRYO target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -36,57 +36,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16 +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2 ; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32> ; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2 ; GENERIC-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> ; GENERIC-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]] -; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0 -; GENERIC-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]] +; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 +; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]] ; GENERIC-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 ; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32 -; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]] -; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1 -; GENERIC-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]] +; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]] +; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 +; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]] ; GENERIC-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32 ; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]] -; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2 -; GENERIC-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 -; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]] +; GENERIC-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 +; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]] ; GENERIC-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2 ; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32 ; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]] -; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3 -; GENERIC-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]] +; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 +; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]] ; GENERIC-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2 ; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32 ; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]] -; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4 -; GENERIC-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]] +; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 +; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]] ; GENERIC-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2 ; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32 ; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]] -; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5 -; GENERIC-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 -; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]] +; GENERIC-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 +; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]] ; GENERIC-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2 ; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32 ; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]] -; GENERIC-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6 -; GENERIC-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 -; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]] +; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 +; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]] ; GENERIC-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7 -; GENERIC-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 -; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]] +; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 +; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]] ; GENERIC-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2 ; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32 ; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] @@ -109,57 +101,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16 +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2 ; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32> ; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2 ; KRYO-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> ; KRYO-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]] -; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0 -; KRYO-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]] +; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 +; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]] ; KRYO-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 ; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32 -; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]] -; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1 -; KRYO-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]] +; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]] +; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 +; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]] ; KRYO-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32 ; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]] -; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2 -; KRYO-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 -; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]] +; KRYO-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 +; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]] ; KRYO-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2 ; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32 ; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]] -; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3 -; KRYO-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]] +; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 +; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]] ; KRYO-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2 ; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32 ; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]] -; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4 -; KRYO-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]] +; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 +; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]] ; KRYO-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2 ; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32 ; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]] -; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5 -; KRYO-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 -; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]] +; KRYO-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 +; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]] ; KRYO-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2 ; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32 ; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]] -; KRYO-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6 -; KRYO-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 -; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]] +; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 +; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]] ; KRYO-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7 -; KRYO-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 -; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]] +; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 +; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]] ; KRYO-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2 ; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32 ; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]] @@ -293,55 +277,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16 +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2 ; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32> ; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2 ; GENERIC-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; GENERIC-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]] -; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0 +; GENERIC-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]] +; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 ; GENERIC-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]] ; GENERIC-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 ; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32 -; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]] -; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1 +; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]] +; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 ; GENERIC-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]] ; GENERIC-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32 ; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]] -; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2 +; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 ; GENERIC-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 ; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]] ; GENERIC-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2 ; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32 ; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]] -; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3 +; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 ; GENERIC-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 ; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]] ; GENERIC-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2 ; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32 ; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]] -; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4 +; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 ; GENERIC-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]] ; GENERIC-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2 ; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32 ; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]] -; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5 +; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 ; GENERIC-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 ; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]] ; GENERIC-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2 ; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32 ; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]] -; GENERIC-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6 +; GENERIC-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 ; GENERIC-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 ; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]] ; GENERIC-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7 +; GENERIC-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 ; GENERIC-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]] ; GENERIC-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2 @@ -366,55 +350,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16 +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2 ; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32> ; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2 ; KRYO-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; KRYO-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]] -; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0 +; KRYO-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]] +; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0 ; KRYO-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]] ; KRYO-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 ; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32 -; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]] -; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1 +; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]] +; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1 ; KRYO-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]] ; KRYO-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32 ; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]] -; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2 +; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2 ; KRYO-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 ; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]] ; KRYO-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2 ; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32 ; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]] -; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3 +; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3 ; KRYO-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 ; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]] ; KRYO-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2 ; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32 ; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]] -; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4 +; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4 ; KRYO-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]] ; KRYO-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2 ; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32 ; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]] -; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5 +; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5 ; KRYO-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 ; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]] ; KRYO-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2 ; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32 ; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]] -; KRYO-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6 +; KRYO-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6 ; KRYO-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 ; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]] ; KRYO-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7 +; KRYO-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 ; KRYO-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]] ; KRYO-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index c43b79e138a30..89e133bb1c6a1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s ; These tests check that we remove from consideration pairs of seed @@ -46,9 +46,9 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[X:%.*]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -58,29 +58,25 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]] ; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]] ; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 ; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 @@ -143,7 +139,7 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[Y:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[Y:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -153,26 +149,22 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = zext nneg i32 [[TMP4]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 ; CHECK-NEXT: [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll index 5808d64d925c6..ac476c521a591 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s @@ -25,42 +25,42 @@ define void @test_i16_extend(ptr %p.1, ptr %p.2, i32 %idx.i32) { ; CHECK-LABEL: @test_i16_extend( ; CHECK-NEXT: [[P_0:%.*]] = load ptr, ptr @global, align 8 ; CHECK-NEXT: [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64 -; CHECK-NEXT: [[T53:%.*]] = getelementptr inbounds nuw i16, ptr [[P_1:%.*]], i64 [[IDX_0]] -; CHECK-NEXT: [[T56:%.*]] = getelementptr inbounds nuw i16, ptr [[P_2:%.*]], i64 [[IDX_0]] +; CHECK-NEXT: [[T53:%.*]] = getelementptr inbounds i16, ptr [[P_1:%.*]], i64 [[IDX_0]] +; CHECK-NEXT: [[T56:%.*]] = getelementptr inbounds i16, ptr [[P_2:%.*]], i64 [[IDX_0]] ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[T53]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[T56]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP7]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[T60]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP9]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[T71]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP11]] ; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[T82]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3 ; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 ; CHECK-NEXT: [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP13]] ; CHECK-NEXT: [[L_4:%.*]] = load i32, ptr [[T93]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 ; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 ; CHECK-NEXT: [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP15]] ; CHECK-NEXT: [[L_5:%.*]] = load i32, ptr [[T104]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i64 5 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 ; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 ; CHECK-NEXT: [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP17]] ; CHECK-NEXT: [[L_6:%.*]] = load i32, ptr [[T115]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i64 6 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6 ; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 ; CHECK-NEXT: [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP19]] ; CHECK-NEXT: [[L_7:%.*]] = load i32, ptr [[T126]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7 ; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 ; CHECK-NEXT: [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP21]] ; CHECK-NEXT: [[L_8:%.*]] = load i32, ptr [[T137]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index f138065101c4b..6c5220d13b7a2 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=slp-vectorizer,instcombine -mtriple=aarch64--linux-gnu < %s | FileCheck %s +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -133,7 +133,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-LABEL: @reduce_blockstrided2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[X:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] @@ -165,7 +165,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX32]], align 2 -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1 ; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX33]], align 2 ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM4]] ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX36]], align 2 @@ -254,9 +254,9 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-LABEL: @reduce_blockstrided3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[L0:%.*]] = load i16, ptr [[X:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1 ; CHECK-NEXT: [[L1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 2 ; CHECK-NEXT: [[L2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] @@ -269,10 +269,12 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM9]] ; CHECK-NEXT: [[L6:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 ; CHECK-NEXT: [[L8:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1 ; CHECK-NEXT: [[L9:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 2 ; CHECK-NEXT: [[L10:%.*]] = load i16, ptr [[ARRAYIDX16]], align 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] ; CHECK-NEXT: [[L12:%.*]] = load i16, ptr [[ARRAYIDX20]], align 2 @@ -347,9 +349,9 @@ define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) ; CHECK-NEXT: ret i16 [[TMP7]] ; @@ -414,33 +416,128 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 +; TODO: Dead code must be removed below. +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP50]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1 +; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1 +; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1 +; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1 +; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1 +; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 +; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP38]], <4 x i8> [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP5]], i64 12) ; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP46:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP45]], <4 x i8> [[TMP12]], i64 4) +; CHECK-NEXT: [[TMP47:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP46]], <4 x i8> [[TMP3]], i64 8) +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP47]], <4 x i8> [[TMP13]], i64 12) ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <16 x i32> [[TMP11]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,7 +774,7 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X:%.*]], i64 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 @@ -696,7 +793,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 ; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 ; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 2 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] @@ -705,20 +802,21 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 ; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24 +; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7 ; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44 +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11 ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4 @@ -727,9 +825,10 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 ; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4 ; CHECK-NEXT: ret void ; @@ -837,9 +936,10 @@ define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[DST0:%.*]], align 2 ; CHECK-NEXT: ret void ; @@ -912,35 +1012,132 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0:%.*]], i64 16 -; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 32 -; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 48 +; TODO: Dead code must be removed below. +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32 +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP21]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1 +; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1 +; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1 +; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1 +; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1 +; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 +; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4 +; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8 +; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> [[TMP16]], [[TMP18]] ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[DST0]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[DST4]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST8]], align 4 @@ -1198,20 +1395,20 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 @@ -1225,33 +1422,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP13]], <4 x i8> [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP14]], <4 x i8> [[TMP8]], i64 8) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP12]], i64 12) ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP20]], <4 x i8> [[TMP9]], i64 8) +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP19]], i64 12) ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP30:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP29]], <4 x i8> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP28:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP30]], <4 x i8> [[TMP10]], i64 8) +; CHECK-NEXT: [[TMP32:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP28]], <4 x i8> [[TMP27]], i64 12) ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP36:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP35]], <4 x i8> [[TMP7]], i64 4) +; CHECK-NEXT: [[TMP37:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP36]], <4 x i8> [[TMP11]], i64 8) +; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP37]], <4 x i8> [[TMP34]], i64 12) ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16) @@ -1259,11 +1452,11 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP45]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]] -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP50]], <16 x i32> [[TMP49]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]] ; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]] @@ -1271,7 +1464,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]] ; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]] -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP57]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]], splat (i32 15) ; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[TMP60]], splat (i32 65537) ; CHECK-NEXT: [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]], splat (i32 65535) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index c84c333391350..0f47c6b3ac902 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index e4fcb1ed08be9..370ed1f258aca 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 783a1e83c6724..a8d1c94d59be3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; @@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) -; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) +; GFX8-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) -; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) +; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) ; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 7e31ec9a0b39a..b09022e8289a1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -47,7 +47,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -82,7 +82,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -117,7 +117,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -152,7 +152,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -177,7 +177,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -202,7 +202,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -227,7 +227,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -255,7 +255,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: ret <3 x i16> [[INS_2]] @@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; @@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -309,7 +315,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) ; GFX7-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 @@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) -; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) +; GFX8-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) -; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) +; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) ; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll index 46c6c10125b95..57ca3db075689 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -24,7 +24,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; @@ -54,7 +54,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -66,7 +66,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; @@ -96,7 +96,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -108,7 +108,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; @@ -138,7 +138,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -150,7 +150,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; @@ -180,7 +180,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -205,7 +205,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -230,7 +230,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -255,7 +255,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -283,7 +283,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: ret <3 x i16> [[INS_2]] @@ -299,7 +299,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] @@ -308,8 +308,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -344,7 +347,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) ; GFX7-NEXT: [[ADD_3:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 @@ -358,9 +361,10 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 ; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP3]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 ; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> @@ -368,10 +372,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]]) -; GFX9-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) +; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP1:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP8:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP7]]) ; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP4]]) +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll index 7faf11c34d76e..88a03bb72c276 100644 --- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll +++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s ; Regression test for a bug in the SLP vectorizer that was causing ; these rotates to be incorrectly combined into a vector rotate. @@ -9,16 +9,16 @@ target triple = "wasm32-unknown-unknown" define void @foo(<2 x i64> %x, <4 x i32> %y, ptr %out) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 -; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 2 +; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0 +; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2 ; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64 ; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]]) ; CHECK-NEXT: store i64 [[C]], ptr [[OUT:%.*]], align 8 -; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i64 1 -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i64 3 +; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1 +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3 ; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64 ; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]]) -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[OUT]], i32 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[OUT]], i32 1 ; CHECK-NEXT: store i64 [[F]], ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll index 77d36f0107665..92ffbad73d5f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-LABEL: @ceil_floor( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7 +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) ; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) ; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) @@ -24,31 +24,31 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) ; SSE-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]]) ; SSE-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]]) -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1 -; SSE-NEXT: [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6 -; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 +; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 ; SSE-NEXT: ret <8 x float> [[R71]] ; ; SLM-LABEL: @ceil_floor( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> ; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> @@ -56,20 +56,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; SLM-NEXT: ret <8 x float> [[R71]] ; ; AVX-LABEL: @ceil_floor( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]]) +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> ; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> @@ -77,20 +77,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX-NEXT: ret <8 x float> [[R71]] ; ; AVX2-LABEL: @ceil_floor( -; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; AVX2-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX2-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]]) ; AVX2-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 ; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll index 18d79752b0b44..f504821b5fa67 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2 define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-LABEL: @ceil_floor( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7 +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) ; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) ; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) @@ -24,75 +24,75 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) ; SSE-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]]) ; SSE-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]]) -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1 -; SSE-NEXT: [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6 -; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 +; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 ; SSE-NEXT: ret <8 x float> [[R71]] ; ; SLM-LABEL: @ceil_floor( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0 +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> ; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; SLM-NEXT: ret <8 x float> [[R71]] ; ; AVX-LABEL: @ceil_floor( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]]) +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> +; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> ; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> ; AVX-NEXT: ret <8 x float> [[R71]] ; ; AVX2-LABEL: @ceil_floor( -; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0 +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 ; AVX2-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX2-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]]) ; AVX2-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 ; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; AVX2-NEXT: ret <8 x float> [[R71]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index 6c73a9fdce851..65e5458b25d2f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-LABEL: @sitofp_uitofp( @@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @sitofp_uitofp( @@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @fptosi_fptoui( @@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @fptosi_fptoui( @@ -132,37 +136,51 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { define <8 x float> @fneg_fabs(<8 x float> %a) { ; SSE2-LABEL: @fneg_fabs( -; SSE2-NEXT: [[A:%.*]] = fneg <8 x float> [[A1:%.*]] -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]]) -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) +; SSE2-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; SLM-LABEL: @fneg_fabs( -; SLM-NEXT: [[A:%.*]] = fneg <8 x float> [[A1:%.*]] -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]]) -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) +; SLM-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX-LABEL: @fneg_fabs( -; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX2-LABEL: @fneg_fabs( -; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX512-LABEL: @fneg_fabs( -; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX512-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @sext_zext( @@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @sext_zext( @@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> ; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index 225843613165a..fad46870ec475 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-LABEL: @sitofp_uitofp( @@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @sitofp_uitofp( @@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @fptosi_fptoui( @@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @fptosi_fptoui( @@ -132,37 +136,51 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { define <8 x float> @fneg_fabs(<8 x float> %a) { ; SSE2-LABEL: @fneg_fabs( -; SSE2-NEXT: [[A:%.*]] = fneg <8 x float> [[A1:%.*]] -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]]) -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) +; SSE2-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; SLM-LABEL: @fneg_fabs( -; SLM-NEXT: [[A:%.*]] = fneg <8 x float> [[A1:%.*]] -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]]) -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) +; SLM-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX-LABEL: @fneg_fabs( -; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX2-LABEL: @fneg_fabs( -; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; ; AVX512-LABEL: @fneg_fabs( -; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]]) -; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; AVX512-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], +; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; AVX512-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float> ; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]] ; %a0 = extractelement <8 x float> %a, i32 0 @@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @sext_zext( @@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @sext_zext( @@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> ; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll index 5cee6984df04f..99b13bdc05082 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,34 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @fadd_fsub_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @fadd_fsub_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @fadd_fsub_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; ; AVX2-LABEL: @fadd_fsub_v8f32( @@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @fmul_fdiv_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @fmul_fdiv_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; ; AVX2-LABEL: @fmul_fdiv_v8f32( -; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX2-NEXT: ret <8 x float> [[TMP5]] ; ; AVX512-LABEL: @fmul_fdiv_v8f32( @@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; SSE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 +; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00 +; SLM-NEXT: [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] ; ; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX-NEXT: ret <4 x float> [[TMP1]] ; ; AVX2-LABEL: @fmul_fdiv_v4f32_const( -; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX2-NEXT: ret <4 x float> [[TMP1]] ; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX512-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX512-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 9a2f959ac63bc..7f9475917b566 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,34 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @fadd_fsub_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @fadd_fsub_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @fadd_fsub_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; ; AVX2-LABEL: @fadd_fsub_v8f32( @@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @fmul_fdiv_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; SLM-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @fmul_fdiv_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; ; AVX2-LABEL: @fmul_fdiv_v8f32( -; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX2-NEXT: ret <8 x float> [[TMP5]] ; ; AVX512-LABEL: @fmul_fdiv_v8f32( @@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; SSE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 +; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00 +; SLM-NEXT: [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] ; ; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX-NEXT: ret <4 x float> [[TMP1]] ; ; AVX2-LABEL: @fmul_fdiv_v4f32_const( -; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX2-NEXT: ret <4 x float> [[TMP1]] ; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX512-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[A]], +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> ; AVX512-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index f8c5df9944538..11ab7770a5383 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -1,26 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @add_sub_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @add_sub_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]] ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @add_sub_v8i32( @@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( @@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( @@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( @@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( @@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( @@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX1-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX1-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX1-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( @@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( @@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) { define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { ; SSE-LABEL: @sdiv_v8i32_undefs( -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @sdiv_v8i32_undefs( -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @sdiv_v8i32_undefs( -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @sdiv_v8i32_undefs( -; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX2-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX2-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @sdiv_v8i32_undefs( -; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; AVX512-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX512-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX512-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SSE-LABEL: @add_sub_v8i32_splat( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @add_sub_v8i32_splat( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX1-LABEL: @add_sub_v8i32_splat( -; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] @@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; AVX1-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX2-LABEL: @add_sub_v8i32_splat( -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] @@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; AVX2-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX512-LABEL: @add_sub_v8i32_splat( -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll index b84ef027f67c5..9589ec24d49d4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -1,26 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @add_sub_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @add_sub_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]] ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @add_sub_v8i32( @@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( @@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( @@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( @@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( @@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( @@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX1-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX1-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX1-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( @@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( @@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) { define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { ; SSE-LABEL: @sdiv_v8i32_undefs( -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @sdiv_v8i32_undefs( -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @sdiv_v8i32_undefs( -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3 -; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 -; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 -; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 ; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 ; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5 -; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 -; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @sdiv_v8i32_undefs( -; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX2-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX2-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @sdiv_v8i32_undefs( -; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; AVX512-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX512-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX512-NEXT: [[AB0:%.*]] = sdiv i32 [[A0]], undef ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB4:%.*]] = sdiv i32 [[A4]], undef ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 ; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SSE-LABEL: @add_sub_v8i32_splat( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @add_sub_v8i32_splat( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0 +; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX1-LABEL: @add_sub_v8i32_splat( -; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] @@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; AVX1-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX2-LABEL: @add_sub_v8i32_splat( -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] @@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; AVX2-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX512-LABEL: @add_sub_v8i32_splat( -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0 ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll index 3ee6c55e429bd..c2bd9e648e4fd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @g( @@ -86,8 +86,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] ; CHECK-NEXT: ret i8 [[TMP8]] ; @@ -107,14 +107,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { define i8 @k(<4 x i8> %x) { ; CHECK-LABEL: @k( -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]] ; CHECK-NEXT: ret i8 [[TMP9]] ; @@ -136,14 +136,14 @@ define i8 @k_bb(<4 x i8> %x) { ; CHECK-LABEL: @k_bb( ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]] ; CHECK-NEXT: ret i8 [[TMP9]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll index 60e008d12e6e1..e4e45eb99fbdf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @g( @@ -39,8 +39,14 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) { define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @h_undef( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> , <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> undef, i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[X0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[X3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Y1]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[Y2]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[TMP2]], [[TMP2]] ; CHECK-NEXT: ret <4 x i8> [[TMP3]] ; @@ -87,8 +93,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] ; CHECK-NEXT: ret i8 [[TMP8]] ; @@ -108,14 +114,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { define i8 @k(<4 x i8> %x) { ; CHECK-LABEL: @k( -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]] ; CHECK-NEXT: ret i8 [[TMP9]] ; @@ -137,14 +143,14 @@ define i8 @k_bb(<4 x i8> %x) { ; CHECK-LABEL: @k_bb( ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]] ; CHECK-NEXT: ret i8 [[TMP9]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll index fd5f09bf2adc0..7d5ea22436c7c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s ; ; Check that we can commute operands based on the predicate. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll index 35619d6d3ad1d..25aa24b4cc81d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s ; ; Check that we can commute operands based on the predicate. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll index b85ec5bce8192..815ad044ca077 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX ; ; 128-bit vectors @@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( @@ -214,14 +214,14 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; PR50392 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @test_v4f64_partial_swizzle( -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 +; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i32 3 ; CHECK-NEXT: ret <4 x double> [[R03]] ; %a0 = extractelement <4 x double> %a, i64 0 @@ -243,21 +243,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( @@ -305,21 +305,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE-LABEL: @test_v4i64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x i64> [[TMP7]] ; ; SLM-LABEL: @test_v4i64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x i64> [[TMP7]] ; ; AVX-LABEL: @test_v4i64( @@ -351,21 +351,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @test_v8i32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @test_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX-LABEL: @test_v8i32( @@ -413,21 +413,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index b83d35541bbae..fb037b87da546 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX ; ; 128-bit vectors @@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( @@ -214,15 +214,15 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; PR50392 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64_partial_swizzle( -; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1 -; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3 +; SSE-NEXT: [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3 ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64_partial_swizzle( @@ -232,21 +232,21 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> ; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> , double [[R0]], i64 0 +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> zeroinitializer, double [[R0]], i32 0 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; SLM-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> ; SLM-NEXT: ret <4 x double> [[R031]] ; ; AVX-LABEL: @test_v4f64_partial_swizzle( -; AVX-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; AVX-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; AVX-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; AVX-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; AVX-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX-NEXT: [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1 -; AVX-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3 +; AVX-NEXT: [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> +; AVX-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3 ; AVX-NEXT: ret <4 x double> [[R03]] ; %a0 = extractelement <4 x double> %a, i64 0 @@ -268,21 +268,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( @@ -330,21 +330,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE-LABEL: @test_v4i64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x i64> [[TMP7]] ; ; SLM-LABEL: @test_v4i64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x i64> [[TMP7]] ; ; AVX-LABEL: @test_v4i64( @@ -376,21 +376,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @test_v8i32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @test_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX-LABEL: @test_v8i32( @@ -438,21 +438,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SSE-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SLM-NEXT: [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll index 40b6a8c32f5d0..6f655b3e9ae0b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; ; 128-bit vectors @@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( @@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( @@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE-LABEL: @test_v4i64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x i64> [[TMP7]] ; ; SLM-LABEL: @test_v4i64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x i64> [[TMP7]] ; ; AVX-LABEL: @test_v4i64( @@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @test_v8i32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @test_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX-LABEL: @test_v8i32( @@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll index 09113323d3ab7..2ab81d9b8d308 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; ; 128-bit vectors @@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( @@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( @@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE-LABEL: @test_v4i64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <4 x i64> [[TMP7]] ; ; SLM-LABEL: @test_v4i64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <4 x i64> [[TMP7]] ; ; AVX-LABEL: @test_v4i64( @@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @test_v8i32( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @test_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX-LABEL: @test_v8i32( @@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SSE-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]] ; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> +; SLM-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]] ; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll index 9c9a89ac76d8b..58fb5f772207d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE -; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512f -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -17,15 +17,15 @@ target triple = "x86_64-unknown-linux-gnu" define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_zext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1) -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 +; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -33,15 +33,15 @@ define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; ; AVX-LABEL: @PR31243_zext( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1) -; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]] -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 +; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] ; AVX-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; AVX-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; AVX-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -76,15 +76,15 @@ entry: define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1) -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i32 +; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i32 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -92,15 +92,15 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; ; AVX-LABEL: @PR31243_sext( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1) -; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64 -; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]] -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; AVX-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64 -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i32 +; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i32 +; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] ; AVX-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; AVX-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; AVX-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 61938d01e57ac..de72521345435 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" @@ -10,16 +10,16 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) { ; CHECK-LABEL: @shuffle_operands1( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @shuffle_operands1( ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1 ; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: ret void @@ -43,7 +43,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -57,7 +57,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -91,7 +91,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -105,7 +105,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -139,7 +139,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -153,7 +153,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -185,12 +185,12 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8 +; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 @@ -205,7 +205,7 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -239,7 +239,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -253,7 +253,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -286,13 +286,13 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8 +; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -306,7 +306,7 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -347,23 +347,20 @@ define void @good_load_order() { ; CHECK: for.body3: ; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] ; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]] ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 @@ -382,23 +379,20 @@ define void @good_load_order() { ; SSE2: for.body3: ; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 -; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]] -; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]] -; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] +; SSE2-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]] +; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]] +; SSE2-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4 +; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]] ; SSE2-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 ; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 ; SSE2-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> ; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] ; SSE2-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; SSE2-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]] +; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]] ; SSE2-NEXT: [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4 ; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]] ; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll index 2588012847d09..6fd2de8ad8ab5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s define void @test1(float %a, float %b, float %c, float %d, ptr nocapture %p) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: ret void @@ -30,10 +30,10 @@ entry: define void @test1_vec(float %a, float %b, float %c, float %d, ptr nocapture %p) { ; CHECK-LABEL: @test1_vec( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 16, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void @@ -54,10 +54,10 @@ entry: define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, ptr nocapture %p) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 1) ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void @@ -79,10 +79,10 @@ entry: define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, ptr nocapture %4) { ; CHECK-LABEL: @test2_vec( -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], splat (i32 1) ; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP4:%.*]], align 16, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index bae127527bbbe..9e4f10ec7b349 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,19 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP4]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], splat (i32 15) -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP8]], <4 x i32> splat (i32 255)) -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], splat (i32 255) +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> splat (i32 255) +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %4 = load i32, ptr %0, align 4, !tbaa !2 @@ -50,13 +51,14 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i8( ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], splat (i32 15) -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP9]], <4 x i32> splat (i32 255)) -; CHECK-NEXT: [[TMP11:%.*]] = trunc nuw <4 x i32> [[TMP10]] to <4 x i8> -; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], splat (i32 255) +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> splat (i32 255) +; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP12]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, ptr %0, align 1, !tbaa !6 @@ -101,7 +103,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i64( ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index a9c0eb3f9f2b9..1b11c3dcc081c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 @b = global [8 x i32] zeroinitializer, align 16 @@ -13,14 +13,14 @@ define void @foo() { ; SSE-LABEL: @foo( ; SSE-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 ; SSE-NEXT: store i32 [[TMP1]], ptr @a, align 16 -; SSE-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 4), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 8), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 12), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 16), align 16 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 20), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 24), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 28), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 +; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4 +; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8 +; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4 +; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16 +; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4 +; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8 +; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index d2617a1986764..db38a62017391 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -1,89 +1,104 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX2-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 ; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 ; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 ; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 +; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_2( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> @@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> @@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; SSE-LABEL: @gather_load_3( ; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 ; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 ; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 ; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 +; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 ; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 ; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 ; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 ; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20 +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5 ; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 ; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24 +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6 ; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 ; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28 +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7 ; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 ; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 ; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]] @@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 ; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 ; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 ; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 ; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 ; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void @@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4 -; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8 -; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12 -; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16 -; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20 -; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24 -; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28 -; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1 +; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2 +; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 +; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3 +; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4 +; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 +; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5 +; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 +; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6 +; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 +; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7 +; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 ; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 +; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 +; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 +; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 +; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 +; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 ; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void @@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 ; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> ; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> ; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3 ; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] ; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 -; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 -; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 -; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3 ; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] ; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 ; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> ; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] ; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 ; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> ; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] ; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] @@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 8f31200a3683d..bfa3610804967 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -1,89 +1,104 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX2-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 +; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 ; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 ; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 ; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 ; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 +; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_2( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> @@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> @@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; SSE-LABEL: @gather_load_3( ; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 ; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 ; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 ; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 ; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 +; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 ; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 ; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 ; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 ; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20 +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5 ; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 ; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24 +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6 ; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 ; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28 +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7 ; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 ; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 ; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]] @@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 ; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 ; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 ; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 ; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 ; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void @@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4 -; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8 -; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12 -; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16 -; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20 -; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24 -; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28 -; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1 +; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2 +; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 +; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3 +; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4 +; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 +; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5 +; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 +; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6 +; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 +; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7 +; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 ; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 +; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 +; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 +; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 +; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 +; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 +; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 ; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void @@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 ; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> ; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> ; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3 ; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] ; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 -; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 -; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 -; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3 ; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] ; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 ; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> ; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] ; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 ; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 ; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 ; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 ; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 ; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> ; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7 ; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] ; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] @@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index 42a50384787c8..a4949bc67b0f1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s | FileCheck %s ; These code should be fully vectorized by D57059 patch target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i64 0 +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0 +; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> @@ -28,7 +28,7 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) { define <4 x i32> @bar(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @bar( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll index e7239f906c59d..bada001ebbc6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s ; These conversions should be vectorized by reviews.llvm.org/D57059 define dso_local <4 x float> @foo(<4 x i32> %0) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0:%.*]] to <4 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %2 = extractelement <4 x i32> %0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll index 54120f5402f57..70ab6d09a9236 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s define float @dotf(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: @dotf( diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll index b85c78ec8d2d0..cec99c694391b 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s -; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE +; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE define void @test1(ptr %in, ptr %out) { ; CHECK-LABEL: @test1( @@ -19,8 +19,14 @@ define void @test1(ptr %in, ptr %out) { ; COMBINE-LABEL: @test1( ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 -; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> -; COMBINE-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8 +; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 +; COMBINE-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> +; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; COMBINE-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT]], align 8 ; COMBINE-NEXT: ret void ; entry: @@ -61,9 +67,14 @@ define void @test2(ptr %in, ptr %out) { ; COMBINE-LABEL: @test2( ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 -; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> +; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 +; COMBINE-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> +; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> -; COMBINE-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT:%.*]], align 8 +; COMBINE-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT]], align 8 ; COMBINE-NEXT: ret void ; entry: @@ -99,8 +110,11 @@ define void @test3(<16 x i32> %0, ptr %out) { ; ; COMBINE-LABEL: @test3( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <16 x i32> -; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0) +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 +; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; COMBINE-NEXT: ret void ; entry: @@ -134,8 +148,12 @@ define void @test4(ptr %in, ptr %out) { ; COMBINE-LABEL: @test4( ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 -; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> -; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 +; COMBINE-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; COMBINE-NEXT: ret void ; entry: @@ -165,7 +183,11 @@ define void @test5(ptr %out) { ; ; COMBINE-LABEL: @test5( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0) +; COMBINE-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8) +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0 +; COMBINE-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 ; COMBINE-NEXT: ret void ; entry: @@ -219,28 +241,37 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; ; COMBINE-LABEL: @test6( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN0:%.*]], i64 32 +; COMBINE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32 ; COMBINE-NEXT: [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16 ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; COMBINE-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1 -; COMBINE-NEXT: [[TMP2:%.*]] = uitofp <32 x i8> [[TMP1]] to <32 x float> +; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> +; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = zext <32 x i8> [[TMP11]] to <32 x i16> +; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <64 x i32> +; COMBINE-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float> ; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> +; COMBINE-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP3]], <4 x float> [[LOAD2]], i64 8) ; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> ; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]] -; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32 -; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128 +; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 +; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128 ; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16 ; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float> -; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> -; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> -; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> +; COMBINE-NEXT: [[TMP13:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0) +; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <32 x i32> +; COMBINE-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> +; COMBINE-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> +; COMBINE-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> +; COMBINE-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> +; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> +; COMBINE-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0) +; COMBINE-NEXT: [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 0) +; COMBINE-NEXT: [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4) +; COMBINE-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 4) +; COMBINE-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8) ; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> ; COMBINE-NEXT: [[TMP17:%.*]] = fmul <16 x float> [[TMP16]], [[TMP9]] ; COMBINE-NEXT: store <16 x float> [[TMP17]], ptr [[GEP11]], align 16 @@ -353,7 +384,21 @@ define i32 @test7() { ; ; COMBINE-LABEL: @test7( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: store <16 x float> poison, ptr null, align 16 +; COMBINE-NEXT: [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0) +; COMBINE-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8) +; COMBINE-NEXT: [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]] +; COMBINE-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]] +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> +; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> +; COMBINE-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) +; COMBINE-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4) +; COMBINE-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8) +; COMBINE-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) +; COMBINE-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]] +; COMBINE-NEXT: [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]] +; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> +; COMBINE-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]] +; COMBINE-NEXT: store <16 x float> [[TMP13]], ptr null, align 16 ; COMBINE-NEXT: ret i32 0 ; entry: